Thursday, July 14, 2011

Setting Up a Hadoop Cluster

This post lists the steps to set up a Hadoop cluster on Ubuntu 11.04. Most of the commands below can be copied and pasted directly.

* Hadoop
** Install Java
#+begin_src sh
# On Ubuntu 11.04 the Sun JDK lives in the Canonical partner repository;
# enable it first if apt cannot find the package
sudo add-apt-repository "deb http://archive.canonical.com/ natty partner"
sudo apt-get update
sudo apt-get install sun-java6-jdk
sudo update-java-alternatives -s java-6-sun
#+end_src
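
To confirm that the JDK was installed and selected, check the reported version (the exact update number may differ):
#+begin_src sh
java -version   # should report something like: java version "1.6.0_26"
#+end_src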

** Add Hadoop User and Group
#+begin_src shell
sudo addgroup hadoop
sudo adduser --ingroup hadoop hadoop
#+end_src
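
The adduser command prompts for a password for the new account. A quick check that the user and group exist:
#+begin_src sh
# should print something like: uid=1001(hadoop) gid=1001(hadoop) groups=1001(hadoop)
id hadoop
#+end_src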

** Configure SSH for Password-less Login
#+begin_src sh
  # In the master node
  su - hadoop
  ssh-keygen -t rsa -P ""
  # allow password-less login to the master itself
  cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys

  # copy the public key to every slave listed in conf/slaves
  # (the slaves file is set up in the next section)
  for node in $(cat $HOME/hadoop/conf/slaves); do
      ssh-copy-id -i $HOME/.ssh/id_rsa.pub hadoop@$node
  done
#+end_src
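
Before going further, it is worth confirming that password-less login actually works; each line below should print the slave's hostname without asking for a password:
#+begin_src sh
  for node in $(cat $HOME/hadoop/conf/slaves); do
      ssh hadoop@$node hostname
  done
#+end_src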

** Install Hadoop
*** Install
#+begin_src sh
  ## download and install
  cd /home/hadoop/
  wget http://archive.apache.org/dist/hadoop/core/hadoop-0.21.0/hadoop-0.21.0.tar.gz
  tar xzf hadoop-0.21.0.tar.gz
  mv hadoop-0.21.0 hadoop
#+end_src
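A quick sanity check of the unpacked tree (the PATH is not set yet, so the binary is called by its full path):
#+begin_src sh
  # should print "Hadoop 0.21.0" plus build information
  /home/hadoop/hadoop/bin/hadoop version
#+end_src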
*** Update .bashrc
#+begin_src sh
  ## update .bashrc
  # Set Hadoop-related environment variables
  export HADOOP_HOME=/home/hadoop/hadoop
  export HADOOP_COMMON_HOME=$HADOOP_HOME
  export PATH=$PATH:$HADOOP_HOME/bin
#+end_src
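Reload the shell configuration so the new variables take effect in the current session:
#+begin_src sh
  source ~/.bashrc
  which hadoop   # should print /home/hadoop/hadoop/bin/hadoop
#+end_src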
*** Update conf/hadoop-env.sh
#+begin_src sh
  export JAVA_HOME=/usr/lib/jvm/java-6-sun
  export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true"
#+end_src
*** Update conf/core-site.xml
#+begin_src xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>

<!-- In: conf/core-site.xml -->
<property>
<name>hadoop.tmp.dir</name>
<value>/home/hadoop/tmp</value>
<description>A base for other temporary directories.</description>
</property>

<property>
<name>fs.default.name</name>
<value>hdfs://128.125.86.89:54310</value>
<description>The name of the default file system. A URI whose
scheme and authority determine the FileSystem implementation. The
uri's scheme determines the config property (fs.SCHEME.impl) naming
the FileSystem implementation class. The uri's authority is used to
determine the host, port, etc. for a filesystem.</description>
</property>

</configuration>
#+end_src
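The directory named in hadoop.tmp.dir is not created automatically; create it on every node before formatting HDFS:
#+begin_src sh
  sudo mkdir -p /home/hadoop/tmp
  sudo chown hadoop:hadoop /home/hadoop/tmp
  sudo chmod 750 /home/hadoop/tmp
#+end_src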
*** Update conf/mapred-site.xml
#+begin_src xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>

<!-- In: conf/mapred-site.xml -->
<property>
<name>mapreduce.jobtracker.address</name>
<value>128.125.86.89:54311</value>
</property>

</configuration>
#+end_src
*** Update conf/hdfs-site.xml
#+begin_src xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>

<!-- In: conf/hdfs-site.xml -->
<property>
<name>dfs.replication</name>
<value>3</value>
<description>Default block replication.
The actual number of replications can be specified when the file is created.
The default is used if replication is not specified in create time.
</description>
</property>

</configuration>
#+end_src
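A replication factor of 3 assumes at least three DataNodes; with fewer slaves, lower the value accordingly. Once the cluster is running, the actual replication state can be inspected with fsck:
#+begin_src sh
  # reports blocks, replication, and any under-replicated files
  hdfs fsck / -files -blocks
#+end_src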
*** Update conf/masters (master node only)
Note that, despite the name, conf/masters lists the host(s) on which the SecondaryNameNode is started, not the NameNode/JobTracker master itself.
#+begin_src sh
128.125.86.89
#+end_src
*** Update conf/slaves (master node only)
#+begin_src sh
128.125.86.89
slave-ip1
slave-ip2
......
#+end_src
*** Copy Hadoop installation and configuration files to slave nodes
#+begin_src sh
# In the master node
su - hadoop
for node in $(cat $HOME/hadoop/conf/slaves); do
    # skip the master's own entry in conf/slaves
    [ "$node" = "128.125.86.89" ] && continue
    scp ~/.bashrc hadoop@$node:~
    scp -r ~/hadoop hadoop@$node:~
done
#+end_src
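To confirm that every slave received a working copy, the same loop can be reused to run the hadoop binary remotely:
#+begin_src sh
for node in $(cat $HOME/hadoop/conf/slaves); do
    echo "== $node =="
    ssh hadoop@$node '~/hadoop/bin/hadoop version'
done
#+end_src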
** Run Hadoop
*** Format HDFS
#+begin_src sh
# on the master only; re-running this erases all data in HDFS
hdfs namenode -format
#+end_src
*** Start Hadoop
#+begin_src sh
# The sleep gives the DataNodes time to register with the NameNode before
# MapReduce starts; submitting jobs too early can fail with the
# "could only be replicated to 0 nodes, instead of 1" error (see reference 4)
start-dfs.sh && sleep 300 && start-mapred.sh && echo "GOOD"
#+end_src
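To verify that all daemons came up, jps (shipped with the JDK) lists the running Java processes. The master should show NameNode, SecondaryNameNode, and JobTracker; each slave should show DataNode and TaskTracker:
#+begin_src sh
$JAVA_HOME/bin/jps                                     # on the master
ssh hadoop@slave-ip1 /usr/lib/jvm/java-6-sun/bin/jps   # slave-ip1 is a placeholder
#+end_src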
*** Run Jobs
Java jobs packaged as a jar are submitted with "hadoop jar"; C++ jobs built against the Pipes API are submitted with "hadoop pipes".
#+begin_src sh
# run a job packaged as a jar
hadoop jar <jarfile> <main-class> <args>
# or run a C++ Pipes job
hadoop pipes -input <hdfs-input> -output <hdfs-output> -program <hdfs-path-to-binary>
#+end_src
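As an end-to-end smoke test, the wordcount example bundled with the release can be run over a few small files uploaded to HDFS (the examples jar name below matches the 0.21.0 tarball layout; adjust it if yours differs):
#+begin_src sh
# upload a few local text files into HDFS (paths are relative to /user/hadoop)
hadoop fs -mkdir input
hadoop fs -put $HADOOP_HOME/*.txt input
# run the example job and look at the output
hadoop jar $HADOOP_HOME/hadoop-mapred-examples-0.21.0.jar wordcount input output
hadoop fs -cat 'output/part-*' | head
#+end_src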
*** Stop Hadoop
#+begin_src sh
stop-mapred.sh && stop-dfs.sh
#+end_src
** References
1. http://www.michael-noll.com/tutorials/running-hadoop-on-ubuntu-linux-single-node-cluster/
2. http://www.michael-noll.com/tutorials/running-hadoop-on-ubuntu-linux-multi-node-cluster/
3. http://fclose.com/b/cloud-computing/290/hadoop-tutorial/
4. Fix for the "could only be replicated to 0 nodes, instead of 1" error