* Hadoop
** Install Java
#+begin_src shell
sudo apt-get install sun-java6-jdk
sudo update-java-alternatives -s java-6-sun
#+end_src
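A quick sanity check that the JDK landed where the later configuration expects it; the java-6-sun path below is the one reused for JAVA_HOME in conf/hadoop-env.sh. On newer Ubuntu releases where the Sun packages are no longer shipped, openjdk-6-jdk is the usual substitute.
#+begin_src shell
# verify the JVM and confirm the install path used later for JAVA_HOME
java -version
ls -d /usr/lib/jvm/java-6-sun
#+end_src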
** Add Hadoop User and Group
#+begin_src shell
sudo addgroup hadoop
sudo adduser --ingroup hadoop hadoop
#+end_src
** Configuring SSH and Password-less Login
#+begin_src sh
# In the master node
su hadoop
ssh-keygen -t rsa -P ""
# the node list is assumed to match conf/slaves from the Hadoop install below (here ~/hadoop/conf/slaves)
for node in $(cat ~/hadoop/conf/slaves);
do
ssh-copy-id -i $HOME/.ssh/id_rsa.pub hadoop@$node;
done
#+end_src
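Before moving on, it is worth confirming that key-based login actually works from the master to every node; the loop below is a minimal sketch that assumes the node list lives at ~/hadoop/conf/slaves, matching the path used above.
#+begin_src sh
# each node (the master included) should answer without prompting for a password
for node in $(cat ~/hadoop/conf/slaves);
do
ssh -o BatchMode=yes hadoop@$node hostname || echo "password-less login to $node is not working yet";
done
#+end_src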
** Install Hadoop
*** Install
#+begin_src sh
## unpack and install (the hadoop-0.21.0 tarball is assumed to be in /home/hadoop already; see the sketch below)
cd /home/hadoop/
tar xzf hadoop-0.21.0.tar.gz
mv hadoop-0.21.0 hadoop
#+end_src
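The block above assumes the release tarball is already sitting in /home/hadoop. A minimal sketch of fetching it (the archive URL is an assumption; any mirror works) and of handing the unpacked tree back to the hadoop user if the commands were run as root:
#+begin_src sh
# download the 0.21.0 release into the hadoop user's home (do this before the tar step above)
wget -P /home/hadoop/ http://archive.apache.org/dist/hadoop/common/hadoop-0.21.0/hadoop-0.21.0.tar.gz
# if the unpacking was done as another user, give the tree back to hadoop
sudo chown -R hadoop:hadoop /home/hadoop/hadoop
#+end_src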
*** Update .bashrc
#+begin_src sh
## update .bashrc
# Set Hadoop-related environment variables
export HADOOP_HOME=/home/hadoop/hadoop
export HADOOP_COMMON_HOME="/home/hadoop/hadoop"
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_COMMON_HOME/bin/
#+end_src
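To pick up the new variables in the current shell and confirm the binaries resolve:
#+begin_src sh
source ~/.bashrc
which hadoop && hadoop version
#+end_src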
*** Update conf/hadoop-env.sh
#+begin_src sh
export JAVA_HOME=/usr/lib/jvm/java-6-sun
export HADOOP_OPTS=-Djava.net.preferIPv4Stack=true
#+end_src
*** Update conf/core-site.xml
#+begin_src html
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<!-- In: conf/core-site.xml -->
<property>
<name>hadoop.tmp.dir</name>
<value>/home/hadoop/tmp</value>
<description>A base for other temporary directories.</description>
</property>
<property>
<name>fs.default.name</name>
<value>hdfs://128.125.86.89:54310</value>
<description>The name of the default file system. A URI whose
scheme and authority determine the FileSystem implementation. The
uri's scheme determines the config property (fs.SCHEME.impl) naming
the FileSystem implementation class. The uri's authority is used to
determine the host, port, etc. for a filesystem.</description>
</property>
</configuration>
#+end_src
*** Update conf/mapred-site.xml
#+begin_src html
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
<!-- In: conf/mapred-site.xml -->
<property>
<name>mapreduce.jobtracker.address</name>
<value>128.125.86.89:54311</value>
</property>
</configuration>
#+end_src
*** Update conf/hdfs-site.xml
#+begin_src html
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
<!-- In: conf/hdfs-site.xml -->
<property>
<name>dfs.replication</name>
<value>3</value>
<description>Default block replication.
The actual number of replications can be specified when the file is created.
The default is used if replication is not specified in create time.
</description>
</property>
</configuration>
#+end_src
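hadoop.tmp.dir from core-site.xml above must exist and be writable by the hadoop user on every node before the daemons start; a missing or wrongly-owned directory is a common cause of the "replicated to 0 nodes" error listed in the references. A minimal sketch:
#+begin_src sh
# run as the hadoop user on every node
mkdir -p /home/hadoop/tmp
chmod 750 /home/hadoop/tmp
#+end_src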
*** Update conf/masters (master node only)
#+begin_src sh
128.125.86.89
#+end_src
*** Update conf/slaves (master node only)
#+begin_src sh
128.125.86.89
slave-ip1
slave-ip2
......
#+end_src
*** Copy hadoop installation and configuration files to slave nodes
#+begin_src sh
# In the master node
su hadoop
for node in $(cat ~/hadoop/conf/slaves);
do
scp ~/.bashrc hadoop@$node:~; scp -r ~/hadoop hadoop@$node:~;
done
#+end_src
** Run Hadoop
*** Format HDFS
#+begin_src sh
# formatting initializes a new HDFS filesystem and erases any existing metadata; run it once, on the master only
hdfs namenode -format
#+end_src
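As a quick sanity check, assuming the NameNode metadata directory was left at its default location under hadoop.tmp.dir, formatting should have created data there:
#+begin_src sh
ls /home/hadoop/tmp/dfs/name
#+end_src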
*** Start Hadoop
#+begin_src sh
# start HDFS first; the sleep gives HDFS time to come up (e.g. leave safe mode) before MapReduce starts
start-dfs.sh && sleep 300 && start-mapred.sh && echo "GOOD"
#+end_src
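jps is a quick way to confirm the daemons came up. With the configuration above (the master's IP is also listed in conf/slaves), the master should show NameNode, SecondaryNameNode and JobTracker as well as a DataNode and TaskTracker; pure slave nodes show only the latter two.
#+begin_src sh
# run on each node as the hadoop user
jps
#+end_src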
*** Run Jobs
#+begin_src sh
# submit a Java MapReduce job packaged as a jar (C++ jobs go through "hadoop pipes"); a worked example follows below
hadoop jar <job.jar> <main-class> <args>
#+end_src
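A concrete submission, assuming the bundled examples jar (its exact name varies by release; in 0.21.0 it is typically hadoop-mapred-examples-0.21.0.jar) and illustrative HDFS paths:
#+begin_src sh
# stage some input in HDFS and run the example wordcount job
hadoop fs -mkdir /wordcount-in
hadoop fs -put $HADOOP_HOME/LICENSE.txt /wordcount-in/
hadoop jar $HADOOP_HOME/hadoop-mapred-examples-0.21.0.jar wordcount /wordcount-in /wordcount-out
hadoop fs -cat /wordcount-out/part-r-00000
#+end_src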
*** Stop Hadoop
#+begin_src sh
stop-mapred.sh && stop-dfs.sh
#+end_src
** References:
1. http://www.michael-noll.com/tutorials/running-hadoop-on-ubuntu-linux-single-node-cluster/
2. http://www.michael-noll.com/tutorials/running-hadoop-on-ubuntu-linux-multi-node-cluster/
3. http://fclose.com/b/cloud-computing/290/hadoop-tutorial/
4. Fix for the "could only be replicated to 0 nodes, instead of 1" error