- Install Java
 
sudo apt install default-jdk default-jre
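- Confirm the Java installation (on Ubuntu 20.04, default-jdk provides OpenJDK 11, matching the JAVA_HOME path used later)
 
java -version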
- Install OpenSSH server and client
 
sudo apt install openssh-server openssh-client
- Add new user
 
sudo adduser hadoop
- Switch to new user
 
sudo su - hadoop
- Generate SSH public and private keys
 
ssh-keygen -t rsa
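- Alternatively, generate the key non-interactively (this variant assumes an empty passphrase and the default key path)
 
ssh-keygen -t rsa -b 4096 -N "" -f ~/.ssh/id_rsa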
- Add SSH public key to authorized keys
 
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
- Change permissions of authorized keys
 
chmod 640 ~/.ssh/authorized_keys
- Verify SSH configuration
 
ssh localhost
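- The first connection asks to accept the host key; accept it, then leave the SSH session to continue in the original shell
 
exit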
- Download Hadoop
 
wget https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz
- Extract tar file
 
tar -xvzf hadoop-3.3.6.tar.gz
- Rename directory
 
mv hadoop-3.3.6 hadoop
- Obtain OpenJDK directory
 
dirname $(dirname $(readlink -f $(which java)))
- Add environment variables to the bash configuration file ~/.bashrc
 
export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
export HADOOP_HOME=/home/hadoop/hadoop
export PATH=$PATH:$HADOOP_HOME/bin
- Reload bash settings
 
source ~/.bashrc
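- Optionally confirm the variables took effect (hadoop version should report 3.3.6)
 
echo $JAVA_HOME
hadoop version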
- Confirm setup by using example MapReduce Java archive file to obtain word counts
 
mkdir input
cp $HADOOP_HOME/*.txt input
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.6.jar wordcount input output
cat output/*
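- The example will not overwrite an existing output directory, so clear the local directories before re-running it
 
rm -r input output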
- Set up local environment as above
 - Add environment variables to the bash configuration file ~/.bashrc
 
export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
export HADOOP_HOME=/home/hadoop/hadoop
export HADOOP_INSTALL=$HADOOP_HOME
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export HADOOP_YARN_HOME=$HADOOP_HOME
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"
- Reload bash settings
 
source ~/.bashrc
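- Optionally confirm the expanded PATH resolves the Hadoop commands used below
 
which hadoop hdfs start-dfs.sh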
- Create node metadata directories
 
mkdir -p /home/hadoop/hadoop/hdfs/{namenode,datanode}
- Change to Hadoop configuration directory
 
cd hadoop/etc/hadoop
- Add the Java environment variable to the hadoop-env.sh file
 
export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
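- The line can also be appended from the shell (run from the hadoop/etc/hadoop directory entered above)
 
echo 'export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64' >> hadoop-env.sh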
- Edit configuration in core-site.xml
 
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://localhost:9000</value>
  </property>
</configuration>
- Edit configuration in hdfs-site.xml
 
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:///home/hadoop/hadoop/hdfs/namenode</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:///home/hadoop/hadoop/hdfs/datanode</value>
  </property>
</configuration>
- Edit configuration in mapred-site.xml
 
<configuration> 
  <property> 
    <name>mapreduce.framework.name</name> 
    <value>yarn</value> 
  </property>
  <property>
    <name>mapreduce.application.classpath</name>
    <value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
  </property>
</configuration>
- Edit configuration in yarn-site.xml
 
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.env-whitelist</name>
    <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME,PATH,LANG,TZ,HADOOP_MAPRED_HOME</value>
  </property>
</configuration>
- Switch to Hadoop user account
 
sudo su - hadoop
- Remove any previous node data and format the HDFS NameNode
 
cd ~
rm -R ~/hadoop/hdfs/namenode/*
rm -R ~/hadoop/hdfs/datanode/*
hdfs namenode -format
- Start the distributed file system (HDFS) daemons
 
start-dfs.sh
- Start the YARN resource manager and node manager
 
start-yarn.sh
- Check Java virtual machine process status
 
jps
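- With HDFS and YARN running, jps should list the following daemons, each prefixed by its process ID
 
NameNode
DataNode
SecondaryNameNode
ResourceManager
NodeManager
Jps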
- Access browser interfaces
 
 - Name node information: http://localhost:9870
 - Data node information: http://localhost:9864
 - All applications: http://localhost:8088
- Create directory, copy, and view files on HDFS
 
hdfs dfs -mkdir /input
hdfs dfs -ls /
hdfs dfs -put ~/input/* /input
hdfs dfs -cat /input/*
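- Files can also be copied back from HDFS to the local file system (the destination directory name here is just an example)
 
hdfs dfs -get /input ~/input-from-hdfs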
- Browse directory on interface: http://localhost:9870/explorer.html
 - Remove a file or directory, recursively removing any child files and subdirectories
 
hdfs dfs -rm -r /input
- Stop the distributed file system (HDFS) daemons
 
stop-dfs.sh
- Stop the YARN resource manager and node manager
 
stop-yarn.sh
- Put files on HDFS for processing
 
hdfs dfs -mkdir /input
hdfs dfs -put ~/hadoop/etc/hadoop/*.xml /input
- Use example MapReduce Java archive file to find strings starting with 'dfs'
 
hadoop jar ~/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.6.jar grep /input /output 'dfs[a-z.]+'
hdfs dfs -cat /output/*
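- MapReduce will not overwrite /output, so remove it on HDFS before re-running the grep example
 
hdfs dfs -rm -r /output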
- Compile WordCount2.java and create Java archive
 
hadoop com.sun.tools.javac.Main WordCount2.java
jar cf wc2.jar WordCount2*.class
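- If the com.sun.tools.javac.Main invocation fails on newer JDKs (tools.jar no longer ships with OpenJDK 11), a sketch of an alternative is compiling directly with javac against the Hadoop classpath, assuming WordCount2.java sits in the current directory
 
javac -classpath "$(hadoop classpath)" WordCount2.java
jar cf wc2.jar WordCount2*.class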
- Run word count MapReduce job on example files
 
hadoop fs -mkdir /input/example
hadoop fs -put ~/input/example/* /input/example
hadoop jar wc2.jar WordCount2 /input/example /wordcount/output
hadoop fs -cat /wordcount/output/*
- Re-run word count job with case sensitivity and pattern file
 
hadoop fs -put ~/wordcount/patterns.txt /wordcount
hadoop jar wc2.jar WordCount2 -Dwordcount.case.sensitive=true /input/example /wordcount/output2 -skip /wordcount/patterns.txt
hadoop fs -cat /wordcount/output2/*
- Troubleshooting a missing NameNode or DataNode: stop HDFS, clear the node directories, reformat, and restart
 
stop-dfs.sh
rm -R ~/hadoop/hdfs/namenode/*
rm -R ~/hadoop/hdfs/datanode/*
hdfs namenode -format
start-dfs.sh
jps
