Hadoop Map Reduce
This walkthrough runs a word-count job with Hadoop Streaming, using a Python mapper and reducer. Create the reducer script first:
nano ~/reducer.py
Paste:
#!/usr/bin/env python3
"""reducer.py"""
import sys

current_word = None
current_count = 0

# Input arrives sorted by key (word), so equal words are adjacent.
for line in sys.stdin:
    line = line.strip()
    try:
        word, count = line.split('\t', 1)
        count = int(count)
    except ValueError:
        # Skip malformed lines.
        continue
    if current_word == word:
        current_count += count
    else:
        if current_word is not None:
            # Emit the total for the previous word.
            print('%s\t%s' % (current_word, current_count))
        current_word = word
        current_count = count

# Emit the last word, if any.
if current_word is not None:
    print('%s\t%s' % (current_word, current_count))
Now create the mapper:
nano ~/mapper.py
Paste:
#!/usr/bin/env python3
"""mapper.py"""
import sys

for line in sys.stdin:
    line = line.strip()
    words = line.split()
    for word in words:
        # Emit each word with a count of 1, tab-separated.
        print('%s\t1' % word)
Make both scripts executable:
chmod +x mapper.py reducer.py
Download sample text:
wget https://www.gutenberg.org/files/27045/27045.txt
Put the file into HDFS and verify it is there:
hadoop dfs -put 27045.txt /
hadoop dfs -ls /
echo "foo foo quux labs foo bar quux" | ./mapper.py
cat 27045.txt | ./mapper.py
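Because Hadoop sorts the mapper output by key before it reaches the reducer, a local test of the full pipeline needs a sort step in between. A quick end-to-end check (a sketch that assumes mapper.py, reducer.py, and 27045.txt are all in the current directory) might look like this:

# Simulate the shuffle/sort phase locally, then reduce
echo "foo foo quux labs foo bar quux" | ./mapper.py | sort -k1,1 | ./reducer.py

# Same idea on the downloaded text, showing the ten most frequent words
cat 27045.txt | ./mapper.py | sort -k1,1 | ./reducer.py | sort -k2,2nr | head -10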
Edit the MapReduce configuration:
nano $HADOOP_HOME/etc/hadoop/mapred-site.xml
Add the following properties if they are missing; without them the job may fail because the YARN containers cannot locate the MapReduce framework (HADOOP_MAPRED_HOME):
<property>
  <name>yarn.app.mapreduce.am.env</name>
  <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
</property>
<property>
  <name>mapreduce.map.env</name>
  <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
</property>
<property>
  <name>mapreduce.reduce.env</name>
  <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
</property>
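Before submitting, it is worth confirming that HADOOP_HOME is set and that the streaming jar referenced in the next command exists; the version in the jar name must match the installed Hadoop release (3.2.1 here):

# Check the Hadoop install location and locate the streaming jar
echo $HADOOP_HOME
ls $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar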
Run the streaming job (the -file options ship the scripts to the cluster with the job):
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.2.1.jar \
-file ./mapper.py -mapper ./mapper.py \
-file ./reducer.py -reducer ./reducer.py \
-input /27045.txt -output /gutenberg
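On recent Hadoop releases the -file option is deprecated in favour of the generic -files option; an equivalent submission (a sketch, assuming mapper.py and reducer.py are executable in the current directory) would be:

hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.2.1.jar \
    -files mapper.py,reducer.py \
    -mapper mapper.py -reducer reducer.py \
    -input /27045.txt -output /gutenberg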
To rerun, clear previous output:
hadoop dfs -rm -r -f /gutenberg
Inspect the output:
hadoop dfs -cat /gutenberg/part-00000
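To see which words occur most often, sort the output by the count column, for example:

hadoop dfs -cat /gutenberg/part-00000 | sort -k2,2nr | head -20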
The same steps can be run from a Jupyter/Colab notebook by prefixing shell commands with !. Each ! line runs in its own shell, so sourcing ~/.bashrc does not persist between commands; the cells below therefore call hadoop by its full path:
!source ~/.bashrc
!~/hadoop/bin/hadoop
!~/hadoop/bin/hadoop dfs -ls /
!~/hadoop/bin/hadoop dfs -rm -r -f /gutenberg
!~/hadoop/bin/hadoop jar ~/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.2.1.jar \
-file ./mapper.py -mapper ./mapper.py \
-file ./reducer.py -reducer ./reducer.py \
-input /27045.txt -output /gutenberg
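If the job succeeds, the results can be viewed from the notebook in the same way (assuming the same ~/hadoop install path):

!~/hadoop/bin/hadoop dfs -cat /gutenberg/part-00000 | head -20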