Hadoop

Building the native library

protobuf 2.5.0 (Hadoop 2.x requires exactly this protoc version)

wget 'https://github.com/google/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.gz'
tar xzvf protobuf-2.5.0.tar.gz
cd protobuf-2.5.0
./configure
make && sudo make install
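
A quick sanity check that the freshly built compiler is the one on the PATH (ldconfig refreshes the linker cache so protoc finds the just-installed libprotobuf):

sudo ldconfig
protoc --version
# expected output: libprotoc 2.5.0
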
hadoop 2.6.2

sudo apt-get install maven protobuf-compiler build-essential g++ autoconf automake libtool cmake zlib1g-dev pkg-config libssl-dev
wget 'http://ftp.piotrkosoft.net/pub/mirrors/ftp.apache.org/hadoop/common/hadoop-2.6.2/hadoop-2.6.2-src.tar.gz'
tar zxf hadoop-2.6.2-src.tar.gz
cd hadoop-2.6.2-src
export JAVA_HOME=/usr/lib/jvm/java-7-oracle
mvn package -Pdist,native -DskipTests -Dtar
cp ./hadoop-dist/target/hadoop-2.6.2/lib/native/* /opt/hadoop/lib/native/
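
If the build succeeded, Hadoop itself can confirm that the native libraries load (assuming hadoop resolves to the /opt/hadoop install):

/opt/hadoop/bin/hadoop checknative -a
# each entry (hadoop, zlib, snappy, ...) should report true with a library path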

Hadoop config

etc/hadoop/core-site.xml:

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:8020</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/storage/hadoop</value>
    </property>
    <property>
        <name>dfs.client.read.shortcircuit</name>
        <value>true</value>
    </property>
    <property>
        <name>dfs.client.read.shortcircuit.skip.checksum</name>
        <value>false</value>
    </property>
</configuration>
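
Note: short-circuit reads additionally need a UNIX domain socket shared by the DataNode and its clients; without dfs.domain.socket.path they fall back to ordinary remote reads. The socket path below is an assumption (adjust it, and make sure its parent directory exists); the property is usually kept next to the other HDFS settings in hdfs-site.xml:

    <property>
        <name>dfs.domain.socket.path</name>
        <value>/var/lib/hadoop-hdfs/dn_socket</value>
    </property>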

etc/hadoop/hdfs-site.xml:

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
    <property>
        <name>dfs.webhdfs.enabled</name>
        <value>true</value>
    </property>
    <property>
         <name>dfs.namenode.http-address</name>
         <value>localhost:50070</value>
    </property>
    <property>
         <name>dfs.namenode.secondary.http-address</name>
         <value>localhost:50090</value>
    </property>
    <property>
         <name>dfs.datanode.max.locked.memory</name>
         <value>8000000000</value>
    </property>
    <property>
        <name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
        <value>true</value>
    </property>
</configuration>
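
dfs.datanode.max.locked.memory is in bytes (~8 GB above) and only works if the DataNode user is allowed to lock that much memory; otherwise the DataNode refuses to start. A sketch of raising the limit, assuming the DataNode runs as user hdfs:

ulimit -l    # current memlock limit in KB, run as the DataNode user
# in /etc/security/limits.conf:
#   hdfs - memlock unlimited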

Operations

format: bin/hdfs namenode -format

start: sbin/start-dfs.sh

stop: sbin/stop-dfs.sh

list nodes: bin/hdfs dfsadmin -report

URLs

namenode: http://localhost:50070/

datanode: http://localhost:50075/

webhdfs: http://localhost:50070/webhdfs/v1/?op=GETHOMEDIRECTORY
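
WebHDFS answers plain HTTP, so curl is enough for quick checks; with security off, the user is passed as a query parameter (hdfs below is an assumption):

curl -i 'http://localhost:50070/webhdfs/v1/?op=LISTSTATUS&user.name=hdfs'
curl -i 'http://localhost:50070/webhdfs/v1/var/tmp?op=GETFILESTATUS&user.name=hdfs'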

Mount hdfs

ubuntu >= 14:

wget https://archive.cloudera.com/cdh5/ubuntu/trusty/amd64/cdh/archive.key -O archive.key
sudo apt-key add archive.key
sudo wget 'https://archive.cloudera.com/cdh5/ubuntu/trusty/amd64/cdh/cloudera.list' \
    -O /etc/apt/sources.list.d/cloudera.list

debian wheezy:

wget https://archive.cloudera.com/cdh5/debian/wheezy/amd64/cdh/archive.key -O archive.key
sudo apt-key add archive.key
sudo wget 'https://archive.cloudera.com/cdh5/debian/wheezy/amd64/cdh/cloudera.list' \
    -O /etc/apt/sources.list.d/cloudera.list 
then, on either distro:

sudo apt-get update
sudo apt-get install hadoop-hdfs-fuse
mkdir ~/hdfs
sudo hadoop-fuse-dfs dfs://localhost:8020 ~/hdfs
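
A quick check that the mount took; df should list the fuse_dfs filesystem and ls the HDFS root:

df -h ~/hdfs
ls ~/hdfs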

/etc/fstab:

hadoop-fuse-dfs#dfs://localhost:8020 /storage/hdfs fuse allow_other,usetrash,rw 2 0

Reset hdfs

sbin/stop-dfs.sh
rm -rf /storage/hadoop/*
bin/hdfs namenode -format

List directory

/opt/hadoop/bin/hadoop fs -ls /var/tmp/syncanalyst/ranks

Get file

/opt/hadoop/bin/hadoop fs -cat /var/tmp/syncanalyst/ranks/part-r-02068-35e5e2a6-0a0f-4fd0-8f26-82b481fa2ba8.snappy.parquet
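
Note that -cat streams the raw parquet bytes to stdout; to copy the file to the local filesystem instead:

/opt/hadoop/bin/hadoop fs -get /var/tmp/syncanalyst/ranks/part-r-02068-35e5e2a6-0a0f-4fd0-8f26-82b481fa2ba8.snappy.parquet /tmp/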

Add file to cache

/opt/hadoop/bin/hdfs cacheadmin -addDirective -path /myfile -pool testPool
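
The pool must exist before -addDirective can point at it, and -listDirectives shows what is currently cached:

/opt/hadoop/bin/hdfs cacheadmin -addPool testPool
/opt/hadoop/bin/hdfs cacheadmin -listDirectives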

Parquet tools

git clone git@github.com:Parquet/parquet-mr.git
cd parquet-mr
mvn install
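
The build leaves a runnable jar under parquet-tools/target (add -DskipTests to mvn if the test run gets in the way). A usage sketch, with the exact jar version and the local file path depending on your checkout and the -get above:

java -jar parquet-tools/target/parquet-tools-*.jar schema /tmp/part-r-02068-35e5e2a6-0a0f-4fd0-8f26-82b481fa2ba8.snappy.parquet
java -jar parquet-tools/target/parquet-tools-*.jar head -n 5 /tmp/part-r-02068-35e5e2a6-0a0f-4fd0-8f26-82b481fa2ba8.snappy.parquet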