Spark Cluster Environment Setup - xingzhihe/hello-world GitHub Wiki

Environment

IP            Hostname    Description
192.168.80.1  node-801    NameNode, ResourceManager, Worker
192.168.80.10 node-8010   SecondaryNameNode, DataNode, Worker
192.168.80.20 node-8020   DataNode, Master
192.168.80.30 node-8030   DataNode, Worker
192.168.80.40 node-8040   DataNode, Worker

  OS: CentOS 7.1
  Install lrzsz (used to upload files with rz/sz): yum install -y lrzsz
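If the nodes are to be reachable by hostname as well as by IP, it helps to map them in /etc/hosts on every machine; a minimal sketch based on the table above:

vi /etc/hosts
    192.168.80.1  node-801
    192.168.80.10 node-8010
    192.168.80.20 node-8020
    192.168.80.30 node-8030
    192.168.80.40 node-8040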

Preparation

Passwordless SSH login

ssh-keygen -t rsa                                    # generate a key pair under ~/.ssh (accept the defaults)
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys      # trust the local key

# on each remote node (node-8030 shown here): generate its key pair the same way,
# then copy its public key back to node-801 and append it to authorized_keys there
ssh [email protected]
scp ~/.ssh/id_rsa.pub [email protected]:/root/.ssh/id_rsa_30.pub
exit
cat /root/.ssh/id_rsa_30.pub >> /root/.ssh/authorized_keys
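Instead of the scp/cat round trip above, the keys can also be pushed with ssh-copy-id; a sketch, assuming root SSH access, run on whichever node needs passwordless access to the others (e.g. node-801), adjusting the host list accordingly:

    for host in 192.168.80.10 192.168.80.20 192.168.80.30 192.168.80.40; do
        ssh-copy-id root@$host    # appends the local public key to the remote authorized_keys
    done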

Software downloads

    jdk-8u152-linux-x64
    scala-2.11.8
    hadoop-2.7.4
    spark-2.1.2-bin-hadoop2.7
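The Apache and Scala tarballs can be fetched directly; the URLs below point at the public archive locations and may change over time. The Oracle JDK has to be downloaded manually and can then be uploaded with rz from the lrzsz package installed above:

    cd /home
    wget https://archive.apache.org/dist/hadoop/common/hadoop-2.7.4/hadoop-2.7.4.tar.gz
    wget https://archive.apache.org/dist/spark/spark-2.1.2/spark-2.1.2-bin-hadoop2.7.tgz
    wget https://downloads.lightbend.com/scala/2.11.8/scala-2.11.8.tgz
    rz    # upload jdk-8u152-linux-x64.tar.gz interactively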

JDK & SCALA

tar -xzvf jdk-8u152-linux-x64.tar.gz    # extract under /home so that JAVA_HOME below matches

vi /etc/profile
    export JAVA_HOME=/home/jdk1.8.0_152
    export PATH=$JAVA_HOME/bin:$PATH
source /etc/profile

java -version



tar -xzvf scala-2.11.8.tgz

vi /etc/profile
    export JAVA_HOME=/home/jdk1.8.0_152
    export SCALA_HOME=/home/scala-2.11.8
    export PATH=$JAVA_HOME/bin:$SCALA_HOME/bin:$PATH
source /etc/profile

scala -version
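The same JDK and Scala setup is needed on every node. One way to replicate it, assuming the same /home layout everywhere, is to copy the unpacked directories and the profile changes to the other hosts and then run source /etc/profile there:

    for host in 192.168.80.10 192.168.80.20 192.168.80.30 192.168.80.40; do
        scp -r /home/jdk1.8.0_152 /home/scala-2.11.8 root@$host:/home/
        scp /etc/profile root@$host:/etc/profile
    done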

HADOOP

tar -xzvf hadoop-2.7.4.tar.gz
mkdir -p /data/hadoop/tmp /data/hadoop/hdfs/data /data/hadoop/hdfs/name
# the configuration files edited below are under /home/hadoop-2.7.4/etc/hadoop

vi core-site.xml
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://192.168.80.1:9000</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>file:/data/hadoop/tmp</value>
    </property>
    <property>
        <name>io.file.buffer.size</name>
        <value>131702</value>
    </property>
vi hdfs-site.xml
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:/data/hadoop/hdfs/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:/data/hadoop/hdfs/data</value>
    </property>
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
    <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>192.168.80.10:9001</value>
    </property>
    <property>
        <name>dfs.webhdfs.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
        <value>false</value>
    </property>
vi mapred-site.xml
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.address</name>
        <value>192.168.80.1:10020</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.webapp.address</name>
        <value>192.168.80.1:19888</value>
    </property>
vi yarn-site.xml
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>
    <property>
        <name>yarn.resourcemanager.address</name>
        <value>192.168.80.1:8032</value>
    </property>
    <property>
        <name>yarn.resourcemanager.scheduler.address</name>
        <value>192.168.80.1:8030</value>
    </property>
    <property>
        <name>yarn.resourcemanager.resource-tracker.address</name>
        <value>192.168.80.1:8031</value>
    </property>
    <property>
        <name>yarn.resourcemanager.admin.address</name>
        <value>192.168.80.1:8033</value>
    </property>
    <property>
        <name>yarn.resourcemanager.webapp.address</name>
        <value>192.168.80.1:8088</value>
    </property>
vi hadoop-env.sh
    export JAVA_HOME=/home/jdk1.8.0_152
vi yarn-env.sh
    export JAVA_HOME=/home/jdk1.8.0_152
vi slaves
    192.168.80.10
    192.168.80.20
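The configured Hadoop directory and the /data directories must also exist on the DataNode hosts listed in slaves before the NameNode is formatted; a sketch, assuming identical paths on every node (use scp -r if rsync is not installed):

    for host in 192.168.80.10 192.168.80.20; do
        rsync -a /home/hadoop-2.7.4 root@$host:/home/
        ssh root@$host "mkdir -p /data/hadoop/tmp /data/hadoop/hdfs/data /data/hadoop/hdfs/name"
    done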
bin/hdfs namenode -format    # run once, on the NameNode (node-801)
sbin/start-all.sh

or

sbin/start-dfs.sh
sbin/start-yarn.sh

# make sure firewalld is stopped (and disabled) so the RPC ports and web UIs are reachable from the other nodes
systemctl status firewalld.service
systemctl stop firewalld.service
systemctl disable firewalld.service

http://192.168.80.1:8088/      (YARN ResourceManager web UI)
http://192.168.80.1:50070/     (HDFS NameNode web UI)
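Once HDFS is up, a quick smoke test confirms the DataNodes have registered and stages the input file used by the Spark word count later on (the file name is just the example from that step; any local text file works):

    bin/hdfs dfsadmin -report                       # should list the live DataNodes
    bin/hdfs dfs -mkdir -p /softwares
    bin/hdfs dfs -put 集群环境搭建.txt /softwares/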

SPARK

tar -xzvf spark-2.1.2-bin-hadoop2.7.tgz

vi /etc/profile
    export JAVA_HOME=/home/jdk1.8.0_152
    export SCALA_HOME=/home/scala-2.11.8
    export HADOOP_HOME=/home/hadoop-2.7.4
    export SPARK_HOME=/home/spark-2.1.2-bin-hadoop2.7
    export PATH=$JAVA_HOME/bin:$SCALA_HOME/bin:$HADOOP_HOME/sbin:$SPARK_HOME/bin:$PATH

source /etc/profile

# in /home/spark-2.1.2-bin-hadoop2.7/conf
cp spark-env.sh.template spark-env.sh
cp slaves.template slaves

vi spark-env.sh
    # JAVA_HOME location
    export JAVA_HOME=/home/jdk1.8.0_152
    # IP of the Spark Master
    export SPARK_MASTER_IP=192.168.80.20
    # port of the Spark Master
    export SPARK_MASTER_PORT=7077
    # number of CPU cores a worker may use (default: all available; the last two settings were not actually configured in this deployment)
    export SPARK_WORKER_CORES=4
    # memory a worker may give to jobs, e.g. 1000m or 2g (default: total RAM minus 1 GB reserved for the OS)
    export SPARK_WORKER_MEMORY=2g
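Note that SPARK_MASTER_IP is deprecated in Spark 2.x in favour of SPARK_MASTER_HOST; if the deprecation warning is a concern, the equivalent setting would be:

    export SPARK_MASTER_HOST=192.168.80.20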
vi slaves
    192.168.80.1
    192.168.80.10
    192.168.80.30
    192.168.80.40
sbin/start-all.sh

or

sbin/start-master.sh -h 192.168.80.20              # on node-8020
sbin/start-slave.sh spark://192.168.80.20:7077     # on each worker node
bin/spark-shell --master spark://192.168.80.20:7077
sc.textFile("hdfs://192.168.80.1:9000/softwares/集群环境搭建.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).saveAsTextFile("hdfs://192.168.80.1:9000/output")
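If the job succeeds, the result can be read back from HDFS (assuming the /output path used above):

    /home/hadoop-2.7.4/bin/hdfs dfs -ls /output
    /home/hadoop-2.7.4/bin/hdfs dfs -cat /output/part-*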

http://192.168.80.20:8080/     (Spark Master web UI)

Sqoop

tar -xzvf sqoop-1.99.7-bin-hadoop200.tar.gz
mkdir extra                                     # directory for third-party jars
cp mysql-connector-java-6.0.5.jar extra/        # MySQL JDBC driver used by the generic-jdbc-connector

vi conf/sqoop.properties
    org.apache.sqoop.submission.engine.mapreduce.configuration.directory=/home/hadoop-2.7.4/etc/hadoop

    org.apache.sqoop.security.authentication.type=SIMPLE  
    org.apache.sqoop.security.authentication.handler=org.apache.sqoop.security.authentication.SimpleAuthenticationHandler  
    org.apache.sqoop.security.authentication.anonymous=true 
vi /home/hadoop-2.7.4/etc/hadoop/core-site.xml
    <property>
      <name>hadoop.proxyuser.root.hosts</name>
      <value>*</value>
    </property>
    <property>
      <name>hadoop.proxyuser.root.groups</name>
      <value>*</value>
    </property>
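The proxyuser settings only take effect after Hadoop re-reads core-site.xml, so restart the cluster before verifying Sqoop:

    /home/hadoop-2.7.4/sbin/stop-all.sh
    /home/hadoop-2.7.4/sbin/start-all.sh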
bin/sqoop2-tool verify
bin/sqoop2-server start
# create the links and the job; reference: https://yq.aliyun.com/articles/73582
bin/sqoop2-shell
sqoop:000> set server --host localhost --port 12000 --webapp sqoop
sqoop:000> show version --all
sqoop:000> set option --name verbose --value true
sqoop:000> show connector
sqoop:000> create link -c generic-jdbc-connector                          (link name: mysql_company_link)
sqoop:000> create link -c hdfs-connector                                  (link name: hdfs_gs_company_link)
sqoop:000> create job -f "mysql_company_link" -t "hdfs_gs_company_link"   (job name: job_gs_company)
sqoop:000> start job -n job_gs_company
# start the JobHistory server
/home/hadoop-2.7.4/sbin/mr-jobhistory-daemon.sh start historyserver
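With the JobHistory server running, its web UI should be reachable at the address configured in mapred-site.xml above:

http://192.168.80.1:19888/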