Spark集群环境搭建 - xingzhihe/hello-world GitHub Wiki
ip | hostname | description |
---|---|---|
192.168.80.1 | node-801 | NameNode,ResourceManager,Worker |
192.168.80.10 | node-8010 | SecondaryNameNode,DataNode,Worker |
192.168.80.20 | node-8020 | DataNode,Master |
192.168.80.30 | node-8030 | DataNode,Worker |
192.168.80.40 | node-8040 | DataNode,Worker |
操作系统:CentOS 7.1
Install lrzsz: yum install -y lrzsz
ssh-keygen -t rsa
cat id_rsa.pub >> authorized_keys
ssh root@192.168.80.30
scp id_rsa.pub root@192.168.80.1:/root/.ssh/id_rsa_30.pub
exit
cat id_rsa_30.pub >> authorized_keys
jdk-8u152-linux-x64 scala-2.11.8 hadoop-2.7.4 spark-2.1.2-bin-hadoop2.7
tar -xzvf jdk-8u152-linux-x64.tar.gz
vi /etc/profile
export JAVA_HOME=/home/jdk1.8.0_152
export PATH=$JAVA_HOME/bin:$PATH
source /etc/profile
java -version
tar -xzvf scala-2.11.8.tgz
vi /etc/profile
export JAVA_HOME=/home/jdk1.8.0_152
export SCALA_HOME=/home/scala-2.11.8
export PATH=$JAVA_HOME/bin:$SCALA_HOME/bin:$PATH
source /etc/profile
scala -version
tar -xzvf hadoop-2.7.4.tar.gz
mkdir -p /data/hadoop/tmp /data/hadoop/hdfs/data /data/hadoop/hdfs/name
vi core-site.xml
<property>
<name>fs.defaultFS</name>
<value>hdfs://192.168.80.1:9000</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>file:/data/hadoop/tmp</value>
</property>
<property>
<name>io.file.buffer.size</name>
<value>131702</value>
</property>
vi hdfs-site.xml
<property>
<name>dfs.namenode.name.dir</name>
<value>file:/data/hadoop/hdfs/name</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:/data/hadoop/hdfs/data</value>
</property>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>192.168.80.10:9001</value>
</property>
<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
<property>
<name>dfs.namenode.datanode.registration.ip-hostname-check</name>
<value>false</value>
</property>
vi mapred-site.xml
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>192.168.80.1:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>192.168.80.1:19888</value>
</property>
vi yarn-site.xml
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<property>
<name>yarn.resourcemanager.address</name>
<value>192.168.80.1:8032</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address</name>
<value>192.168.80.1:8030</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address</name>
<value>192.168.80.1:8031</value>
</property>
<property>
<name>yarn.resourcemanager.admin.address</name>
<value>192.168.80.1:8033</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address</name>
<value>192.168.80.1:8088</value>
</property>
vi hadoop-env.sh
export JAVA_HOME=/home/jdk1.8.0_152
vi yarn-env.sh
export JAVA_HOME=/home/jdk1.8.0_152
vi slaves
192.168.80.10
192.168.80.20
192.168.80.30
192.168.80.40
bin/hdfs namenode -format
sbin/start-all.sh
or
sbin/start-dfs.sh
sbin/start-yarn.sh
systemctl status firewalld.service
systemctl stop firewalld.service
systemctl disable firewalld.service
http://192.168.80.1:8088/
http://192.168.80.1:50070/
tar -xzvf spark-2.1.2-bin-hadoop2.7.tgz
vi /etc/profile
export JAVA_HOME=/home/jdk1.8.0_152
export SCALA_HOME=/home/scala-2.11.8
export HADOOP_HOME=/home/hadoop-2.7.4
export SPARK_HOME=/home/spark-2.1.2-bin-hadoop2.7
export PATH=$JAVA_HOME/bin:$SCALA_HOME/bin:$HADOOP_HOME/sbin:$SPARK_HOME/bin:$PATH
source /etc/profile
cp spark-env.sh.template spark-env.sh
cp slaves.template slaves
vi spark-env.sh
#指定JAVA_HOME位置
export JAVA_HOME=/home/jdk1.8.0_152
#指定spark老大Master的主机地址(Spark 2.x 中 SPARK_MASTER_IP 已废弃,改用 SPARK_MASTER_HOST)
export SPARK_MASTER_HOST=192.168.80.20
#指定spark老大Master的端口
export SPARK_MASTER_PORT=7077
#指定可用的CPU内核数量(默认:所有可用,实际使用时没有配置最后这两个参数)
export SPARK_WORKER_CORES=4
#作业可使用的内存容量,默认格式为1000m或者2g(默认:所有RAM去掉给操作系统用的1GB)
export SPARK_WORKER_MEMORY=2g
vi slaves
192.168.80.1
192.168.80.10
192.168.80.30
192.168.80.40
sbin/start-all.sh
or
sbin/start-master.sh -h 192.168.80.20
sbin/start-slave.sh spark://192.168.80.20:7077
bin/spark-shell --master spark://192.168.80.20:7077
sc.textFile("hdfs://192.168.80.1:9000/softwares/集群环境搭建.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).saveAsTextFile("hdfs://192.168.80.1:9000/output")
tar -xzvf sqoop-1.99.7-bin-hadoop200.tar.gz
mkdir extra
cp mysql-connector-java-6.0.5.jar extra/
vi conf/sqoop.properties
org.apache.sqoop.submission.engine.mapreduce.configuration.directory=/home/hadoop-2.7.4/etc/hadoop
org.apache.sqoop.security.authentication.type=SIMPLE
org.apache.sqoop.security.authentication.handler=org.apache.sqoop.security.authentication.SimpleAuthenticationHandler
org.apache.sqoop.security.authentication.anonymous=true
vi /home/hadoop-2.7.4/etc/hadoop/core-site.xml
<property>
<name>hadoop.proxyuser.root.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.root.groups</name>
<value>*</value>
</property>
bin/sqoop2-tool verify
bin/sqoop2-server start
#创建 link和job 参考文档:https://yq.aliyun.com/articles/73582
bin/sqoop2-shell
sqoop:000> set server --host localhost --port 12000 --webapp sqoop
sqoop:000> show version --all
sqoop:000> set option --name verbose --value true
sqoop:000> show connector
sqoop:000> create link -c generic-jdbc-connector ( mysql_company_link)
sqoop:000> create link -c hdfs-connector ( hdfs_gs_company_link)
sqoop:000> create job -f "mysql_company_link" -t "hdfs_gs_company_link" ( job_gs_company)
sqoop:000> start job -n job_gs_company
#启动JobHistory服务
/home/hadoop-2.7.4/sbin/mr-jobhistory-daemon.sh start historyserver