The database data-masking system I have been developing recently uses multiple data sources, so today I set aside a dedicated virtual machine and deployed a big-data environment on it. I had done this deployment a long time ago; having done it again today, I am writing the steps down for future reference.
The installation packages used in this tutorial can be obtained by following the official WeChat account and sending the message hadoop.
I. Docker setup
Create the master container as the base
// Pull the ubuntu image
docker pull ubuntu
// List the local images
docker images
// Create a bridge network connecting the host, master, and the two slaves
docker network create -d bridge spark-net
docker network ls
// 10000: Hive (HiveServer2)
// 16030: HBase
// 8888: Spark web UI
// 7077: Spark master/worker communication
// 50070: Hadoop (HDFS NameNode web UI)
docker run -itd --name master --network spark-net -h master -p 10000:10000 -p 16030:16030 -p 8888:8888 -p 50070:50070 -p 7077:7077 ubuntu
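Once the container is up, the published port mappings can be double-checked from the host:
# show how the container's ports are mapped to the host
docker port master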
Create the hadoop user in the master container
// Enter the container
docker exec -it master /bin/bash
// Create the hadoop account
root@master:/# adduser hadoop
// Add the hadoop user to the hadoop group
root@master:/# usermod -a -G hadoop hadoop
root@master:/# cat /etc/group |grep hadoop
// Grant the hadoop user root privileges
// (the base ubuntu image ships without vim and sudo; as root, run apt update && apt install -y vim sudo first)
root@master:/# vim /etc/sudoers
// Add the following line to sudoers
hadoop ALL=(root) NOPASSWD:ALL
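Editing /etc/sudoers directly works; a drop-in file under /etc/sudoers.d is an equivalent, slightly safer alternative, shown here as a minimal sketch (the file name hadoop-nopasswd is arbitrary):
# write the rule to a drop-in file instead of editing /etc/sudoers itself
echo 'hadoop ALL=(root) NOPASSWD:ALL' > /etc/sudoers.d/hadoop-nopasswd
chmod 440 /etc/sudoers.d/hadoop-nopasswd
# check the sudoers syntax before relying on it
visudo -c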
Install and configure the JDK
// Switch to the hadoop user
su hadoop
// Install the Java JDK
sudo apt update
sudo apt install default-jdk
java -version
// Configure environment variables: open the .bashrc file
vim ~/.bashrc
// Append the following at the end of .bashrc; 1.8.0_333 is the installed Java version
export JAVA_HOME=/usr/java/jdk1.8.0_333
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export PATH=$JAVA_HOME/bin:$PATH
// Apply the environment variables
source ~/.bashrc
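Note that apt's default-jdk normally installs under /usr/lib/jvm, while the JAVA_HOME above points at /usr/java/jdk1.8.0_333, which assumes the Oracle JDK 8 tarball from the tutorial packages was unpacked there. To check where the java binary you are actually running lives before setting JAVA_HOME:
# resolve the real location of the java binary; JAVA_HOME is that path minus the trailing /bin/java
readlink -f "$(which java)"
# e.g. /usr/lib/jvm/java-11-openjdk-amd64/bin/java -> JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64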
Commit master as a base image to avoid repeating the installation
docker commit master base
// Create the slave1 container from the base image
docker run -itd --name slave1 --network spark-net -h slave1 base
// Create the slave2 container from the base image
docker run -itd --name slave2 --network spark-net -h slave2 base
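Before going further, it is worth confirming that all three containers sit on the spark-net bridge and can resolve each other by hostname; a quick check from the host (container names as created above):
# list the containers attached to the spark-net bridge
docker network inspect spark-net --format '{{range .Containers}}{{.Name}} {{end}}'
# hostname resolution from inside master (getent ships with the base ubuntu image)
docker exec master getent hosts slave1 slave2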
II. Passwordless SSH login
1. Install the SSH service on master
// Enter the container
docker exec -it master /bin/bash
// Switch to the hadoop user
su hadoop
// Install openssh
sudo apt-get install openssh-server
// Start ssh
sudo service ssh start
// Edit the configuration
sudo vim /etc/ssh/sshd_config
// Append the following at the end of the file
PasswordAuthentication yes
// Configure passwordless login
ssh-keygen -t rsa -P "" // Press Enter through the prompts; id_rsa and id_rsa.pub are generated under ~/.ssh
2. Install the SSH service on slave1
// Enter the container
docker exec -it slave1 /bin/bash
// Switch to the hadoop user
su hadoop
// Install openssh
sudo apt-get install openssh-server
// Start ssh
sudo service ssh start
// Edit the configuration
sudo vim /etc/ssh/sshd_config
// Append the following at the end of the file
PasswordAuthentication yes
// Configure passwordless login
ssh-keygen -t rsa -P "" // Press Enter through the prompts; id_rsa and id_rsa.pub are generated under ~/.ssh
// Copy slave1's id_rsa.pub to the master container
scp ~/.ssh/id_rsa.pub hadoop@master:/tmp/slave1_id_rsa.pub
3. Install the SSH service on slave2
// Enter the container
docker exec -it slave2 /bin/bash
// Switch to the hadoop user
su hadoop
// Install openssh
sudo apt-get install openssh-server
// Start ssh
sudo service ssh start
// Edit the configuration
sudo vim /etc/ssh/sshd_config
// Append the following at the end of the file
PasswordAuthentication yes
// Configure passwordless login
ssh-keygen -t rsa -P "" // Press Enter through the prompts; id_rsa and id_rsa.pub are generated under ~/.ssh
// Copy slave2's id_rsa.pub to the master container
scp ~/.ssh/id_rsa.pub hadoop@master:/tmp/slave2_id_rsa.pub
4. Set up passwordless login
Perform the following steps in the master container
mv /tmp/slave1_id_rsa.pub ~/.ssh/slave1_id_rsa.pub
mv /tmp/slave2_id_rsa.pub ~/.ssh/slave2_id_rsa.pub
Append id_rsa.pub, slave1_id_rsa.pub, and slave2_id_rsa.pub to the authorized_keys file
cd ~/.ssh
cat *.pub >>authorized_keys
chmod 664 authorized_keys
At this point you can run ssh master from the slave1 and slave2 containers to log in to master directly.
// Copy master's authorized_keys file to the slave1 and slave2 containers
scp authorized_keys hadoop@slave1:/tmp
scp authorized_keys hadoop@slave2:/tmp
In the slave1 container:
cp /tmp/authorized_keys ~/.ssh/authorized_keys
chmod 664 ~/.ssh/authorized_keys
In the slave2 container:
cp /tmp/authorized_keys ~/.ssh/authorized_keys
chmod 664 ~/.ssh/authorized_keys
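If ssh still prompts for a password after this, sshd's StrictModes check may be rejecting the group-writable authorized_keys; tightening the permissions on each container usually resolves it:
# run as the hadoop user on master, slave1 and slave2
chmod 700 ~/.ssh
chmod 600 ~/.ssh/authorized_keys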
With the steps above, the three containers can now log in to each other without passwords.
// Log in to slave1 directly
hadoop@master:~$ ssh slave1
// Log in to slave2 directly
hadoop@master:~$ ssh slave2
// slave1 and slave2 can also log in to master directly
hadoop@slave1:/$ ssh master
hadoop@slave2:/$ ssh master
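To verify all the pairs in one go, a small loop with BatchMode forces ssh to fail instead of prompting, so any host that still asks for a password shows up as an error:
# run as the hadoop user on each container; prints the remote hostname on success
for host in master slave1 slave2; do
  ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new "$host" hostname
done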
III. Hadoop installation
1. Install Hadoop in the master container
su hadoop
// Assumes hadoop-2.7.4.tar.gz has been copied to /usr/local; unpack it there first
cd /usr/local
sudo tar -zxvf hadoop-2.7.4.tar.gz
sudo mv hadoop-2.7.4 hadoop
sudo chmod -R 777 /usr/local/hadoop
// Configure environment variables
vim ~/.bashrc
// Append the following at the end of .bashrc
export HADOOP_INSTALL=/usr/local/hadoop
export PATH=$PATH:$HADOOP_INSTALL/bin
export PATH=$PATH:$HADOOP_INSTALL/sbin
export HADOOP_MAPRED_HOME=$HADOOP_INSTALL
export HADOOP_COMMON_HOME=$HADOOP_INSTALL
export HADOOP_HDFS_HOME=$HADOOP_INSTALL
export YARN_HOME=$HADOOP_INSTALL
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_INSTALL/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_INSTALL/lib"
// Apply the environment variables
source ~/.bashrc
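A quick sanity check that the new PATH entries are active:
# should print the Hadoop 2.7.4 version banner if the environment is set up correctly
hadoop version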
Set Hadoop's own environment variables in hadoop-env.sh
vim /usr/local/hadoop/etc/hadoop/hadoop-env.sh
// Append the following at the end of hadoop-env.sh
export JAVA_HOME=/usr/java/jdk1.8.0_333
export HADOOP=/usr/local/hadoop
export PATH=$PATH:/usr/local/hadoop/bin
Set the Hadoop YARN environment variable
vim /usr/local/hadoop/etc/hadoop/yarn-env.sh
JAVA_HOME=/usr/java/jdk1.8.0_333
Configure core-site.xml
cd /usr/local/hadoop
vim etc/hadoop/core-site.xml
Replace the <configuration> section with the following (HDFS connection settings)
<configuration>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>file:/usr/local/hadoop/tmp</value>
    <description>A base for other temporary directories.</description>
  </property>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://master:9000</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hadoop.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hadoop.groups</name>
    <value>*</value>
  </property>
</configuration>
Configure hdfs-site.xml
vim etc/hadoop/hdfs-site.xml
Replace the <configuration> section with the following (HDFS distributed file system settings)
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>2</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:/usr/local/hadoop/tmp/dfs/name</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:/usr/local/hadoop/tmp/dfs/data</value>
  </property>
  <property>
    <name>dfs.webhdfs.enabled</name>
    <value>true</value>
  </property>
</configuration>
Configure yarn-site.xml (YARN, the general resource management and scheduling platform)
vim etc/hadoop/yarn-site.xml
Replace the <configuration> section with the following
<configuration>
  <!-- Site specific YARN configuration properties -->
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
  <property>
    <name>yarn.resourcemanager.address</name>
    <value>master:8032</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.address</name>
    <value>master:8030</value>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.address</name>
    <value>master:8031</value>
  </property>
  <property>
    <name>yarn.resourcemanager.admin.address</name>
    <value>master:18141</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address</name>
    <value>master:18088</value>
  </property>
</configuration>
Configure the slaves file
vim etc/hadoop/slaves
// Add the following to the slaves file
slave1
slave2
2. Install Hadoop on slave1
// Copy the entire hadoop directory from master to slave1
scp -r /usr/local/hadoop hadoop@slave1:/usr/local
3. Install Hadoop on slave2
// Copy the entire hadoop directory from master to slave2
scp -r /usr/local/hadoop hadoop@slave2:/usr/local
4. Start and test
Start Hadoop (this can only be run on master)
// On a fresh installation, run the following command before the first start
hdfs namenode -format
// Start Hadoop
start-all.sh
// Test whether the Hadoop cluster is running correctly
hadoop@master:/usr/local/hadoop$ hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.4.jar pi 10 10
## The output looks like this
Number of Maps = 10
Samples per Map = 10
Wrote input for Map #0
Wrote input for Map #1
Wrote input for Map #2
Wrote input for Map #3
Wrote input for Map #4
Wrote input for Map #5
Wrote input for Map #6
Wrote input for Map #7
Wrote input for Map #8
Wrote input for Map #9
Starting Job
Job Finished in 1.246 seconds
Estimated value of Pi is 3.20000000000000000000
5. Monitoring
# Use jps to check which services are running on master and the slaves
jps
# Files in the distributed file system can be managed with the hdfs or hadoop command
# List the contents of the HDFS root directory
hdfs dfs -ls /
# A few common HDFS commands are shown below
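A few everyday HDFS commands as a minimal sketch (the paths are arbitrary examples):
# create a directory and upload a local file
hdfs dfs -mkdir -p /user/hadoop/input
hdfs dfs -put /etc/hosts /user/hadoop/input/
# read it back and check space usage
hdfs dfs -cat /user/hadoop/input/hosts
hdfs dfs -du -h /user/hadoop
# download a copy and clean up
hdfs dfs -get /user/hadoop/input/hosts /tmp/hosts.copy
hdfs dfs -rm -r /user/hadoop/input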
Hadoop web UI
http://ip:50070/dfshealth.html#tab-overview
IV. Spark installation
1. Install on master
tar -zxvf scala-2.11.8.tgz
tar -zxvf spark-2.4.7-bin-hadoop2.7.tgz
mv scala-2.11.8 /usr/local/
mv spark-2.4.7-bin-hadoop2.7 /usr/local
# Environment variables
vim ~/.bashrc
# Append the following at the end of .bashrc
export SCALA_HOME=/usr/local/scala-2.11.8
export SPARK_HOME=/usr/local/spark-2.4.7-bin-hadoop2.7
export PATH=$PATH:$SCALA_HOME/bin:$SPARK_HOME/bin:$SPARK_HOME/sbin
# Apply the environment variables
source ~/.bashrc
Modify the configuration
cd /usr/local/spark-2.4.7-bin-hadoop2.7/conf/
mv spark-env.sh.template spark-env.sh
mv slaves.template slaves
Edit spark-env.sh as follows
SPARK_MASTER_WEBUI_PORT=8888
export SPARK_HOME=/usr/local/spark-2.4.7-bin-hadoop2.7
export HADOOP_HOME=/usr/local/hadoop
export MASTER=spark://master:7077
export SCALA_HOME=/usr/local/scala-2.11.8
export SPARK_MASTER_HOST=master
export JAVA_HOME=/usr/java/jdk1.8.0_333
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_WORKER_MEMORY=4G
Edit the slaves file
slave1
slave2
2. Install on slave1
# Copy Spark from master to slave1
scp -r /usr/local/spark-2.4.7-bin-hadoop2.7 hadoop@slave1:/usr/local/
# Copy Scala to slave1
scp -r /usr/local/scala-2.11.8 hadoop@slave1:/usr/local/
Configure environment variables
# Environment variables
vim ~/.bashrc
# Append the following at the end of .bashrc
export SCALA_HOME=/usr/local/scala-2.11.8
export SPARK_HOME=/usr/local/spark-2.4.7-bin-hadoop2.7
export PATH=$PATH:$SCALA_HOME/bin:$SPARK_HOME/bin:$SPARK_HOME/sbin
# Apply the environment variables
source ~/.bashrc
3. Install on slave2
# Copy Spark from master to slave2
scp -r /usr/local/spark-2.4.7-bin-hadoop2.7 hadoop@slave2:/usr/local/
# Copy Scala to slave2
scp -r /usr/local/scala-2.11.8 hadoop@slave2:/usr/local/
Configure environment variables
# Environment variables
vim ~/.bashrc
# Append the following at the end of .bashrc
export SCALA_HOME=/usr/local/scala-2.11.8
export SPARK_HOME=/usr/local/spark-2.4.7-bin-hadoop2.7
export PATH=$PATH:$SCALA_HOME/bin:$SPARK_HOME/bin:$SPARK_HOME/sbin
# Apply the environment variables
source ~/.bashrc
4. Start Spark
# Start Spark
/usr/local/spark-2.4.7-bin-hadoop2.7/sbin/start-all.sh
# Start only the master
/usr/local/spark-2.4.7-bin-hadoop2.7/sbin/start-master.sh
# Start a worker node and register it with the master
/usr/local/spark-2.4.7-bin-hadoop2.7/sbin/start-slave.sh spark://master:7077
# Use spark-shell for interactive work in Scala
/usr/local/spark-2.4.7-bin-hadoop2.7/bin/spark-shell
# If Hadoop is installed and its sbin directory is on the PATH, the bare start-all.sh below resolves to Hadoop's script; once YARN is running, Spark jobs can also be submitted to YARN without starting the standalone cluster
start-all.sh
# If startup succeeded, jps shows a Master process on master and Worker processes on slave1 and slave2
hadoop@master:/usr/local/spark-2.4.7-bin-hadoop2.7/sbin$ jps
275685 Jps
7043 RunJar
2915 NameNode
17028 HMaster
3111 SecondaryNameNode
3576 Master # master
3276 ResourceManager
7293 RunJar
207500 QuorumPeerMain
Check Spark's status in the web UI at http://ip:8888/
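To confirm the standalone cluster actually accepts jobs, the bundled SparkPi example can be submitted to the master (the examples jar name below matches the 2.4.7 / Scala 2.11 distribution):
# submit the example Pi job to the standalone master
/usr/local/spark-2.4.7-bin-hadoop2.7/bin/spark-submit \
  --master spark://master:7077 \
  --class org.apache.spark.examples.SparkPi \
  /usr/local/spark-2.4.7-bin-hadoop2.7/examples/jars/spark-examples_2.11-2.4.7.jar 100
# a line like "Pi is roughly 3.14..." near the end of the output indicates success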
This article was originally published on the WeChat official account 数据安全治理技术: Installing Hadoop and Spark for a big-data environment.