Kubernetes Cluster on 4 Ubuntu servers
- All 4 servers are running Ubuntu 18.04.3 LTS
- You have access to all 4 servers via SSH or Out Of Band management
- /dev/sda1 is used as the EFI boot partition and is mounted at /boot/efi
- /dev/sda2 is used as the root file system and is mounted at /
- /dev/sdb is used as a local disk and is mounted at /mnt/sdb
- All other HDDs and SSDs will be used as Ceph OSDs (steps included in this guide); you can verify the disk layout as shown below
Use the root account for all steps.
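Before starting, you may want to confirm that the disk layout on each server matches the assumptions above (device names can differ between machines):
lsblk -o NAME,SIZE,TYPE,MOUNTPOINT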
Disable cloud-init network config on all nodes
echo "network: {config: disabled}" > /etc/cloud/cloud.cfg.d/99-disable-network-config.cfg
Edit /etc/netplan/50-cloud-init.yaml of each node
kube1:
nano /etc/netplan/50-cloud-init.yaml
network:
    ethernets:
        eno4:
            dhcp4: true
        enp0s20f0u1u6:
            dhcp4: true
        ens3f0:
            dhcp4: false
            addresses: [10.0.1.1/24]
        ens3f1:
            dhcp4: false
            addresses: [10.1.1.1/24]
    version: 2
kube2:
nano /etc/netplan/50-cloud-init.yaml
network:
    ethernets:
        eno4:
            dhcp4: true
        enp0s20f0u1u6:
            dhcp4: true
        ens3f0:
            dhcp4: false
            addresses: [10.0.1.2/24]
        ens3f1:
            dhcp4: false
            addresses: [10.1.1.2/24]
    version: 2
kube3:
nano /etc/netplan/50-cloud-init.yaml
network:
    ethernets:
        eno4:
            dhcp4: true
        enp0s20f0u1u6:
            dhcp4: true
        ens3f0:
            dhcp4: false
            addresses: [10.0.1.3/24]
        ens3f1:
            dhcp4: false
            addresses: [10.1.1.3/24]
    version: 2
kube4:
nano /etc/netplan/50-cloud-init.yaml
network:
    ethernets:
        eno4:
            dhcp4: true
        enp0s20f0u1u6:
            dhcp4: true
        ens3f0:
            dhcp4: false
            addresses: [10.0.1.4/24]
        ens3f1:
            dhcp4: false
            addresses: [10.1.1.4/24]
    version: 2
Then regenerate and apply the configuration on each node
netplan generate --debug
netplan apply
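To check that the static addresses were applied, for example:
ip -br addr             # brief listing of every interface and its addresses
ping -c 3 10.0.1.2      # e.g. from kube1, confirm connectivity on the cluster network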
Update /etc/hosts on all nodes
...
10.0.1.1 kube1
10.0.1.2 kube2
10.0.1.3 kube3
10.0.1.4 kube4
10.1.1.1 kube1-ceph
10.1.1.2 kube2-ceph
10.1.1.3 kube3-ceph
10.1.1.4 kube4-ceph
...
Allow root login and password SSH authentication on all nodes
nano /etc/ssh/sshd_config
...
PermitRootLogin yes
PasswordAuthentication yes
...
systemctl restart sshd
Copy SSH keys from the main node (kube1) to the other nodes
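If root on kube1 does not yet have an SSH key pair, generate one first (a minimal sketch using the default key path; skip if a key already exists):
ssh-keygen -t rsa -b 4096 -N "" -f /root/.ssh/id_rsa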
ssh-copy-id kube2
ssh-copy-id kube3
ssh-copy-id kube4
Install and configure NTP on all nodes
apt-get install ntp
Change the default NTP server pools in the configuration (the example below uses the Singapore pools; pick pools close to your servers)
nano /etc/ntp.conf
...
pool 0.sg.pool.ntp.org iburst
pool 1.sg.pool.ntp.org iburst
pool 2.sg.pool.ntp.org iburst
pool 3.sg.pool.ntp.org iburst
...
systemctl restart ntp
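To confirm that each node is synchronizing against the configured pools:
ntpq -p      # lists the NTP peers and their reachability/offset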
Install python-minimal on all nodes (required by ceph-deploy)
apt-get install -y python-minimal
Install ceph-deploy on the main node (kube1)
apt-get install -y apt-transport-https curl
wget -q -O- 'https://download.ceph.com/keys/release.asc' | apt-key add -
echo deb https://download.ceph.com/debian-nautilus/ $(lsb_release -sc) main | tee /etc/apt/sources.list.d/ceph.list
apt update
apt install -y ceph-deploy
Create a working directory for ceph-deploy on kube1 and initialize a new cluster with kube1 as the initial monitor
mkdir my-ceph
cd my-ceph
ceph-deploy new kube1
Edit ceph.conf
nano ceph.conf
# change ip to kube1-ceph's ip
mon_host = 10.1.1.1
# and add in the following:
public network = 10.1.1.0/24
Install Ceph on all nodes, create the initial monitor, and distribute the admin keyring to every node
ceph-deploy install kube1 kube2 kube3 kube4
ceph-deploy mon create-initial
ceph-deploy admin kube1 kube2 kube3 kube4
ceph-deploy mon add kube2 --address 10.1.1.2
ceph-deploy mon add kube3 --address 10.1.1.3
ceph-deploy mon add kube4 --address 10.1.1.4
ceph-deploy mgr create kube1 kube2 kube3 kube4
ceph-deploy mds create kube1
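After the remaining monitors, the managers and the MDS are created, you can verify the cluster state from any node that has the admin keyring:
ceph -s          # overall status: monitor quorum, managers, OSDs, pools
ceph mon stat    # monitor quorum details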
Zap the OSD disks on every node (WARNING: disk zap destroys all data on the listed disks)
ceph-deploy disk zap kube1 /dev/sdc /dev/sdd /dev/sde /dev/sdf /dev/sdg /dev/sdh /dev/sdi /dev/sdj /dev/sdk /dev/sdl /dev/sdm /dev/sdn /dev/sdo /dev/sdp /dev/sdq
ceph-deploy disk zap kube2 /dev/sdc /dev/sdd /dev/sde /dev/sdf /dev/sdg /dev/sdh /dev/sdi /dev/sdj /dev/sdk /dev/sdl /dev/sdm /dev/sdn /dev/sdo /dev/sdp /dev/sdq
ceph-deploy disk zap kube3 /dev/sdc /dev/sdd /dev/sde /dev/sdf /dev/sdg /dev/sdh /dev/sdi /dev/sdj /dev/sdk /dev/sdl /dev/sdm /dev/sdn /dev/sdo /dev/sdp /dev/sdq
ceph-deploy disk zap kube4 /dev/sdc /dev/sdd /dev/sde /dev/sdf /dev/sdg /dev/sdh /dev/sdi /dev/sdj /dev/sdk /dev/sdl /dev/sdm /dev/sdn /dev/sdo /dev/sdp /dev/sdq
Create one OSD per zapped disk on each node (an equivalent loop form is shown after these commands)
ceph-deploy osd create kube1 --data /dev/sdc
ceph-deploy osd create kube1 --data /dev/sdd
ceph-deploy osd create kube1 --data /dev/sde
ceph-deploy osd create kube1 --data /dev/sdf
ceph-deploy osd create kube1 --data /dev/sdg
ceph-deploy osd create kube1 --data /dev/sdh
ceph-deploy osd create kube1 --data /dev/sdi
ceph-deploy osd create kube1 --data /dev/sdj
ceph-deploy osd create kube1 --data /dev/sdk
ceph-deploy osd create kube1 --data /dev/sdl
ceph-deploy osd create kube1 --data /dev/sdm
ceph-deploy osd create kube1 --data /dev/sdn
ceph-deploy osd create kube1 --data /dev/sdo
ceph-deploy osd create kube1 --data /dev/sdp
ceph-deploy osd create kube1 --data /dev/sdq
ceph-deploy osd create kube2 --data /dev/sdc
ceph-deploy osd create kube2 --data /dev/sdd
ceph-deploy osd create kube2 --data /dev/sde
ceph-deploy osd create kube2 --data /dev/sdf
ceph-deploy osd create kube2 --data /dev/sdg
ceph-deploy osd create kube2 --data /dev/sdh
ceph-deploy osd create kube2 --data /dev/sdi
ceph-deploy osd create kube2 --data /dev/sdj
ceph-deploy osd create kube2 --data /dev/sdk
ceph-deploy osd create kube2 --data /dev/sdl
ceph-deploy osd create kube2 --data /dev/sdm
ceph-deploy osd create kube2 --data /dev/sdn
ceph-deploy osd create kube2 --data /dev/sdo
ceph-deploy osd create kube2 --data /dev/sdp
ceph-deploy osd create kube2 --data /dev/sdq
ceph-deploy osd create kube3 --data /dev/sdc
ceph-deploy osd create kube3 --data /dev/sdd
ceph-deploy osd create kube3 --data /dev/sde
ceph-deploy osd create kube3 --data /dev/sdf
ceph-deploy osd create kube3 --data /dev/sdg
ceph-deploy osd create kube3 --data /dev/sdh
ceph-deploy osd create kube3 --data /dev/sdi
ceph-deploy osd create kube3 --data /dev/sdj
ceph-deploy osd create kube3 --data /dev/sdk
ceph-deploy osd create kube3 --data /dev/sdl
ceph-deploy osd create kube3 --data /dev/sdm
ceph-deploy osd create kube3 --data /dev/sdn
ceph-deploy osd create kube3 --data /dev/sdo
ceph-deploy osd create kube3 --data /dev/sdp
ceph-deploy osd create kube3 --data /dev/sdq
ceph-deploy osd create kube4 --data /dev/sdc
ceph-deploy osd create kube4 --data /dev/sdd
ceph-deploy osd create kube4 --data /dev/sde
ceph-deploy osd create kube4 --data /dev/sdf
ceph-deploy osd create kube4 --data /dev/sdg
ceph-deploy osd create kube4 --data /dev/sdh
ceph-deploy osd create kube4 --data /dev/sdi
ceph-deploy osd create kube4 --data /dev/sdj
ceph-deploy osd create kube4 --data /dev/sdk
ceph-deploy osd create kube4 --data /dev/sdl
ceph-deploy osd create kube4 --data /dev/sdm
ceph-deploy osd create kube4 --data /dev/sdn
ceph-deploy osd create kube4 --data /dev/sdo
ceph-deploy osd create kube4 --data /dev/sdp
ceph-deploy osd create kube4 --data /dev/sdq
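For brevity, the per-disk OSD creation above can also be written as a shell loop (equivalent, assuming bash and the same device names on every node):
for node in kube1 kube2 kube3 kube4; do
    for disk in /dev/sd{c..q}; do
        ceph-deploy osd create "$node" --data "$disk"
    done
done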
NOTE: To remove an OSD, SSH to the node hosting it, then run the following commands (replace {id} with the OSD id):
ceph osd out {id}
systemctl stop ceph-osd@{id}
ceph osd purge {id} --yes-i-really-mean-it
Set up ceph crushmap to have rules selecting hdd only and ssd only
ceph osd getcrushmap -o old_crushmap_compiled
crushtool -d old_crushmap_compiled -o old_crushmap_decompiled
nano old_crushmap_decompiled
## add in the following:
rule ssd_only {
    id 1
    type replicated
    min_size 1
    max_size 10
    step take default class ssd
    step chooseleaf firstn 0 type host
    step emit
}
rule hdd_only {
    id 2
    type replicated
    min_size 1
    max_size 10
    step take default class hdd
    step chooseleaf firstn 0 type host
    step emit
}
Compile the new crushmap then apply it
crushtool -c old_crushmap_decompiled -o new_crushmap_compiled
ceph osd setcrushmap -i new_crushmap_compiled
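To confirm that the new rules are in place:
ceph osd crush rule ls               # should now list ssd_only and hdd_only
ceph osd crush rule dump hdd_only    # inspect a single rule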
Create cephfs pools
ceph osd pool create cephfs_data 1024
ceph osd pool create cephfs_metadata 1024
Configure cephfs pools to only use hdd
ceph osd pool set cephfs_data crush_rule hdd_only
ceph osd pool set cephfs_metadata crush_rule hdd_only
Create ssd cache pool
ceph osd pool create ssd_cache_pool 512
Configure ssd cache pool to only use ssd
ceph osd pool set ssd_cache_pool crush_rule ssd_only
Set up cephfs
ceph fs new cephfs cephfs_metadata cephfs_data
Set up ceph cache tier
ceph osd tier add cephfs_data ssd_cache_pool
ceph osd tier cache-mode ssd_cache_pool writeback
ceph osd tier set-overlay cephfs_data ssd_cache_pool
Configure other options of ssd cache pool
ceph osd pool set ssd_cache_pool hit_set_type bloom
ceph osd pool set ssd_cache_pool hit_set_count 1
ceph osd pool set ssd_cache_pool hit_set_period 3600 # 1 hour
ceph osd pool set ssd_cache_pool target_max_bytes 1000000000000 # 1 TB
ceph osd pool set ssd_cache_pool min_read_recency_for_promote 1
ceph osd pool set ssd_cache_pool min_write_recency_for_promote 1
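You can verify the cache pool settings and the tier relationship with, for example:
ceph osd pool get ssd_cache_pool all               # show all settings of the cache pool
ceph osd dump | grep -E 'cephfs|ssd_cache_pool'    # crush rules and tier/overlay relationship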
The cephfs directory structure will be as such (one way to create these directories is sketched below):
/
    kube1/          # dedicated for kube1
    kube2/          # dedicated for kube2
    kube3/          # dedicated for kube3
    kube4/          # dedicated for kube4
    shares/         # shared directory for all nodes
        container-data/
        proj-files/
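These directories must exist inside CephFS before the per-directory keys below are useful. One way to create them (a sketch using the admin key on kube1 and an assumed temporary mount point):
mkdir -p /mnt/cephfs-admin
mount -t ceph 10.1.1.1:6789:/ /mnt/cephfs-admin -o name=admin,secret=$(ceph auth get-key client.admin)
mkdir -p /mnt/cephfs-admin/kube1 /mnt/cephfs-admin/kube2 /mnt/cephfs-admin/kube3 /mnt/cephfs-admin/kube4
mkdir -p /mnt/cephfs-admin/shares/container-data /mnt/cephfs-admin/shares/proj-files
umount /mnt/cephfs-admin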
Create ceph keys for each node's dedicated cephfs directory
ceph auth get-or-create client.kube1 mon 'allow r' mds 'allow rw path=/kube1' osd 'allow *' mgr 'allow *'
ceph auth get-or-create client.kube2 mon 'allow r' mds 'allow rw path=/kube2' osd 'allow *' mgr 'allow *'
ceph auth get-or-create client.kube3 mon 'allow r' mds 'allow rw path=/kube3' osd 'allow *' mgr 'allow *'
ceph auth get-or-create client.kube4 mon 'allow r' mds 'allow rw path=/kube4' osd 'allow *' mgr 'allow *'
Create ceph key for the shared cephfs directory
ceph auth get-or-create client.shares mon 'allow r' mds 'allow rw path=/shares' osd 'allow *' mgr 'allow *'
Save the Ceph keys into permission-protected files. Each file must contain only the key string itself (the value after "key =" in the ceph auth get output), because the secretfile mount option used later expects the bare secret.
kube1:
mkdir /root/secrets
chmod 700 /root/secrets
ceph auth get client.kube1
nano /root/secrets/cephfs-kube1.key
# paste client.kube1 key here
chmod 700 /root/secrets/cephfs-kube1.key
ceph auth get client.shares
nano /root/secrets/cephfs-shares.key
# paste client.shares key here
chmod 700 /root/secrets/cephfs-shares.key
kube2:
mkdir /root/secrets
chmod 700 /root/secrets
ceph auth get client.kube2
nano /root/secrets/cephfs-kube2.key
# paste client.kube2 key here
chmod 700 /root/secrets/cephfs-kube2.key
ceph auth get client.shares
nano /root/secrets/cephfs-shares.key
# paste client.shares key here
chmod 700 /root/secrets/cephfs-shares.key
kube3:
mkdir /root/secrets
chmod 700 /root/secrets
ceph auth get client.kube3
nano /root/secrets/cephfs-kube3.key
# paste client.kube3 key here
chmod 700 /root/secrets/cephfs-kube3.key
ceph auth get client.shares
nano /root/secrets/cephfs-shares.key
# paste client.shares key here
chmod 700 /root/secrets/cephfs-shares.key
kube4:
mkdir /root/secrets
chmod 700 /root/secrets
ceph auth get client.kube4
nano /root/secrets/cephfs-kube4.key
# paste client.kube4 key here
chmod 700 /root/secrets/cephfs-kube4.key
Cephfs directories will be mounted inside /mnt/cephfs
/mnt/cephfs/kube1 will use client.kube1 ceph key
/mnt/cephfs/kube2 will use client.kube2 ceph key
/mnt/cephfs/kube3 will use client.kube3 ceph key
/mnt/cephfs/kube4 will use client.kube4 ceph key
/mnt/cephfs/shares will use client.shares ceph key
Mount cephfs using /etc/fstab
kube1:
mkdir -p /mnt/cephfs/kube1 /mnt/cephfs/shares
nano /etc/fstab
10.1.1.1:6789:/kube1 /mnt/cephfs/kube1 ceph name=kube1,secretfile=/root/secrets/cephfs-kube1.key,_netdev,noatime 0 0
10.1.1.1:6789:/shares /mnt/cephfs/shares ceph name=shares,secretfile=/root/secrets/cephfs-shares.key,_netdev,noatime 0 0
kube2:
mkdir -p /mnt/cephfs/kube2 /mnt/cephfs/shares
nano /etc/fstab
10.1.1.2:6789:/kube2 /mnt/cephfs/kube2 ceph name=kube2,secretfile=/root/secrets/cephfs-kube2.key,_netdev,noatime 0 0
10.1.1.2:6789:/shares /mnt/cephfs/shares ceph name=shares,secretfile=/root/secrets/cephfs-shares.key,_netdev,noatime 0 0
kube3:
mkdir -p /mnt/cephfs/kube3 /mnt/cephfs/shares
nano /etc/fstab
10.1.1.3:6789:/kube3 /mnt/cephfs/kube3 ceph name=kube3,secretfile=/root/secrets/cephfs-kube3.key,_netdev,noatime 0 0
10.1.1.3:6789:/shares /mnt/cephfs/shares ceph name=shares,secretfile=/root/secrets/cephfs-shares.key,_netdev,noatime 0 0
kube4:
mkdir -p /mnt/cephfs/kube4
nano /etc/fstab
10.1.1.4:6789:/kube4 /mnt/cephfs/kube4 ceph name=kube4,secretfile=/root/secrets/cephfs-kube4.key,_netdev,noatime 0 0
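After editing /etc/fstab on a node, mount and verify (note that a kernel CephFS mount can also list several monitors separated by commas, e.g. 10.1.1.1,10.1.1.2,10.1.1.3:/kube1, so the mount does not depend on a single monitor):
mount -a
df -h /mnt/cephfs/*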
Create a script that runs "mount -a" in case cephfs is not mounted successfully after boot.
Cephfs may fail to mount on boot when multiple servers are rebooting and as a result ceph loses quorum.
Do the following on each node:
mkdir /root/cronjobs
chmod 700 /root/cronjobs
nano /root/cronjobs/fstab-mount-retry
# /root/cronjobs/fstab-mount-retry:
#!/bin/sh
# HOSTNAME is not set by cron's /bin/sh, so call hostname(1) directly
if ! mountpoint -q -- "/mnt/cephfs/$(hostname)"; then
    mount -a
fi
chmod 700 /root/cronjobs/fstab-mount-retry
nano /etc/cron.d/fstab-mount-retry
# /etc/cron.d/fstab-mount-retry:
SHELL=/bin/sh
PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
*/30 * * * * root /root/cronjobs/fstab-mount-retry
Install Docker on each node:
apt-get update
apt-get install -y \
apt-transport-https \
ca-certificates \
curl \
gnupg-agent \
software-properties-common
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
add-apt-repository \
"deb [arch=amd64] https://download.docker.com/linux/ubuntu \
$(lsb_release -cs) \
stable"
apt-get update
apt-get install -y docker-ce docker-ce-cli containerd.io
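Verify the installation on each node:
docker --version              # Docker client version
systemctl is-active docker    # should report "active"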
If a server's root file system has limited disk space, you can move Docker's data directory to another mounted disk.
In our case we will use /dev/sdb, which is mounted at /mnt/sdb.
The /etc/fstab entry is as such:
UUID=<UUID of /dev/sdb> /mnt/sdb ext4 defaults 0 0
If Kubernetes has already been installed, you need to stop Kubelet service first
systemctl stop kubelet
Stop Docker service
systemctl stop docker
Move the Docker data directory to the disk mounted at /mnt/sdb
mv /var/lib/docker /mnt/sdb/
Create a symbolic link at /var/lib/docker pointing to the new location of the Docker data directory
ln -s /mnt/sdb/docker /var/lib/docker
Start Docker service
systemctl start docker
If Kubernetes has already been installed, start Kubelet service
systemctl start kubelet
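To confirm that Docker is using the relocated data directory:
ls -ld /var/lib/docker                   # should be a symlink to /mnt/sdb/docker
docker info | grep "Docker Root Dir"     # Docker still addresses it as /var/lib/docker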
Allow processes on the master nodes to bind to the shared (floating) IP even when it is not currently assigned to the node
nano /etc/sysctl.conf
net.ipv4.ip_nonlocal_bind=1
Then run
sysctl -p
Install and enable Heartbeat on each master node
sudo apt-get -y install heartbeat && systemctl enable heartbeat
Create a passkey for Heartbeat authentication
echo -n <any text> | md5sum
# example output (use your own generated hash below):
ThisIsTheGeneratedKey
Then create the file /etc/ha.d/authkeys:
auth 1
1 md5 ThisIsTheGeneratedKey
Make sure that all the master nodes have a copy of this file. It must be owned by root and readable only by root
sudo chown root:root /etc/ha.d/authkeys
sudo chmod 600 /etc/ha.d/authkeys
Create the main configuration file /etc/ha.d/ha.cf for Heartbeat on all the master nodes. Change ens3f0 to the network interface that the node uses to communicate with the other master nodes.
# keepalive: how many seconds between heartbeats
#
keepalive 2
#
# deadtime: seconds-to-declare-host-dead
#
deadtime 10
#
# What UDP port to use for udp or ppp-udp communication?
#
udpport 694
mcast ens3f0 225.0.0.1 694 1 0
# What interfaces to heartbeat over?
udp ens3f0
#
# Facility to use for syslog()/logger (alternative to log/debugfile)
#
logfacility local0
#
# Tell what machines are in the cluster
# node nodename ... -- must match uname -n
node kube1
node kube2
node kube3
Lastly, create the /etc/ha.d/haresources file on all master nodes to specify the shared IP address and the preferred node to hold that IP.
kube1 10.0.1.99
Restart Heartbeat services on all master nodes
systemctl restart heartbeat
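The shared IP should now be held by the preferred node (kube1):
ip addr show | grep 10.0.1.99    # run on kube1; the address fails over to another master if kube1 goes down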
The Kubernetes cluster that we are setting up has 3 master nodes and 1 worker node. All 3 master nodes will also be running workloads.
Install Kubernetes on all nodes
apt-get update && apt-get install -y apt-transport-https curl
curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
cat <<EOF >/etc/apt/sources.list.d/kubernetes.list
deb https://apt.kubernetes.io/ kubernetes-xenial main
EOF
apt-get update
apt-get install -y kubelet kubeadm kubectl
apt-mark hold kubelet kubeadm kubectl
On one master node:
kubeadm init --control-plane-endpoint "10.0.1.99:6443" --upload-certs --pod-network-cidr=192.168.0.0/16
The above command will output two join commands: one for joining additional master (control-plane) nodes and another for joining worker nodes. Run them on the remaining nodes accordingly.
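If the join commands are lost, they can be regenerated later on the first master, for example:
kubeadm token create --print-join-command         # prints a fresh worker join command
kubeadm init phase upload-certs --upload-certs    # prints a new certificate key; append --control-plane --certificate-key <key> to the join command for master nodes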
On each master node:
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
On one master node, apply Calico as the networking implementation for the k8s cluster. Change the version accordingly.
kubectl apply -f https://docs.projectcalico.org/v3.8/manifests/calico.yaml
Then remove taint from all master nodes so that they can host regular workloads:
kubectl taint nodes --all node-role.kubernetes.io/master-
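Verify that the cluster is healthy:
kubectl get nodes -o wide          # all 4 nodes should eventually report Ready
kubectl get pods -n kube-system    # Calico and the core components should be Running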
You can choose between Virtlet or KubeVirt to deploy VMs on Kubernetes.
The recommended option is Virtlet, as Virtlet lets you manage your VMs as if they were pods running in Kubernetes.
This means that StatefulSets, Deployments, horizontal pod autoscaling, etc. apply to your VMs as well.
Install CRI Proxy on all nodes.
Download the CRI Proxy package. Change the version accordingly.
wget https://github.com/Mirantis/criproxy/releases/download/v0.14.0/criproxy_0.14.0_amd64.deb
Install the package and select dockershim when prompted.
dpkg -i criproxy_0.14.0_amd64.deb
Label all your nodes with extraRuntime=virtlet
kubectl label node kube1 extraRuntime=virtlet
kubectl label node kube2 extraRuntime=virtlet
kubectl label node kube3 extraRuntime=virtlet
kubectl label node kube4 extraRuntime=virtlet
Create the image translation ConfigMap. An example config from the Virtlet repository is used here.
curl https://raw.githubusercontent.com/Mirantis/virtlet/master/deploy/images.yaml >images.yaml
kubectl create configmap -n kube-system virtlet-image-translations --from-file images.yaml
Create virtlet config to disable KVM
kubectl create configmap -n kube-system virtlet-config --from-literal=disable_kvm=y
Download the Virtlet command line tool. Change the version accordingly.
curl -SL -o virtletctl https://github.com/Mirantis/virtlet/releases/download/v1.5.1/virtletctl
chmod +x virtletctl
Deploy Virtlet DaemonSet
./virtletctl gen | kubectl apply -f -
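Check that the Virtlet DaemonSet pods come up on the labelled nodes:
kubectl get pods -n kube-system | grep virtlet    # expect one virtlet pod per labelled node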
Alternatively, to use KubeVirt instead of Virtlet, first determine the latest KubeVirt release version:
export KUBEVIRT_VERSION=$(curl -s https://api.github.com/repos/kubevirt/kubevirt/releases|grep tag_name|sort -V | tail -1 | awk -F':' '{print $2}' | sed 's/,//' | xargs | cut -d'-' -f1)
wget https://github.com/kubevirt/kubevirt/releases/download/${KUBEVIRT_VERSION}/kubevirt-operator.yaml
kubectl apply -f kubevirt-operator.yaml
wget https://github.com/kubevirt/kubevirt/releases/download/${KUBEVIRT_VERSION}/kubevirt-cr.yaml
kubectl apply -f kubevirt-cr.yaml
curl -L -o virtctl https://github.com/kubevirt/kubevirt/releases/download/${KUBEVIRT_VERSION}/virtctl-${KUBEVIRT_VERSION}-linux-amd64
chmod +x virtctl
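Check that the KubeVirt components are deployed:
kubectl get pods -n kubevirt    # virt-operator, virt-api, virt-controller and virt-handler should be Running
./virtctl version               # client and server versions of the KubeVirt CLI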