From 433cd65d08cbfd44e906ee97623a898273b6ef06 Mon Sep 17 00:00:00 2001 From: Giuseppe Abrami Date: Sun, 26 Nov 2023 17:31:50 +0100 Subject: [PATCH] Remove Debug Test-Classes --- instructions/kubernetes/README.md | 126 ++++++++++++++++++ .../kubernetes/gpu/device_plugin_install.sh | 11 ++ instructions/kubernetes/gpu/enable_gpu.sh | 25 ++++ instructions/kubernetes/gpu/helm_install.sh | 7 + .../gpu/nvidia_container_toolkit_install.sh | 11 ++ instructions/kubernetes/init_master_node.sh | 35 +++++ instructions/kubernetes/init_worker_node.sh | 26 ++++ .../kubernetes/kubeadm_join_command.sh | 3 + instructions/kubernetes/perma_disable_swap.sh | 5 + instructions/kubernetes/reset_kubeadm.sh | 7 + instructions/kubernetes/setup_networking.sh | 28 ++++ 11 files changed, 284 insertions(+) create mode 100644 instructions/kubernetes/README.md create mode 100644 instructions/kubernetes/gpu/device_plugin_install.sh create mode 100644 instructions/kubernetes/gpu/enable_gpu.sh create mode 100644 instructions/kubernetes/gpu/helm_install.sh create mode 100644 instructions/kubernetes/gpu/nvidia_container_toolkit_install.sh create mode 100644 instructions/kubernetes/init_master_node.sh create mode 100644 instructions/kubernetes/init_worker_node.sh create mode 100644 instructions/kubernetes/kubeadm_join_command.sh create mode 100644 instructions/kubernetes/perma_disable_swap.sh create mode 100644 instructions/kubernetes/reset_kubeadm.sh create mode 100644 instructions/kubernetes/setup_networking.sh diff --git a/instructions/kubernetes/README.md b/instructions/kubernetes/README.md new file mode 100644 index 00000000..d7256f1c --- /dev/null +++ b/instructions/kubernetes/README.md @@ -0,0 +1,126 @@ +# Simple Kubernetes Install Scripts +These shell-scripts were made and used for an easier installation of a Kubernetes cluster on a network of Ubuntu 20.04 systems. I recommend to start on a fresh installation of Ubuntu 20.04. 
+ +# TLDR +- **Create cluster**: Run `init_master_node.sh` to create the cluster and make the system a master node. +- **Add worker node**: Run `init_worker_node.sh` to prepare the system to be added as a worker node. Run `kubeadm_join_command.sh` on the master node and execute its output on the future worker node. +- **GPU capabilities**: Run `enable_gpu.sh` on a worker node with an NVIDIA GPU to make its GPU usable by the cluster. Change files depending on the chosen container runtime; more on this in the section [Configure Docker or Containerd](#configure-docker-or-containerd-for-gpu). + + + +# Basic Cluster setup +## init_master_node +> Run on a system to initialize a cluster and make it a master node. + +Installs Kubernetes packages and makes the system a master node (control plane). Works only for the first master node. For adding consecutive master nodes to the same cluster, the procedure is different. +Installs calico as the pod networking plugin. +Permanently disables swap on the system. + +## init_worker_node +> Run on a system that will be a worker node. + +Installs Kubernetes packages so the node is ready to be added to the cluster as a worker node. To add the system as a worker node run `kubeadm token create --print-join-command` on the master node and execute the output on the system to be added as a worker node. +On the master node, check if the node has been successfully added by running `kubectl get nodes`. + +## kubeadm_join_command +> Run the output on worker nodes to add them to the cluster. + +When executed on the initial master node, prints the command that needs to be executed on a system for it to be added to the cluster as a worker node. + +## reset_kubeadm +Effectively removes a node from the cluster. Runs the `kubeadm reset` command and does some further cleanup. + +## perma_disable_swap +Permanently disables swap by commenting out every line in the `/etc/fstab` file containing the word +" swap ". 
Not actually used as this functionality is included in the other scripts that need it. + +## setup_networking +Makes changes to networking settings. Not used in the cluster setup. Left in just in case for future use. + + +# GPU +## enable_gpu +> Execute on a node for it to be able to run GPU workloads. + +After running this script, also run `kubectl describe node <node-name>` and look for the "Allocatable" section. If it lists `nvidia.com/gpu` with a value of "1" or more, like this: +``` +Allocatable: + cpu: 12 + ephemeral-storage: 423797574979 + hugepages-1Gi: 0 + hugepages-2Mi: 0 + memory: 32701280Ki + nvidia.com/gpu: 1 +``` +then the installation was successful. Bear in mind that even after correctly installing all the necessary packages and plugins it can take some time until Kubernetes recognizes the GPU on a worker node. +This script has the combined functionality of all of the following GPU scripts. + +### Configure Docker or Containerd for GPU +Before the device plugin can function, changes must be made to some files depending on the chosen container runtime on a node. In this installation process we are using containerd, but if you are unsure, run `kubectl get nodes -o wide` to list all nodes and check their configured container runtime. 
+ + +**For containerd as a runtime (used in this whole installation process):** +Create or modify the file `/etc/containerd/config.toml` to contain the following: +``` +version = 2 +[plugins] + [plugins."io.containerd.grpc.v1.cri"] + [plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "nvidia" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] + privileged_without_host_devices = false + runtime_engine = "" + runtime_root = "" + runtime_type = "io.containerd.runc.v2" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] + BinaryName = "/usr/bin/nvidia-container-runtime" +``` +Afterwards run `sudo nvidia-ctk runtime configure --runtime=containerd` and restart the containerd service using `systemctl restart containerd`. + +--- + +**For docker as a runtime:** +Create or modify the file `/etc/docker/daemon.json` to contain the following: +``` +{ + "default-runtime": "nvidia", + "runtimes": { + "nvidia": { + "path": "/usr/bin/nvidia-container-runtime", + "runtimeArgs": [] + } + } +} +``` +Afterwards run `sudo nvidia-ctk runtime configure --runtime=docker` and restart the docker service using `systemctl restart docker`. + + +## helm_install +Installs helm, a package manager for Kubernetes. Helm is used to install the NVIDIA device plugin for Kubernetes. Run `helm version --short` to check the installation. + +## nvidia_container_toolkit_install +Installs the nvidia-container-toolkit, needed by the NVIDIA device plugin for Kubernetes. Run `nvidia-ctk --version` to check the installation. + +## device_plugin_install +Installs the NVIDIA device plugin for Kubernetes using helm. Run `helm list -A` to check if the plugin was installed successfully. + + +# Possible errors +If there is a problem with a node, first try restarting the `docker`, `containerd` and `kubelet` services on that node by running `sudo systemctl restart docker containerd kubelet`. 
+ +## Container runtime is not running +``` +root@kubemaster:~$ sudo kubeadm init + [ERROR CRI]: container runtime is not running: output: time="2022-05-20T02:06:28Z"level=fatal msg="getting status of runtime: rpc error: code = Unimplemented desc = unknown service runtime.v1alpha2.RuntimeService" +``` +If this error comes up while initializing the master node or adding a worker to the cluster, try restarting the `docker` and `containerd` services by running `systemctl restart docker containerd`. If this does not solve the problem, delete the file `/etc/containerd/config.toml`, restart containerd and try again. + +## kubectl: Connection refused +The error "Connection refused" when running a kubectl command often means that swap is not turned off. +Turn off swap temporarily by running `sudo swapoff -a` or run the `perma_disable_swap.sh` script. +Some PCs turn swap back on after reboot, even after "permanently" turning it off by changing the `/etc/fstab` file. + +## Node does not have status READY +If a node does not change its status to READY after a while (typically max. 10 minutes), you can restart the services needed by Kubernetes by running `sudo systemctl restart docker containerd kubelet`. If that does not help, restart the system. If that also fails, look at the events in the node by running `kubectl describe node <node-name>` and try to solve the problem from there. diff --git a/instructions/kubernetes/gpu/device_plugin_install.sh b/instructions/kubernetes/gpu/device_plugin_install.sh new file mode 100644 index 00000000..ea094159 --- /dev/null +++ b/instructions/kubernetes/gpu/device_plugin_install.sh @@ -0,0 +1,11 @@ +#!
/usr/bin/bash +# Taken from https://github.com/NVIDIA/k8s-device-plugin#deployment-via-helm + +# Install NVIDIA device plugin for Kubernetes +helm repo add nvdp https://nvidia.github.io/k8s-device-plugin +helm repo update +helm upgrade -i nvdp nvdp/nvidia-device-plugin \ + --namespace nvidia-device-plugin \ + --create-namespace \ + --version 0.14.1 + diff --git a/instructions/kubernetes/gpu/enable_gpu.sh b/instructions/kubernetes/gpu/enable_gpu.sh new file mode 100644 index 00000000..8ee19460 --- /dev/null +++ b/instructions/kubernetes/gpu/enable_gpu.sh @@ -0,0 +1,25 @@ +#! /usr/bin/bash +# Author: Filip Fitzermann + +# Install nvidia-container-toolkit. Taken from https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \ + && \ + sudo apt-get update +sudo apt-get install -y nvidia-container-toolkit + +# Install Helm. Taken from https://helm.sh/docs/intro/install/ +curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 +chmod 700 get_helm.sh +./get_helm.sh +rm get_helm.sh + +# Install NVIDIA device plugin for Kubernetes. 
Taken from https://github.com/NVIDIA/k8s-device-plugin#deployment-via-helm +helm repo add nvdp https://nvidia.github.io/k8s-device-plugin +helm repo update +helm upgrade -i nvdp nvdp/nvidia-device-plugin \ + --namespace nvidia-device-plugin \ + --create-namespace \ + --version 0.14.1 diff --git a/instructions/kubernetes/gpu/helm_install.sh b/instructions/kubernetes/gpu/helm_install.sh new file mode 100644 index 00000000..109e82c8 --- /dev/null +++ b/instructions/kubernetes/gpu/helm_install.sh @@ -0,0 +1,7 @@ +#! /usr/bin/bash +# Author: taken from https://helm.sh/docs/intro/install/ + +# Install helm, a package manager for Kubernetes +curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 +chmod 700 get_helm.sh +./get_helm.sh diff --git a/instructions/kubernetes/gpu/nvidia_container_toolkit_install.sh b/instructions/kubernetes/gpu/nvidia_container_toolkit_install.sh new file mode 100644 index 00000000..a7100fc7 --- /dev/null +++ b/instructions/kubernetes/gpu/nvidia_container_toolkit_install.sh @@ -0,0 +1,11 @@ +#! /usr/bin/bash +# Author: taken from https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt + +# Install nvidia-container-toolkit +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \ + && \ + sudo apt-get update +sudo apt-get install -y nvidia-container-toolkit diff --git a/instructions/kubernetes/init_master_node.sh b/instructions/kubernetes/init_master_node.sh new file mode 100644 index 00000000..2e656c9f --- /dev/null +++ b/instructions/kubernetes/init_master_node.sh @@ -0,0 +1,35 @@ +#! 
/usr/bin/bash +# Author: Markos Genios, Filip Fitzermann +# Assumes running on Ubuntu 20.04 + +# apt update +sudo apt update -y +sudo apt upgrade -y + +# Permanently disable swap. /etc/fstab +sudo swapoff -a +sudo sed -i '/ swap / s/^\(.*\)$/#\1/g' /etc/fstab + +# Install Docker +sudo apt install -y docker.io +sudo systemctl enable --now docker + +# Install auxiliary packages +sudo apt install -y apt-transport-https ca-certificates curl +curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - +echo "deb https://apt.kubernetes.io/ kubernetes-xenial main" | sudo tee /etc/apt/sources.list.d/kubernetes.list +sudo apt update + +# Install kubelet, kubeadm and kubectl and prevent them from being updated in the future +sudo apt install -y kubelet=1.27.3-00 kubeadm=1.27.3-00 kubectl=1.27.3-00 +sudo apt-mark hold kubelet kubeadm kubectl + +# Initialize control-plane also setting up the pod-network-cidr necessary for calico (pod network provider) +sudo kubeadm init --pod-network-cidr=192.168.0.0/16 + +# Follow instructions given by successful "kubeadm init" +mkdir -p $HOME/.kube +sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config +sudo chown $(id -u):$(id -g) $HOME/.kube/config +curl -LO https://github.com/projectcalico/calico/raw/master/manifests/calico.yaml +kubectl apply -f calico.yaml diff --git a/instructions/kubernetes/init_worker_node.sh b/instructions/kubernetes/init_worker_node.sh new file mode 100644 index 00000000..9d17128c --- /dev/null +++ b/instructions/kubernetes/init_worker_node.sh @@ -0,0 +1,26 @@ +#! 
/usr/bin/bash +# Author: Markos Genios, Filip Fitzermann +# Assumes running on Ubuntu 20.04 + + +# apt update +sudo apt update -y +sudo apt upgrade -y + +# Permanently disable swap +sudo swapoff -a +sudo sed -i '/ swap / s/^\(.*\)$/#\1/g' /etc/fstab + +# Install Docker +sudo apt install -y docker.io +sudo systemctl enable --now docker + +# Install auxiliary packages +sudo apt install -y apt-transport-https ca-certificates curl +curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - +echo "deb https://apt.kubernetes.io/ kubernetes-xenial main" | sudo tee /etc/apt/sources.list.d/kubernetes.list +sudo apt update + +# Install kubelet, kubeadm and kubectl and prevent them from being updated in the future +sudo apt install -y kubelet=1.27.3-00 kubeadm=1.27.3-00 kubectl=1.27.3-00 +sudo apt-mark hold kubelet kubeadm kubectl diff --git a/instructions/kubernetes/kubeadm_join_command.sh b/instructions/kubernetes/kubeadm_join_command.sh new file mode 100644 index 00000000..6f1ff377 --- /dev/null +++ b/instructions/kubernetes/kubeadm_join_command.sh @@ -0,0 +1,3 @@ +#! /usr/bin/bash + +kubeadm token create --print-join-command diff --git a/instructions/kubernetes/perma_disable_swap.sh b/instructions/kubernetes/perma_disable_swap.sh new file mode 100644 index 00000000..13f0ca87 --- /dev/null +++ b/instructions/kubernetes/perma_disable_swap.sh @@ -0,0 +1,5 @@ +#! /usr/bin/bash +# Author: Filip Fitzermann + +sudo swapoff -a +sudo sed -i '/ swap / s/^\(.*\)$/#\1/g' /etc/fstab diff --git a/instructions/kubernetes/reset_kubeadm.sh b/instructions/kubernetes/reset_kubeadm.sh new file mode 100644 index 00000000..228b311e --- /dev/null +++ b/instructions/kubernetes/reset_kubeadm.sh @@ -0,0 +1,7 @@ + #! 
/usr/bin/bash + # Author: Filip Fitzermann + + # Does not reset iptables + sudo kubeadm reset + sudo rm -r /etc/cni/net.d + sudo rm -r $HOME/.kube diff --git a/instructions/kubernetes/setup_networking.sh b/instructions/kubernetes/setup_networking.sh new file mode 100644 index 00000000..8775561c --- /dev/null +++ b/instructions/kubernetes/setup_networking.sh @@ -0,0 +1,28 @@ +#! /usr/bin/bash +# Author: Filip Fitzermann + +cat <