From 433cd65d08cbfd44e906ee97623a898273b6ef06 Mon Sep 17 00:00:00 2001
From: Giuseppe Abrami <abrami@em.uni-frankfurt.de>
Date: Sun, 26 Nov 2023 17:31:50 +0100
Subject: [PATCH] Remove Debug Test-Classes

---
 instructions/kubernetes/README.md             | 126 ++++++++++++++++++
 .../kubernetes/gpu/device_plugin_install.sh   |  11 ++
 instructions/kubernetes/gpu/enable_gpu.sh     |  25 ++++
 instructions/kubernetes/gpu/helm_install.sh   |   7 +
 .../gpu/nvidia_container_toolkit_install.sh   |  11 ++
 instructions/kubernetes/init_master_node.sh   |  35 +++++
 instructions/kubernetes/init_worker_node.sh   |  26 ++++
 .../kubernetes/kubeadm_join_command.sh        |   3 +
 instructions/kubernetes/perma_disable_swap.sh |   5 +
 instructions/kubernetes/reset_kubeadm.sh      |   7 +
 instructions/kubernetes/setup_networking.sh   |  28 ++++
 11 files changed, 284 insertions(+)
 create mode 100644 instructions/kubernetes/README.md
 create mode 100644 instructions/kubernetes/gpu/device_plugin_install.sh
 create mode 100644 instructions/kubernetes/gpu/enable_gpu.sh
 create mode 100644 instructions/kubernetes/gpu/helm_install.sh
 create mode 100644 instructions/kubernetes/gpu/nvidia_container_toolkit_install.sh
 create mode 100644 instructions/kubernetes/init_master_node.sh
 create mode 100644 instructions/kubernetes/init_worker_node.sh
 create mode 100644 instructions/kubernetes/kubeadm_join_command.sh
 create mode 100644 instructions/kubernetes/perma_disable_swap.sh
 create mode 100644 instructions/kubernetes/reset_kubeadm.sh
 create mode 100644 instructions/kubernetes/setup_networking.sh

diff --git a/instructions/kubernetes/README.md b/instructions/kubernetes/README.md
new file mode 100644
index 00000000..d7256f1c
--- /dev/null
+++ b/instructions/kubernetes/README.md
@@ -0,0 +1,126 @@
+# Simple Kubernetes Install Scripts
+These shell scripts were made and used for an easier installation of a Kubernetes cluster on a network of Ubuntu 20.04 systems. I recommend starting from a fresh installation of Ubuntu 20.04.
+
+# TLDR
+- **Create cluster**: Run `init_master_node.sh` to create the cluster and make the system a master node.
+- **Add worker node**: Run `init_worker_node.sh` to prepare the system to be added as a worker node. Then run `kubeadm_join_command.sh` on the master node and execute its output on the system that is to become a worker node.
+- **GPU capabilities**: Run `enable_gpu.sh` on a worker node with an NVIDIA GPU to make its GPU usable by the cluster. Change files depending on the chosen container runtime; more on this in the section [Configure Docker or Containerd](#configure-docker-or-containerd-for-gpu).
+
+
+
+# Basic Cluster setup
+## init_master_node
+> Run on a system to initialize a cluster and make it a master node.
+
+Installs Kubernetes packages and makes the system a master node (control plane). Works only for the first master node. For adding consecutive master nodes to the same cluster, the procedure is different.
+Installs calico as the pod networking plugin.
+Permanently disables swap on the system.
+
+## init_worker_node
+> Run on a system that will be a worker node.
+
+Installs Kubernetes packages so the node is ready to be added to the cluster as a worker node. To add the system as a worker node run `kubeadm token create --print-join-command` on the master node and execute the output on the system to be added as a worker node.
+On the master node (where `kubectl` is configured), check if the node has been successfully added by running `kubectl get nodes`.
+
+## kubeadm_join_command
+> Run output on worker nodes to add them to the cluster.
+
+When executed on the initial master node, prints the command that needs to be executed on a system for it to be added to the cluster as a worker node.
+
+## reset_kubeadm
+Effectively removes a node from the cluster. Runs the `kubeadm reset` command and does some further cleanup.
+
+## perma_disable_swap
+Permanently disables swap by commenting out every line in the `/etc/fstab` file containing the word
+" swap ". Not actually used as this functionality is included in the other scripts that need it.
+
+## setup_networking
+Makes changes to networking settings. Not used in the cluster setup. Left in just in case for future use.
+
+
+# GPU
+## enable_gpu
+> Execute on a node for it to be able to run GPU workloads.
+
+After running this script, also run `kubectl describe node <gpu-node-name>` and look for the "Allocatable" section. If it lists `nvidia.com/gpu` with a value of "1" or more, like this:
+```
+Allocatable:
+  cpu:                12
+  ephemeral-storage:  423797574979
+  hugepages-1Gi:      0
+  hugepages-2Mi:      0
+  memory:             32701280Ki
+  nvidia.com/gpu:     1
+```
+then the installation was successful. Bear in mind that even after correctly installing all the necessary packages and plugins it can take some time until Kubernetes recognizes the GPU on a worker node.
+This script has the combined functionality of all of the following gpu scripts.
+
+### Configure Docker or Containerd for GPU
+Before the device plugin can function, changes must be made to some files depending on the container runtime chosen on a node. In this installation process we are using containerd, but if you are unsure, run `kubectl get nodes -o wide` to list all nodes and check their configured container runtime.
+
+
+**For containerd as a runtime (used in this whole installation process):**
+Create or modify the file `/etc/containerd/config.toml` to contain the following:
+```
+version = 2
+[plugins]
+  [plugins."io.containerd.grpc.v1.cri"]
+    [plugins."io.containerd.grpc.v1.cri".containerd]
+      default_runtime_name = "nvidia"
+
+      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
+        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
+          privileged_without_host_devices = false
+          runtime_engine = ""
+          runtime_root = ""
+          runtime_type = "io.containerd.runc.v2"
+          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
+            BinaryName = "/usr/bin/nvidia-container-runtime"
+```
+Afterwards run `sudo nvidia-ctk runtime configure --runtime=containerd` and restart the containerd service using `systemctl restart containerd`.
+
+---
+
+**For docker as a runtime:**
+Create or modify the file `/etc/docker/daemon.json` to contain the following:
+```
+{
+    "default-runtime": "nvidia",
+    "runtimes": {
+        "nvidia": {
+            "path": "/usr/bin/nvidia-container-runtime",
+            "runtimeArgs": []
+        }
+    }
+}
+```
+Afterwards run `sudo nvidia-ctk runtime configure --runtime=docker` and restart the docker service using `systemctl restart docker`.
+
+
+## helm_install
+Installs helm, a package manager for Kubernetes. Helm is used to install the NVIDIA device plugin for Kubernetes. Run `helm version --short` to check the installation.
+
+## nvidia_container_toolkit_install
+Installs the nvidia-container-toolkit, needed by the NVIDIA device plugin for Kubernetes. Run `nvidia-ctk --version` to check the installation.
+
+## device_plugin_install
+Installs the NVIDIA device plugin for Kubernetes using helm. Run `helm list -A` to check if the plugin was installed successfully.
+
+
+# Possible errors
+If there is a problem with a node, first try restarting the `docker`, `containerd` and `kubelet` services on that node by running `sudo systemctl restart docker containerd kubelet`.
+
+## Container runtime is not running
+```
+root@kubemaster:~$ sudo kubeadm init
+    [ERROR CRI]: container runtime is not running: output: time="2022-05-20T02:06:28Z"level=fatal msg="getting status of runtime: rpc error: code = Unimplemented desc = unknown service runtime.v1alpha2.RuntimeService"
+```
+If this error comes up while initializing the master node or adding a worker to the cluster, try restarting the `docker` and `containerd` services by running `systemctl restart docker containerd`. If this does not solve the problem, delete the file `/etc/containerd/config.toml`, restart containerd and try again.
+
+## kubectl: Connection refused
+The error "Connection refused" when running a kubectl command often means that swap is not turned off.
+Turn off swap temporarily by running `sudo swapoff -a` or run the `perma_disable_swap.sh` script.
+Some PCs turn swap back on after reboot, even after "permanently" turning it off by changing the `/etc/fstab` file.
+
+## Node does not have status READY
+If a node does not change its status to READY after a while (typically 10 minutes at most), you can restart the services needed by Kubernetes by running `sudo systemctl restart docker containerd kubelet`. If that does not help, restart the system. If that also fails, look at the events of the node by running `kubectl describe node <node-name>` and try to solve the problem from there.
diff --git a/instructions/kubernetes/gpu/device_plugin_install.sh b/instructions/kubernetes/gpu/device_plugin_install.sh
new file mode 100644
index 00000000..ea094159
--- /dev/null
+++ b/instructions/kubernetes/gpu/device_plugin_install.sh
@@ -0,0 +1,11 @@
+#! /usr/bin/bash
+# Source: https://github.com/NVIDIA/k8s-device-plugin#deployment-via-helm
+
+# Register the NVIDIA device plugin chart repository under the name "nvdp" and refresh the chart index
+helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
+helm repo update
+# Install (or upgrade) release "nvdp" at the pinned chart version, creating its namespace if needed
+helm upgrade --install nvdp nvdp/nvidia-device-plugin \
+  --version 0.14.1 \
+  --namespace nvidia-device-plugin --create-namespace
+
diff --git a/instructions/kubernetes/gpu/enable_gpu.sh b/instructions/kubernetes/gpu/enable_gpu.sh
new file mode 100644
index 00000000..8ee19460
--- /dev/null
+++ b/instructions/kubernetes/gpu/enable_gpu.sh
@@ -0,0 +1,25 @@
+#! /usr/bin/bash
+# Author: Filip Fitzermann
+
+# Step 1: nvidia-container-toolkit (key + apt source + install). Source: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
+  | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
+  && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
+  | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
+  | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \
+  && sudo apt-get update
+sudo apt-get install -y nvidia-container-toolkit
+
+# Step 2: Helm, via the official installer script, which is deleted again afterwards. Source: https://helm.sh/docs/intro/install/
+curl --fail --silent --show-error --location --output get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
+chmod 700 get_helm.sh
+./get_helm.sh
+rm get_helm.sh
+
+# Step 3: NVIDIA device plugin for Kubernetes, pinned chart version. Source: https://github.com/NVIDIA/k8s-device-plugin#deployment-via-helm
+helm repo add nvdp https://nvidia.github.io/k8s-device-plugin
+helm repo update
+helm upgrade --install nvdp nvdp/nvidia-device-plugin \
+  --version 0.14.1 \
+  --namespace nvidia-device-plugin \
+  --create-namespace
diff --git a/instructions/kubernetes/gpu/helm_install.sh b/instructions/kubernetes/gpu/helm_install.sh
new file mode 100644
index 00000000..109e82c8
--- /dev/null
+++ b/instructions/kubernetes/gpu/helm_install.sh
@@ -0,0 +1,7 @@
+#! /usr/bin/bash
+# Author: taken from https://helm.sh/docs/intro/install/
+
+# Install helm, a package manager for Kubernetes; the downloaded installer is removed again and its exit status is returned
+curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
+chmod 700 get_helm.sh
+./get_helm.sh; status=$?; rm -f get_helm.sh; exit "$status"
diff --git a/instructions/kubernetes/gpu/nvidia_container_toolkit_install.sh b/instructions/kubernetes/gpu/nvidia_container_toolkit_install.sh
new file mode 100644
index 00000000..a7100fc7
--- /dev/null
+++ b/instructions/kubernetes/gpu/nvidia_container_toolkit_install.sh
@@ -0,0 +1,11 @@
+#! /usr/bin/bash
+# Source: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt
+
+# Store NVIDIA's signing key, add the apt source bound to that key, refresh the index, then install the toolkit
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
+  | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
+  && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
+  | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
+  | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \
+  && sudo apt-get update
+sudo apt-get install -y nvidia-container-toolkit
diff --git a/instructions/kubernetes/init_master_node.sh b/instructions/kubernetes/init_master_node.sh
new file mode 100644
index 00000000..2e656c9f
--- /dev/null
+++ b/instructions/kubernetes/init_master_node.sh
@@ -0,0 +1,35 @@
+#! /usr/bin/bash
+# Author: Markos Genios, Filip Fitzermann
+# Assumes running on Ubuntu 20.04
+
+# Refresh the package index and upgrade all installed packages
+sudo apt update -y
+sudo apt upgrade -y
+
+# Disable swap now, and permanently by commenting out active swap entries (space- or tab-separated) in /etc/fstab
+sudo swapoff -a
+sudo sed -i '/^[^#].*[[:space:]]swap[[:space:]]/ s/^/#/' /etc/fstab
+
+# Install Docker
+sudo apt install -y docker.io
+sudo systemctl enable --now docker
+
+# Add the Kubernetes apt repository (pkgs.k8s.io; the legacy apt.kubernetes.io repository is deprecated and frozen)
+sudo apt install -y apt-transport-https ca-certificates curl gpg
+curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.27/deb/Release.key | sudo gpg --yes --dearmor -o /usr/share/keyrings/kubernetes-apt-keyring.gpg
+echo "deb [signed-by=/usr/share/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.27/deb/ /" | sudo tee /etc/apt/sources.list.d/kubernetes.list
+sudo apt update
+
+# Install kubelet, kubeadm and kubectl (any package revision of 1.27.3) and prevent them from being updated in the future
+sudo apt install -y kubelet='1.27.3-*' kubeadm='1.27.3-*' kubectl='1.27.3-*'
+sudo apt-mark hold kubelet kubeadm kubectl
+
+# Initialize control-plane also setting up the pod-network-cidr necessary for calico (pod network provider)
+sudo kubeadm init --pod-network-cidr=192.168.0.0/16
+
+# Follow instructions given by successful "kubeadm init": give the current user a kubeconfig, then install calico
+mkdir -p "$HOME/.kube"
+sudo cp -i /etc/kubernetes/admin.conf "$HOME/.kube/config"
+sudo chown "$(id -u):$(id -g)" "$HOME/.kube/config"
+curl -fLO https://github.com/projectcalico/calico/raw/master/manifests/calico.yaml
+kubectl apply -f calico.yaml
diff --git a/instructions/kubernetes/init_worker_node.sh b/instructions/kubernetes/init_worker_node.sh
new file mode 100644
index 00000000..9d17128c
--- /dev/null
+++ b/instructions/kubernetes/init_worker_node.sh
@@ -0,0 +1,26 @@
+#! /usr/bin/bash
+# Author: Markos Genios, Filip Fitzermann
+# Assumes running on Ubuntu 20.04
+
+
+# Refresh the package index and upgrade all installed packages
+sudo apt update -y
+sudo apt upgrade -y
+
+# Disable swap now, and permanently by commenting out active swap entries (space- or tab-separated) in /etc/fstab
+sudo swapoff -a
+sudo sed -i '/^[^#].*[[:space:]]swap[[:space:]]/ s/^/#/' /etc/fstab
+
+# Install Docker
+sudo apt install -y docker.io
+sudo systemctl enable --now docker
+
+# Add the Kubernetes apt repository (pkgs.k8s.io; the legacy apt.kubernetes.io repository is deprecated and frozen)
+sudo apt install -y apt-transport-https ca-certificates curl gpg
+curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.27/deb/Release.key | sudo gpg --yes --dearmor -o /usr/share/keyrings/kubernetes-apt-keyring.gpg
+echo "deb [signed-by=/usr/share/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.27/deb/ /" | sudo tee /etc/apt/sources.list.d/kubernetes.list
+sudo apt update
+
+# Install kubelet, kubeadm and kubectl (any package revision of 1.27.3) and prevent them from being updated in the future
+sudo apt install -y kubelet='1.27.3-*' kubeadm='1.27.3-*' kubectl='1.27.3-*'
+sudo apt-mark hold kubelet kubeadm kubectl
diff --git a/instructions/kubernetes/kubeadm_join_command.sh b/instructions/kubernetes/kubeadm_join_command.sh
new file mode 100644
index 00000000..6f1ff377
--- /dev/null
+++ b/instructions/kubernetes/kubeadm_join_command.sh
@@ -0,0 +1,3 @@
+#! /usr/bin/bash
+# Creates a new token and prints the "kubeadm join" command to execute on a system that should join the cluster.
+kubeadm token create --print-join-command
diff --git a/instructions/kubernetes/perma_disable_swap.sh b/instructions/kubernetes/perma_disable_swap.sh
new file mode 100644
index 00000000..13f0ca87
--- /dev/null
+++ b/instructions/kubernetes/perma_disable_swap.sh
@@ -0,0 +1,5 @@
+#! /usr/bin/bash
+# Author: Filip Fitzermann
+# Turns swap off now and comments out every active (not yet commented) swap entry in /etc/fstab, space- or tab-separated.
+sudo swapoff -a
+sudo sed -i '/^[^#].*[[:space:]]swap[[:space:]]/ s/^/#/' /etc/fstab
diff --git a/instructions/kubernetes/reset_kubeadm.sh b/instructions/kubernetes/reset_kubeadm.sh
new file mode 100644
index 00000000..228b311e
--- /dev/null
+++ b/instructions/kubernetes/reset_kubeadm.sh
@@ -0,0 +1,7 @@
+#! /usr/bin/bash
+# Author: Filip Fitzermann
+
+# Runs "kubeadm reset", then deletes the CNI configuration and the current user's kubeconfig directory. Does not reset iptables
+sudo kubeadm reset
+sudo rm -r /etc/cni/net.d
+sudo rm -r "$HOME/.kube"
diff --git a/instructions/kubernetes/setup_networking.sh b/instructions/kubernetes/setup_networking.sh
new file mode 100644
index 00000000..8775561c
--- /dev/null
+++ b/instructions/kubernetes/setup_networking.sh
@@ -0,0 +1,28 @@
+#! /usr/bin/bash
+# Author: Filip Fitzermann
+
+# List the overlay and br_netfilter kernel modules in /etc/modules-load.d/k8s.conf
+sudo tee /etc/modules-load.d/k8s.conf <<'EOF'
+overlay
+br_netfilter
+EOF
+
+# Load both modules right away as well
+for kernel_module in overlay br_netfilter; do sudo modprobe "$kernel_module"; done
+
+# sysctl params required by setup; written to /etc/sysctl.d so they persist across reboots
+sudo tee /etc/sysctl.d/k8s.conf <<'EOF'
+net.bridge.bridge-nf-call-iptables  = 1
+net.bridge.bridge-nf-call-ip6tables = 1
+net.ipv4.ip_forward                 = 1
+EOF
+
+# Apply sysctl params without reboot
+sudo sysctl --system
+
+# Verify that the br_netfilter and overlay modules are loaded (prints their lsmod entries)
+for kernel_module in br_netfilter overlay; do lsmod | grep "$kernel_module"; done
+
+# Verify that the three system variables set above are reported as 1
+sysctl net.bridge.bridge-nf-call-iptables \
+  net.bridge.bridge-nf-call-ip6tables net.ipv4.ip_forward