diff --git a/.github/workflows/publish-helm-chart.yml b/.github/workflows/publish-helm-chart.yml index 8ce0698..516e388 100644 --- a/.github/workflows/publish-helm-chart.yml +++ b/.github/workflows/publish-helm-chart.yml @@ -1,37 +1,26 @@ -name: Release Charts - -on: - push: - branches: - - main - +name: Publish charts +# Run the tasks on every push +on: push jobs: - release: - # depending on default permission settings for your org (contents being read-only or read-write for workloads), you will have to add permissions - # see: https://docs.github.com/en/actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token - permissions: - contents: write + publish_charts: + name: Build and push Helm charts runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v3 + - name: Check out the repository + uses: actions/checkout@v2 with: + # This is important for the semver action to work correctly + # when determining the number of commits since the last tag fetch-depth: 0 + submodules: true - - name: Configure Git - run: | - git config user.name "$GITHUB_ACTOR" - git config user.email "$GITHUB_ACTOR@users.noreply.github.com" - - - name: Install Helm - uses: azure/setup-helm@v3 - env: - GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + - name: Get SemVer version for current commit + id: semver + uses: stackhpc/github-actions/semver@master - - name: Run chart-releaser - uses: helm/chart-releaser-action@v1.5.0 + - name: Publish Helm charts + uses: stackhpc/github-actions/helm-publish@master with: - charts_dir: . 
- env: - CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" - + token: ${{ secrets.GITHUB_TOKEN }} + version: ${{ steps.semver.outputs.version }} + app-version: ${{ steps.semver.outputs.short-sha }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0ba5327 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +# Build artifacts from local helm install +slurm-cluster-chart/Chart.lock +slurm-cluster-chart/charts/ diff --git a/README.md b/README.md index 2edf8a0..46db25a 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,7 @@ # Slurm Docker Cluster -This is a multi-container Slurm cluster using Kubernetes. The Helm chart -creates a named volume for persistent storage of MySQL data files as well as -an NFS volume for shared storage. +This is a multi-container Slurm cluster using Kubernetes. The Slurm cluster Helm chart creates a named volume for persistent storage of MySQL data files. By default, it also installs the +RookNFS Helm chart (also in this repo) to provide shared storage across the Slurm cluster nodes. ## Dependencies @@ -27,12 +26,11 @@ The Helm chart will create the following named volumes: * var_lib_mysql ( -> /var/lib/mysql ) -A named ReadWriteMany (RWX) volume mounted to `/home` is also expected, this can be external or can be deployed using the scripts in the `/nfs` directory (See "Deploying the Cluster") +A named ReadWriteMany (RWX) volume mounted to `/home` is also expected, this can be external or can be deployed using the provided `rooknfs` chart directory (See "Deploying the Cluster"). ## Configuring the Cluster -All config files in `slurm-cluster-chart/files` will be mounted into the container to configure their respective services on startup. Note that changes to these files will not all be propagated to existing deployments (see "Reconfiguring the Cluster"). -Additional parameters can be found in the `values.yaml` file, which will be applied on a Helm chart deployment. 
Note that some of these values will also not propagate until the cluster is restarted (see "Reconfiguring the Cluster"). +All config files in `slurm-cluster-chart/files` will be mounted into the container to configure their respective services on startup. Note that changes to these files will not all be propagated to existing deployments (see "Reconfiguring the Cluster"). Additional parameters can be found in the `values.yaml` file for the Helm chart. Note that some of these values will also not propagate until the cluster is restarted (see "Reconfiguring the Cluster"). ## Deploying the Cluster @@ -40,25 +38,26 @@ Additional parameters can be found in the `values.yaml` file, which will be appl On initial deployment ONLY, run ```console -./generate-secrets.sh +./generate-secrets.sh [<target-namespace>] ``` -This generates a set of secrets. If these need to be regenerated, see "Reconfiguring the Cluster" +This generates a set of secrets in the target namespace to be used by the Slurm cluster. If these need to be regenerated, see "Reconfiguring the Cluster". + +Be sure to take note of the Open OnDemand credentials; you will need them to access the cluster through a browser. ### Connecting RWX Volume -A ReadWriteMany (RWX) volume is required, if a named volume exists, set `nfs.claimName` in the `values.yaml` file to its name. If not, manifests to deploy a Rook NFS volume are provided in the `/nfs` directory. You can deploy this by running -```console -./nfs/deploy-nfs.sh -``` -and leaving `nfs.claimName` as the provided value. +A ReadWriteMany (RWX) volume is required for shared storage across cluster nodes. By default, the Rook NFS Helm chart is installed as a dependency of the Slurm cluster chart in order to provide an RWX-capable Storage Class for the required shared volume. 
If the target Kubernetes cluster has an existing storage class which should be used instead, then `storageClass` in `values.yaml` should be set to the name of this existing class and the RookNFS dependency should be disabled by setting `rooknfs.enabled = false`. In either case, the storage capacity of the provisioned RWX volume can be configured by setting the value of `storage.capacity`. + +See the separate RookNFS chart [values.yaml](./rooknfs/values.yaml) for further configuration options when using RookNFS to provide the shared storage volume. ### Supplying Public Keys To access the cluster via `ssh`, you will need to make your public keys available. All your public keys from localhost can be added by running ```console -./publish-keys.sh +./publish-keys.sh [<target-namespace>] ``` +where `<target-namespace>` is the namespace in which the Slurm cluster chart will be deployed (i.e. using `helm install -n <target-namespace> ...`). This will create a Kubernetes ConfigMap in the appropriate namespace for the Slurm cluster to use. Omitting the namespace argument will create it in the `default` namespace. ### Deploying with Helm @@ -66,12 +65,20 @@ After configuring `kubectl` with the appropriate `kubeconfig` file, deploy the c ```console helm install slurm-cluster-chart ``` + +NOTE: If using the RookNFS dependency, then the following must be run before installing the Slurm cluster chart +```console +helm dependency update slurm-cluster-chart +``` + Subsequent releases can be deployed using: ```console helm upgrade slurm-cluster-chart ``` +Note: When updating the cluster with `helm upgrade`, a pre-upgrade hook will prevent upgrades if there are running jobs in the Slurm queue. Attempting to upgrade will set all Slurm nodes to `DRAINED` state. If an upgrade fails due to running jobs, you can undrain the nodes either by waiting for running jobs to complete and then retrying the upgrade or by manually undraining them by accessing the cluster as a privileged user. 
Alternatively you can bypass the hook by running `helm upgrade` with the `--no-hooks` flag (may result in running jobs being lost) + ## Accessing the Cluster Retrieve the external IP address of the login node using: @@ -128,6 +135,7 @@ srun singularity exec docker://ghcr.io/stackhpc/mpitests-container:${MPI_CONTAIN ``` Note: The mpirun script assumes you are running as user 'rocky'. If you are running as root, you will need to include the --allow-run-as-root argument + ## Reconfiguring the Cluster ### Changes to config files @@ -171,3 +179,5 @@ and then restart the other dependent deployments to propagate changes: ```console kubectl rollout restart deployment slurmd slurmctld login slurmdbd ``` + +# Known Issues diff --git a/generate-secrets.sh b/generate-secrets.sh deleted file mode 100755 index db64a53..0000000 --- a/generate-secrets.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -kubectl create secret generic database-auth-secret \ ---dry-run=client \ ---from-literal=password=$(tr -dc 'A-Za-z0-9' /dev/null | base64 -w 0) \ --o yaml | \ -kubectl apply -f - \ No newline at end of file diff --git a/image/Dockerfile b/image/Dockerfile index 167584b..0b07c30 100644 --- a/image/Dockerfile +++ b/image/Dockerfile @@ -9,12 +9,17 @@ LABEL org.opencontainers.image.source="https://github.com/stackhpc/slurm-docker- ARG SLURM_TAG=slurm-23.02 ARG GOSU_VERSION=1.11 +COPY kubernetes.repo /etc/yum.repos.d/kubernetes.repo + RUN set -ex \ && yum makecache \ && yum -y update \ && yum -y install dnf-plugins-core epel-release \ && yum -y install dnf-plugins-core \ && yum config-manager --set-enabled powertools \ + && yum -y module enable ruby:2.7 nodejs:14 \ + && yum -y install https://yum.osc.edu/ondemand/2.0/ondemand-release-web-2.0-1.noarch.rpm \ + && yum -y module install ruby nodejs \ && yum -y install \ wget \ bzip2 \ @@ -42,6 +47,8 @@ RUN set -ex \ hwloc-devel \ openssh-server \ apptainer \ + ondemand \ + kubectl \ && yum clean all \ && rm -rf /var/cache/yum @@ -93,6 +100,7 
@@ RUN mkdir /etc/sysconfig/slurm \ VOLUME /etc/slurm COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh +COPY --chown=slurm:slurm --chmod=744 k8s-slurmd-* /usr/local/bin/ ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] CMD ["slurmdbd"] diff --git a/image/docker-entrypoint.sh b/image/docker-entrypoint.sh index 132c554..e80e96d 100755 --- a/image/docker-entrypoint.sh +++ b/image/docker-entrypoint.sh @@ -48,7 +48,7 @@ then done echo "-- slurmdbd is now active ..." - echo "---> Setting permissions for state directory ..." + echo "---> Setting ownership for state directory ..." chown slurm:slurm /var/spool/slurmctld echo "---> Starting the Slurm Controller Daemon (slurmctld) ..." @@ -86,6 +86,8 @@ then chown root:root /home chmod 755 /home + echo "---> Setting up ssh for user" + mkdir -p /home/rocky/.ssh cp /tmp/authorized_keys /home/rocky/.ssh/authorized_keys @@ -99,25 +101,75 @@ then done popd > /dev/null + echo "---> Complete" echo "---> Starting sshd" - ssh-keygen -A + cp /tempmounts/etc/ssh/* /etc/ssh/ + chmod 600 /etc/ssh/ssh_host_dsa_key + chmod 600 /etc/ssh/ssh_host_ecdsa_key + chmod 600 /etc/ssh/ssh_host_ed25519_key + chmod 600 /etc/ssh/ssh_host_rsa_key /usr/sbin/sshd - start_munge --foreground + start_munge + + echo "---> Setting up self ssh capabilities for OOD" + + if [ -f /home/rocky/.ssh/id_rsa.pub ]; then + echo "ssh keys already found" + else + ssh-keygen -t rsa -f /home/rocky/.ssh/id_rsa -N "" + chown rocky:rocky /home/rocky/.ssh/id_rsa /home/rocky/.ssh/id_rsa.pub + fi + + ssh-keyscan localhost > /etc/ssh/ssh_known_hosts + echo "" >> /home/rocky/.ssh/authorized_keys #Adding newline to avoid breaking authorized_keys file + cat /home/rocky/.ssh/id_rsa.pub >> /home/rocky/.ssh/authorized_keys + + echo "---> Starting Apache Server" + + # mkdir --parents /etc/ood/config/apps/shell + # env > /etc/ood/config/apps/shell/env + + /usr/libexec/httpd-ssl-gencerts + /opt/ood/ood-portal-generator/sbin/update_ood_portal + mkdir --parents 
/opt/rh/httpd24/root/etc/httpd/ + + /usr/bin/htdbm -cb /opt/rh/httpd24/root/etc/httpd/.htpasswd.dbm rocky $ROCKY_OOD_PASS + /usr/sbin/httpd -k start -X -e debug elif [ "$1" = "check-queue-hook" ] then start_munge + scontrol update NodeName=all State=DRAIN Reason="Preventing new jobs running before upgrade" + RUNNING_JOBS=$(squeue --states=RUNNING,COMPLETING,CONFIGURING,RESIZING,SIGNALING,STAGE_OUT,STOPPED,SUSPENDED --noheader --array | wc --lines) if [[ $RUNNING_JOBS -eq 0 ]] then - exit 0 + exit 0 else - exit 1 + exit 1 fi +elif [ "$1" = "undrain-nodes-hook" ] +then + start_munge + scontrol update NodeName=all State=UNDRAIN + exit 0 + +elif [ "$1" = "generate-keys-hook" ] +then + mkdir -p ./temphostkeys/etc/ssh + ssh-keygen -A -f ./temphostkeys + kubectl create secret generic host-keys-secret \ + --dry-run=client \ + --from-file=./temphostkeys/etc/ssh \ + -o yaml | \ + kubectl apply -f - + + exit 0 + elif [ "$1" = "debug" ] then start_munge --foreground diff --git a/image/k8s-slurmd-create b/image/k8s-slurmd-create new file mode 100644 index 0000000..4a99918 --- /dev/null +++ b/image/k8s-slurmd-create @@ -0,0 +1,16 @@ +#!/usr/bin/bash + +echo "$(date) Resume invoked $0 $*" &>> /var/log/slurm/power_save.log + +APISERVER=https://kubernetes.default.svc +SERVICEACCOUNT=/var/run/secrets/kubernetes.io/serviceaccount +NAMESPACE=$(cat ${SERVICEACCOUNT}/namespace) +TOKEN=$(cat ${SERVICEACCOUNT}/token) +CACERT=${SERVICEACCOUNT}/ca.crt + +hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doesn't depend on defined nodes +for host in $hosts +do + ( sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | \ + kubectl --server $APISERVER --token $TOKEN --certificate-authority $CACERT create -f - ) +done \ No newline at end of file diff --git a/image/k8s-slurmd-delete b/image/k8s-slurmd-delete new file mode 100644 index 0000000..da4e438 --- /dev/null +++ b/image/k8s-slurmd-delete @@ -0,0 +1,15 @@ +#!/usr/bin/bash + +echo "$(date) Suspend invoked 
$0 $*" >> /var/log/slurm/power_save.log + +APISERVER=https://kubernetes.default.svc +SERVICEACCOUNT=/var/run/secrets/kubernetes.io/serviceaccount +NAMESPACE=$(cat ${SERVICEACCOUNT}/namespace) +TOKEN=$(cat ${SERVICEACCOUNT}/token) +CACERT=${SERVICEACCOUNT}/ca.crt + +hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doesn't depend on defined nodes +for host in $hosts +do + kubectl --server $APISERVER --token $TOKEN --certificate-authority $CACERT delete pod $host +done diff --git a/image/kubernetes.repo b/image/kubernetes.repo new file mode 100644 index 0000000..f4ae4ff --- /dev/null +++ b/image/kubernetes.repo @@ -0,0 +1,6 @@ +[kubernetes] +name=Kubernetes +baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-$basearch +enabled=1 +gpgcheck=1 +gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg diff --git a/nfs/deploy-nfs.sh b/nfs/deploy-nfs.sh deleted file mode 100755 index b2d2f75..0000000 --- a/nfs/deploy-nfs.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -# Based on https://rook.io/docs/nfs/v1.7/quickstart.html -# Manifests listed explicitly here to guarantee ordering - -kubectl create -f nfs/crds.yaml -kubectl create -f nfs/operator.yaml -kubectl create -f nfs/rbac.yaml -kubectl create -f nfs/nfs.yaml -kubectl create -f nfs/sc.yaml -kubectl create -f nfs/pvc.yaml diff --git a/nfs/pvc.yaml b/nfs/pvc.yaml deleted file mode 100644 index 7f0a3d7..0000000 --- a/nfs/pvc.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: rook-nfs-pv-claim -spec: - storageClassName: "rook-nfs-share1" - accessModes: - - ReadWriteMany - resources: - requests: - storage: 10Gi diff --git a/nfs/teardown-nfs.sh b/nfs/teardown-nfs.sh deleted file mode 100755 index 4dde364..0000000 --- a/nfs/teardown-nfs.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -kubectl delete -f web-service.yaml -kubectl delete -f web-rc.yaml -kubectl delete -f 
busybox-rc.yaml -kubectl delete -f pvc.yaml -kubectl delete -f pv.yaml -kubectl delete -f nfs.yaml -kubectl delete -f nfs-xfs.yaml -kubectl delete -f nfs-ceph.yaml -kubectl delete -f rbac.yaml -kubectl delete -f psp.yaml -kubectl delete -f scc.yaml # if deployed -kubectl delete -f operator.yaml -kubectl delete -f webhook.yaml # if deployed -kubectl delete -f crds.yaml diff --git a/publish-keys.sh b/publish-keys.sh index d293e81..bdd4e0f 100755 --- a/publish-keys.sh +++ b/publish-keys.sh @@ -1,3 +1,8 @@ -kubectl create configmap authorized-keys-configmap \ +NAMESPACE="$1" +if [[ -z $1 ]]; then + NAMESPACE=default +fi +echo Installing in namespace $NAMESPACE +kubectl -n $NAMESPACE create configmap authorized-keys-configmap \ "--from-literal=authorized_keys=$(cat ~/.ssh/*.pub)" --dry-run=client -o yaml | \ -kubectl apply -f - \ No newline at end of file +kubectl -n $NAMESPACE apply -f - \ No newline at end of file diff --git a/rooknfs/Chart.yaml b/rooknfs/Chart.yaml new file mode 100644 index 0000000..b8abd25 --- /dev/null +++ b/rooknfs/Chart.yaml @@ -0,0 +1,4 @@ +apiVersion: v2 +name: rooknfs +version: 0.0.1 +description: A packaged installation of Rook NFS for Kubernetes. \ No newline at end of file diff --git a/rooknfs/README.md b/rooknfs/README.md new file mode 100644 index 0000000..5b7ad6d --- /dev/null +++ b/rooknfs/README.md @@ -0,0 +1,3 @@ +# RookNFS Helm Chart + +See `values.yaml` for available config options. 
\ No newline at end of file diff --git a/nfs/crds.yaml b/rooknfs/crds/crds.yaml similarity index 100% rename from nfs/crds.yaml rename to rooknfs/crds/crds.yaml diff --git a/rooknfs/templates/hooks/pre-delete.yaml b/rooknfs/templates/hooks/pre-delete.yaml new file mode 100644 index 0000000..2c75c89 --- /dev/null +++ b/rooknfs/templates/hooks/pre-delete.yaml @@ -0,0 +1,50 @@ +# NOTE: The cleanup jobs defined here are required to ensure that things which +# Rook NFS is responsible for cleaning up are deleted before deleting the Rook +# pods which do the actual clean up of NFS resources. For example, the RWM PVC +# must be deleted before the Rook StorageClass and provisioner pod. However, +# the PVC cannot be deleted until the pods which are using it are deleted, so +# the various Slurm node pods must actually be the first resources deleted. +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: rook-nfs-cleanup +--- +# TODO: Create a job-specific ClusterRole for the ServiceAccount +# instead of using the cluster-admin role here +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: rook-nfs-cleanup +subjects: +- kind: ServiceAccount + name: rook-nfs-cleanup + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: cluster-admin +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: rook-nfs-pre-delete-cleanup + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-delete-policy": hook-succeeded + "helm.sh/hook-weight": "10" +spec: + template: + metadata: + name: rook-nfs-pre-delete-cleanup + spec: + serviceAccountName: rook-nfs-cleanup + containers: + - name: tester + image: bitnami/kubectl + command: + - "bin/bash" + - "-c" + - | + kubectl delete -n {{ .Values.serverNamespace }} nfsservers {{ .Values.serverName }} --wait + restartPolicy: Never +--- \ No newline at end of file diff --git a/nfs/nfs.yaml b/rooknfs/templates/nfs.yaml similarity index 56% rename from nfs/nfs.yaml rename to 
rooknfs/templates/nfs.yaml index 742fa34..cf7b1de 100644 --- a/nfs/nfs.yaml +++ b/rooknfs/templates/nfs.yaml @@ -3,30 +3,34 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: nfs-default-claim - namespace: rook-nfs + name: {{ .Values.claimName }} + namespace: {{ .Values.serverNamespace }} spec: + {{- if .Values.backingStorageClass }} + storageClassName: {{ .Values.backingStorageClass }} + {{- end }} accessModes: - ReadWriteMany resources: requests: - storage: 1Gi + storage: {{ .Values.storageCapacity }} --- apiVersion: nfs.rook.io/v1alpha1 kind: NFSServer metadata: - name: rook-nfs - namespace: rook-nfs + name: {{ .Values.serverName }} + namespace: {{ .Values.serverNamespace }} spec: replicas: 1 exports: - - name: share1 + - name: {{ .Values.shareName }} server: accessMode: ReadWrite squash: "none" # A Persistent Volume Claim must be created before creating NFS CRD instance. persistentVolumeClaim: - claimName: nfs-default-claim + claimName: {{ .Values.claimName }} # A key/value list of annotations annotations: rook: nfs +--- diff --git a/nfs/operator.yaml b/rooknfs/templates/operator.yaml similarity index 92% rename from nfs/operator.yaml rename to rooknfs/templates/operator.yaml index b289909..56318f6 100644 --- a/nfs/operator.yaml +++ b/rooknfs/templates/operator.yaml @@ -1,13 +1,14 @@ +--- apiVersion: v1 kind: Namespace metadata: - name: rook-nfs-system # namespace:operator + name: {{ .Values.systemNamespace }} --- apiVersion: v1 kind: ServiceAccount metadata: name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 @@ -20,7 +21,7 @@ roleRef: subjects: - kind: ServiceAccount name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -106,7 +107,7 @@ apiVersion: apps/v1 kind: Deployment metadata: 
name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} labels: app: rook-nfs-operator spec: @@ -134,3 +135,4 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace +--- diff --git a/nfs/rbac.yaml b/rooknfs/templates/rbac.yaml similarity index 90% rename from nfs/rbac.yaml rename to rooknfs/templates/rbac.yaml index 8e3d9f7..422a43b 100644 --- a/nfs/rbac.yaml +++ b/rooknfs/templates/rbac.yaml @@ -2,13 +2,13 @@ apiVersion: v1 kind: Namespace metadata: - name: rook-nfs + name: {{ .Values.serverNamespace }} --- apiVersion: v1 kind: ServiceAccount metadata: name: rook-nfs-server - namespace: rook-nfs + namespace: {{ .Values.serverNamespace }} --- kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 @@ -51,9 +51,9 @@ metadata: subjects: - kind: ServiceAccount name: rook-nfs-server - # replace with namespace where provisioner is deployed - namespace: rook-nfs + namespace: {{ .Values.serverNamespace }} roleRef: kind: ClusterRole name: rook-nfs-provisioner-runner apiGroup: rbac.authorization.k8s.io +--- \ No newline at end of file diff --git a/nfs/sc.yaml b/rooknfs/templates/sc.yaml similarity index 52% rename from nfs/sc.yaml rename to rooknfs/templates/sc.yaml index 6f9e3ae..505bd44 100644 --- a/nfs/sc.yaml +++ b/rooknfs/templates/sc.yaml @@ -1,13 +1,15 @@ +--- apiVersion: storage.k8s.io/v1 kind: StorageClass metadata: labels: app: rook-nfs - name: rook-nfs-share1 + name: {{ .Values.storageClassName }} parameters: - exportName: share1 - nfsServerName: rook-nfs - nfsServerNamespace: rook-nfs + exportName: {{ .Values.shareName }} + nfsServerName: {{ .Values.serverName }} + nfsServerNamespace: {{ .Values.serverNamespace }} provisioner: nfs.rook.io/rook-nfs-provisioner reclaimPolicy: Delete volumeBindingMode: Immediate +--- \ No newline at end of file diff --git a/rooknfs/values.yaml b/rooknfs/values.yaml new file mode 100644 index 0000000..4ada627 --- /dev/null +++ b/rooknfs/values.yaml @@ -0,0 +1,28 @@ 
+ +# Name for the NFSServer resource created by Rook +serverName: rook-nfs + +# Name for the created storage class +storageClassName: rook-nfs + +# Name for the Read-Write-Once backing PVC created by Rook +claimName: rook-nfs-backing-pv + +# Storage class to use for the Read-Write-Once backing PVC +backingStorageClass: + +# Name for the NFS share within the NFS Resource instance +shareName: share-1 + +# Size of the Read-Write-Once backing storage volume +storageCapacity: 10Gi + +# Image to use for the Rook NFS operator +operatorImage: rook/nfs:master + +# NOTE: For some reason deploying everything in the default +# namespace leads to R-W-M PVCs getting stuck in 'pending' +# state indefinitely, so here we separate out the namespaces +# of the various components in the same way as the Rook docs +serverNamespace: rook-nfs +systemNamespace: rook-nfs-system diff --git a/slurm-cluster-chart/Chart.yaml b/slurm-cluster-chart/Chart.yaml index 9e592c0..e3d003c 100644 --- a/slurm-cluster-chart/Chart.yaml +++ b/slurm-cluster-chart/Chart.yaml @@ -21,4 +21,10 @@ version: 0.1.0 # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "1.16.0" \ No newline at end of file +appVersion: "1.16.0" + +dependencies: + - name: rooknfs + version: ">=0-0" + repository: file://../rooknfs + condition: rooknfs.enabled diff --git a/slurm-cluster-chart/files/httpd.conf b/slurm-cluster-chart/files/httpd.conf new file mode 100644 index 0000000..248afb2 --- /dev/null +++ b/slurm-cluster-chart/files/httpd.conf @@ -0,0 +1,356 @@ +# Modified from file installed by httpd package +# This is the main Apache HTTP server configuration file. It contains the +# configuration directives that give the server its instructions. +# See <URL:http://httpd.apache.org/docs/2.4/> for detailed information. +# In particular, see +# <URL:http://httpd.apache.org/docs/2.4/mod/directives.html> +# for a discussion of each configuration directive. 
+# +# See the httpd.conf(5) man page for more information on this configuration, +# and httpd.service(8) on using and configuring the httpd service. +# +# Do NOT simply read the instructions in here without understanding +# what they do. They're here only as hints or reminders. If you are unsure +# consult the online docs. You have been warned. +# +# Configuration and logfile names: If the filenames you specify for many +# of the server's control files begin with "/" (or "drive:/" for Win32), the +# server will use that explicit path. If the filenames do *not* begin +# with "/", the value of ServerRoot is prepended -- so 'log/access_log' +# with ServerRoot set to '/www' will be interpreted by the +# server as '/www/log/access_log', where as '/log/access_log' will be +# interpreted as '/log/access_log'. + +# +# ServerRoot: The top of the directory tree under which the server's +# configuration, error, and log files are kept. +# +# Do not add a slash at the end of the directory path. If you point +# ServerRoot at a non-local disk, be sure to specify a local disk on the +# Mutex directive, if file-based mutexes are used. If you wish to share the +# same ServerRoot for multiple httpd daemons, you will need to change at +# least PidFile. +# +ServerRoot "/etc/httpd" + +# +# Listen: Allows you to bind Apache to specific IP addresses and/or +# ports, instead of the default. See also the +# directive. +# +# Change this to Listen on specific IP addresses as shown below to +# prevent Apache from glomming onto all bound IP addresses. +# +#Listen 12.34.56.78:80 +Listen 80 + +# +# Dynamic Shared Object (DSO) Support +# +# To be able to use the functionality of a module which was built as a DSO you +# have to place corresponding `LoadModule' lines at this location so the +# directives contained in it are actually available _before_ they are used. +# Statically compiled modules (those listed by `httpd -l') do not need +# to be loaded here. 
+# +# Example: +# LoadModule foo_module modules/mod_foo.so +# +Include conf.modules.d/*.conf + +# +# If you wish httpd to run as a different user or group, you must run +# httpd as root initially and it will switch. +# +# User/Group: The name (or #number) of the user/group to run httpd as. +# It is usually good practice to create a dedicated user and group for +# running httpd, as with most system services. +# +User apache +Group apache + +# 'Main' server configuration +# +# The directives in this section set up the values used by the 'main' +# server, which responds to any requests that aren't handled by a +# <VirtualHost> definition. These values also provide defaults for +# any <VirtualHost> containers you may define later in the file. +# +# All of these directives may appear inside <VirtualHost> containers, +# in which case these default settings will be overridden for the +# virtual host being defined. +# + +# +# ServerAdmin: Your address, where problems with the server should be +# e-mailed. This address appears on some server-generated pages, such +# as error documents. e.g. admin@your-domain.com +# +ServerAdmin root@localhost + +# +# ServerName gives the name and port that the server uses to identify itself. +# This can often be determined automatically, but we recommend you specify +# it explicitly to prevent problems during startup. +# +# If your host doesn't have a registered DNS name, enter its IP address here. +# +#ServerName www.example.com:80 + +# +# Deny access to the entirety of your server's filesystem. You must +# explicitly permit access to web content directories in other +# <Directory> blocks below. +# +<Directory /> + AllowOverride none + Require all denied +</Directory> + +# +# Note that from this point forward you must specifically allow +# particular features to be enabled - so if something's not working as +# you might expect, make sure that you have specifically enabled it +# below. +# + +# +# DocumentRoot: The directory out of which you will serve your +# documents. 
By default, all requests are taken from this directory, but +# symbolic links and aliases may be used to point to other locations. +# +DocumentRoot "/var/www/html" + +# +# Relax access to content within /var/www. +# + + AllowOverride None + # Allow open access: + Require all granted + + +# Further relax access to the default document root: + + # + # Possible values for the Options directive are "None", "All", + # or any combination of: + # Indexes Includes FollowSymLinks SymLinksifOwnerMatch ExecCGI MultiViews + # + # Note that "MultiViews" must be named *explicitly* --- "Options All" + # doesn't give it to you. + # + # The Options directive is both complicated and important. Please see + # http://httpd.apache.org/docs/2.4/mod/core.html#options + # for more information. + # + Options Indexes FollowSymLinks + + # + # AllowOverride controls what directives may be placed in .htaccess files. + # It can be "All", "None", or any combination of the keywords: + # Options FileInfo AuthConfig Limit + # + AllowOverride None + + # + # Controls who can get stuff from this server. + # + Require all granted + + +# +# DirectoryIndex: sets the file that Apache will serve if a directory +# is requested. +# + + DirectoryIndex index.html + + +# +# The following lines prevent .htaccess and .htpasswd files from being +# viewed by Web clients. +# + + Require all denied + + +# +# ErrorLog: The location of the error log file. +# If you do not specify an ErrorLog directive within a +# container, error messages relating to that virtual host will be +# logged here. If you *do* define an error logfile for a +# container, that host's errors will be logged there and not here. +# +ErrorLog "logs/error_log" + +# +# LogLevel: Control the number of messages logged to the error_log. +# Possible values include: debug, info, notice, warn, error, crit, +# alert, emerg. +# +LogLevel debug + + + # + # The following directives define some format nicknames for use with + # a CustomLog directive (see below). 
+ # + LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined + LogFormat "%h %l %u %t \"%r\" %>s %b" common + + + # You need to enable mod_logio.c to use %I and %O + LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %I %O" combinedio + + + # + # The location and format of the access logfile (Common Logfile Format). + # If you do not define any access logfiles within a + # container, they will be logged here. Contrariwise, if you *do* + # define per- access logfiles, transactions will be + # logged therein and *not* in this file. + # + #CustomLog "logs/access_log" common + + # + # If you prefer a logfile with access, agent, and referer information + # (Combined Logfile Format) you can use the following directive. + # + CustomLog "logs/access_log" combined + + + + # + # Redirect: Allows you to tell clients about documents that used to + # exist in your server's namespace, but do not anymore. The client + # will make a new request for the document at its new location. + # Example: + # Redirect permanent /foo http://www.example.com/bar + + # + # Alias: Maps web paths into filesystem paths and is used to + # access content that does not live under the DocumentRoot. + # Example: + # Alias /webpath /full/filesystem/path + # + # If you include a trailing / on /webpath then the server will + # require it to be present in the URL. You will also likely + # need to provide a section to allow access to + # the filesystem path. + + # + # ScriptAlias: This controls which directories contain server scripts. + # ScriptAliases are essentially the same as Aliases, except that + # documents in the target directory are treated as applications and + # run by the server when requested rather than as documents sent to the + # client. The same rules about trailing "/" apply to ScriptAlias + # directives as to Alias. 
+ # + ScriptAlias /cgi-bin/ "/var/www/cgi-bin/" + + + +# +# "/var/www/cgi-bin" should be changed to whatever your ScriptAliased +# CGI directory exists, if you have that configured. +# + + AllowOverride None + Options None + Require all granted + + + + # + # TypesConfig points to the file containing the list of mappings from + # filename extension to MIME-type. + # + TypesConfig /etc/mime.types + + # + # AddType allows you to add to or override the MIME configuration + # file specified in TypesConfig for specific file types. + # + #AddType application/x-gzip .tgz + # + # AddEncoding allows you to have certain browsers uncompress + # information on the fly. Note: Not all browsers support this. + # + #AddEncoding x-compress .Z + #AddEncoding x-gzip .gz .tgz + # + # If the AddEncoding directives above are commented-out, then you + # probably should define those extensions to indicate media types: + # + AddType application/x-compress .Z + AddType application/x-gzip .gz .tgz + + # + # AddHandler allows you to map certain file extensions to "handlers": + # actions unrelated to filetype. These can be either built into the server + # or added with the Action directive (see below) + # + # To use CGI scripts outside of ScriptAliased directories: + # (You will also need to add "ExecCGI" to the "Options" directive.) + # + #AddHandler cgi-script .cgi + + # For type maps (negotiated resources): + #AddHandler type-map var + + # + # Filters allow you to process content before it is sent to the client. + # + # To parse .shtml files for server-side includes (SSI): + # (You will also need to add "Includes" to the "Options" directive.) + # + AddType text/html .shtml + AddOutputFilter INCLUDES .shtml + + +# +# Specify a default charset for all content served; this enables +# interpretation of all content as UTF-8 by default. 
To use the +# default browser choice (ISO-8859-1), or to allow the META tags +# in HTML content to override this choice, comment out this +# directive: +# +AddDefaultCharset UTF-8 + + + # + # The mod_mime_magic module allows the server to use various hints from the + # contents of the file itself to determine its type. The MIMEMagicFile + # directive tells the module where the hint definitions are located. + # + MIMEMagicFile conf/magic + + +# +# Customizable error responses come in three flavors: +# 1) plain text 2) local redirects 3) external redirects +# +# Some examples: +#ErrorDocument 500 "The server made a boo boo." +#ErrorDocument 404 /missing.html +#ErrorDocument 404 "/cgi-bin/missing_handler.pl" +#ErrorDocument 402 http://www.example.com/subscription_info.html +# + +# +# EnableMMAP and EnableSendfile: On systems that support it, +# memory-mapping or the sendfile syscall may be used to deliver +# files. This usually improves server performance, but must +# be turned off when serving from networked-mounted +# filesystems or if support for these functions is otherwise +# broken on your system. +# Defaults if commented: EnableMMAP On, EnableSendfile Off +# +#EnableMMAP off +EnableSendfile on + +# Supplemental configuration +# +# Load config files in the "/etc/httpd/conf.d" directory, if any. 
+IncludeOptional conf.d/*.conf \ No newline at end of file diff --git a/slurm-cluster-chart/files/ood-cluster-config.yml b/slurm-cluster-chart/files/ood-cluster-config.yml new file mode 100644 index 0000000..cc0ab76 --- /dev/null +++ b/slurm-cluster-chart/files/ood-cluster-config.yml @@ -0,0 +1,11 @@ +--- +v2: + metadata: + title: "Slurm Cluster" + login: + host: "localhost" + job: + cluster: "linux" + adapter: "slurm" + bin: "/usr/bin" + conf: "/etc/slurm/slurm.conf" \ No newline at end of file diff --git a/slurm-cluster-chart/files/ood_portal.yaml b/slurm-cluster-chart/files/ood_portal.yaml new file mode 100644 index 0000000..d5227b2 --- /dev/null +++ b/slurm-cluster-chart/files/ood_portal.yaml @@ -0,0 +1,250 @@ +# Modified from file installed by ondemand package +--- +# +# Portal configuration +# + +# The address and port to listen for connections on +# Example: +# listen_addr_port: 443 +# Default: null (don't add any more listen directives) +#listen_addr_port: 80 + +# The server name used for name-based Virtual Host +# Example: +# servername: 'www.example.com' +# Default: null (don't use name-based Virtual Host) +#servername: 128.232.226.84 +#serverAlias: 128.232.226.84 + +# The port specification for the Virtual Host +# Example: +# port: 8080 +#Default: null (use default port 80 or 443 if SSL enabled) +#port: null + +# List of SSL Apache directives +# Example: +# ssl: +# - 'SSLCertificateFile "/etc/pki/tls/certs/www.example.com.crt"' +# - 'SSLCertificateKeyFile "/etc/pki/tls/private/www.example.com.key"' +# Default: null (no SSL support) +#ssl: null +ssl: +- 'SSLCertificateFile "/etc/pki/tls/certs/localhost.crt"' +- 'SSLCertificateKeyFile "/etc/pki/tls/private/localhost.key"' + +# Root directory of log files (can be relative ServerRoot) +# Example: +# logroot: '/path/to/my/logs' +# Default: 'logs' (this is relative to ServerRoot) +#logroot: 'logs' + +# Root directory of the Lua handler code +# Example: +# lua_root: '/path/to/lua/handlers' +# Default : 
'/opt/ood/mod_ood_proxy/lib' (default install directory of mod_ood_proxy) +#lua_root: '/opt/ood/mod_ood_proxy/lib' + +# Verbosity of the Lua module logging +# (see https://httpd.apache.org/docs/2.4/mod/core.html#loglevel) +# Example: +# lua_log_level: 'warn' +# Default: 'info' (get verbose logs) +#lua_log_level: 'info' + +# System command used to map authenticated-user to system-user +# Example: +# user_map_cmd: '/opt/ood/ood_auth_map/bin/ood_auth_map.regex --regex=''^(\w+)@example.com$''' +# Default: '/opt/ood/ood_auth_map/bin/ood_auth_map.regex' (this echoes back the auth-user) +#user_map_cmd: '/opt/ood/ood_auth_map/bin/ood_auth_map.regex' + +# Use an alternative CGI environment variable instead of REMOTE_USER for +# determining the authenticated-user fed to the mapping script +# Example: +# user_env: 'OIDC_CLAIM_preferred_username' +# Default: null (use REMOTE_USER) +#user_env: null + +# Redirect user to the following URI if we fail to map their authenticated-user to +# a system-user +# Example: +# map_fail_uri: '/register' +# Default: null (don't redirect, just display error message) +#map_fail_uri: null + +# System command used to run the `nginx_stage` script with sudo privileges +# Example: +# pun_stage_cmd: 'sudo /path/to/nginx_stage' +# Default: 'sudo /opt/ood/nginx_stage/sbin/nginx_stage' (don't forget sudo) +#pun_stage_cmd: 'sudo /opt/ood/nginx_stage/sbin/nginx_stage' + +# List of Apache authentication directives +# NB: Be sure the appropriate Apache module is installed for this +# Default: (see below, uses basic auth with an htpasswd file) +# auth: +# - 'AuthType Basic' +# - 'AuthName "private"' +# - 'AuthUserFile "/opt/rh/httpd24/root/etc/httpd/.htpasswd"' +# - 'RequestHeader unset Authorization' +# - 'Require valid-user' + +# Redirect user to the following URI when accessing root URI +# Example: +# root_uri: '/my_uri' +# # https://www.example.com/ => https://www.example.com/my_uri +# Default: '/pun/sys/dashboard' (default location of the OOD Dashboard app) 
+#root_uri: '/pun/sys/dashboard' + +# Track server-side analytics with a Google Analytics account and property +# (see https://github.com/OSC/mod_ood_proxy/blob/master/lib/analytics.lua for +# information on how to setup the GA property) +# Example: +# analytics: +# url: 'http://www.google-analytics.com/collect' +# id: 'UA-79331310-4' +# Default: null (do not track) +#analytics: null + +# +# Publicly available assets +# + +# Public sub-uri (available to public with no authentication) +# Example: +# public_uri: '/assets' +# Default: '/public' +#public_uri: '/public' + +# Root directory that serves the public sub-uri (be careful, everything under +# here is open to the public) +# Example: +# public_root: '/path/to/public/assets' +# Default: '/var/www/ood/public' +#public_root: '/var/www/ood/public' + +# +# Logout redirect helper +# + +# Logout sub-uri +# Example +# logout_uri: '/log_me_out' +# NB: If you change this, then modify the Dashboard app with the new sub-uri +# Default: '/logout' (the Dashboard app is by default going to expect this) +#logout_uri: '/logout' + +# Redirect user to the following URI when accessing logout URI +# Example: +# logout_redirect: '/oidc?logout=https%3A%2F%2Fwww.example.com' +# Default: '/pun/sys/dashboard/logout' (the Dashboard app provides a simple +# HTML page explaining logout to the user) +#logout_redirect: '/pun/sys/dashboard/logout' + +# +# Reverse proxy to backend nodes +# + +# Regular expression used for whitelisting allowed hostnames of nodes +# Example: +# host_regex: '[\w.-]+\.example\.com' +# Default: '[^/]+' (allow reverse proxying to all hosts, this allows external +# hosts as well) +#host_regex: '[^/]+' + +# Sub-uri used to reverse proxy to backend web server running on node that +# knows the full URI path +# Example: +# node_uri: '/node' +# Default: null (disable this feature) +#node_uri: null + +# Sub-uri used to reverse proxy to backend web server running on node that +# ONLY uses *relative* URI paths +# Example: +# 
rnode_uri: '/rnode' +# Default: null (disable this feature) +#rnode_uri: null + +# +# Per-user NGINX Passenger apps +# + +# Sub-uri used to control PUN processes +# Example: +# nginx_uri: '/my_pun_controller' +# Default: '/nginx' +#nginx_uri: '/nginx' + +# Sub-uri used to access the PUN processes +# Example: +# pun_uri: '/my_pun_apps' +# Default: '/pun' +#pun_uri: '/pun' + +# Root directory that contains the PUN Unix sockets that the proxy uses to +# connect to +# Example: +# pun_socket_root: '/path/to/pun/sockets' +# Default: '/var/run/ondemand-nginx' (default location set in nginx_stage) +#pun_socket_root: '/var/run/ondemand-nginx' + +# Number of times the proxy attempts to connect to the PUN Unix socket before +# giving up and displaying an error to the user +# Example: +# pun_max_retries: 25 +# Default: 5 (only try 5 times) +#pun_max_retries: 5 + +# +# Support for OpenID Connect +# + +# Sub-uri used by mod_auth_openidc for authentication +# Example: +# oidc_uri: '/oidc' +# Default: null (disable OpenID Connect support) +#oidc_uri: null + +# Sub-uri user is redirected to if they are not authenticated. This is used to +# *discover* what ID provider the user will login through. 
+# Example: +# oidc_discover_uri: '/discover' +# Default: null (disable support for discovering OpenID Connect IdP) +#oidc_discover_uri: null + +# Root directory on the filesystem that serves the HTML code used to display +# the discovery page +# Example: +# oidc_discover_root: '/var/www/ood/discover' +# Default: null (disable support for discovering OpenID Connect IdP) +#oidc_discover_root: null + +# +# Support for registering unmapped users +# +# (Not necessary if using regular expressions for mapping users) +# + +# Sub-uri user is redirected to if unable to map authenticated-user to +# system-user +# Example: +# register_uri: '/register' +# Default: null (display error to user if mapping fails) +#register_uri: null + +# Root directory on the filesystem that serves the HTML code used to register +# an unmapped user +# Example: +# register_root: '/var/www/ood/register' +# Default: null (display error to user if mapping fails) +#register_root: null + +host_regex: 'head' +auth: + - 'AuthType Basic' + - 'AuthName "private"' + - 'AuthBasicProvider dbm' + - 'AuthDBMUserFile "/opt/rh/httpd24/root/etc/httpd/.htpasswd.dbm"' + - 'RequestHeader unset Authorization' + - 'Require valid-user' \ No newline at end of file diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index a10c12b..5bc40f7 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -36,6 +36,7 @@ SlurmctldDebug=3 SlurmctldLogFile=/var/log/slurm/slurmctld.log SlurmdDebug=3 SlurmdLogFile=/var/log/slurm/slurmd.log +DebugFlags=Power JobCompType=jobcomp/filetxt JobCompLoc=/var/log/slurm/jobcomp.log # @@ -47,12 +48,23 @@ AccountingStorageType=accounting_storage/slurmdbd AccountingStorageHost=slurmdbd AccountingStoragePort=6819 # -SlurmctldParameters=cloud_dns,cloud_reg_addrs +SlurmctldParameters=cloud_reg_addrs,idle_on_node_suspend CommunicationParameters=NoAddrCache +ReconfigFlags=KeepPowerSaveSettings +#ResumeFailProgram=TODO? 
+ResumeProgram=/usr/local/bin/k8s-slurmd-create +#ResumeTimeout=60 # default +#SuspendExcNodes= +#SuspendExcParts= +#SuspendExcStates= +SuspendProgram=/usr/local/bin/k8s-slurmd-delete +SuspendTime=30 # for debugging +#SuspendTimeout= +TreeWidth=65533 # NODES MaxNodeCount=10 -NodeName=slurmd-[0-9] State=FUTURE CPUs=4 +NodeName=slurmd-[0-9] State=CLOUD CPUs=4 # PARTITIONS PartitionName=all Default=yes Nodes=ALL diff --git a/slurm-cluster-chart/templates/cluster-config-configmap.yaml b/slurm-cluster-chart/templates/cluster-config-configmap.yaml new file mode 100644 index 0000000..914a456 --- /dev/null +++ b/slurm-cluster-chart/templates/cluster-config-configmap.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: cluster-config +data: + ood-cluster-config.yml: | + {{- .Files.Get "files/ood-cluster-config.yml" | nindent 4 -}} \ No newline at end of file diff --git a/slurm-cluster-chart/templates/database-auth-secret.yaml b/slurm-cluster-chart/templates/database-auth-secret.yaml new file mode 100644 index 0000000..1a1d6ea --- /dev/null +++ b/slurm-cluster-chart/templates/database-auth-secret.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +metadata: + name: database-auth-secret + annotations: + helm.sh/hook: pre-install +data: + password: {{ randAlphaNum 32 | b64enc }} diff --git a/slurm-cluster-chart/templates/generate-keys-hook.yaml b/slurm-cluster-chart/templates/generate-keys-hook.yaml new file mode 100644 index 0000000..c05e7f2 --- /dev/null +++ b/slurm-cluster-chart/templates/generate-keys-hook.yaml @@ -0,0 +1,22 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: generate-keys-hook + annotations: + "helm.sh/hook": pre-install + "helm.sh/hook-delete-policy": hook-succeeded + "helm.sh/hook-weight": "3" +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 0 + template: + metadata: + name: generate-keys-hook + spec: + serviceAccountName: secret-generator-account + restartPolicy: Never + containers: + - name: generate-keys-hook + image: {{ 
.Values.slurmImage }} + args: + - generate-keys-hook diff --git a/slurm-cluster-chart/templates/helm-authorized-keys-configmap.yaml b/slurm-cluster-chart/templates/helm-authorized-keys-configmap.yaml new file mode 100644 index 0000000..75ad249 --- /dev/null +++ b/slurm-cluster-chart/templates/helm-authorized-keys-configmap.yaml @@ -0,0 +1,9 @@ +# Only rendered when sshPublicKey is set in values.yaml; otherwise the chart assumes publish-keys.sh was run prior to the helm release. The key is quoted so it always renders as a single YAML string. +{{ if .Values.sshPublicKey }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: helm-authorized-keys-configmap +data: + authorized_keys: {{ .Values.sshPublicKey | quote }} +{{ end }} diff --git a/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml b/slurm-cluster-chart/templates/hooks/check-jobs-finished-hook.yaml similarity index 100% rename from slurm-cluster-chart/templates/check-jobs-finished-hook.yaml rename to slurm-cluster-chart/templates/hooks/check-jobs-finished-hook.yaml diff --git a/slurm-cluster-chart/templates/hooks/pre-delete.yaml b/slurm-cluster-chart/templates/hooks/pre-delete.yaml new file mode 100644 index 0000000..868cbbd --- /dev/null +++ b/slurm-cluster-chart/templates/hooks/pre-delete.yaml @@ -0,0 +1,55 @@ +{{- if .Values.rooknfs.enabled }} +# NOTE: The cleanup jobs defined here are required to ensure that things which +# Rook NFS is responsible for cleaning up are deleted before deleting the Rook +# pods which do the actual clean up of NFS resources. For example, the RWM PVC +# must be deleted before the Rook StorageClass and provisioner pod. However, +# the PVC cannot be deleted until the pods which are using it are deleted, so +# the various Slurm node pods must actually be the first resources deleted. 
+--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: slurm-k8s-cleanup +--- +# TODO: Create a job-specific ClusterRole for the ServiceAccount +# instead of using the cluster-admin role here +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: slurm-k8s-cleanup +subjects: +- kind: ServiceAccount + name: slurm-k8s-cleanup + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: cluster-admin +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: slurm-k8s-pre-delete-cleanup + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-delete-policy": hook-succeeded + "helm.sh/hook-weight": "1" +spec: + template: + metadata: + name: slurm-k8s-pre-delete-cleanup + spec: + serviceAccountName: slurm-k8s-cleanup + containers: + - name: tester + image: bitnami/kubectl + command: + - "/bin/bash" + - "-c" + - | + kubectl delete -n {{ .Release.Namespace }} deployment {{ .Values.login.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} statefulset {{ .Values.slurmctld.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} pod -l app.kubernetes.io/name=slurm,app.kubernetes.io/component=slurmd --wait # slurmd nodes are Pods (see slurmd-pod-template), not a StatefulSet + kubectl delete -n {{ .Release.Namespace }} pvc {{ .Values.storage.claimName }} --wait + restartPolicy: Never +--- +{{- end }} diff --git a/slurm-cluster-chart/templates/httpd-configmap.yaml b/slurm-cluster-chart/templates/httpd-configmap.yaml new file mode 100644 index 0000000..93eb6ea --- /dev/null +++ b/slurm-cluster-chart/templates/httpd-configmap.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: httpd-configmap +data: + httpd.conf: | + {{- .Files.Get "files/httpd.conf" | nindent 4 -}} + \ No newline at end of file diff --git a/slurm-cluster-chart/templates/login-service.yaml b/slurm-cluster-chart/templates/login-service.yaml index c6f93c9..df8892d 100644 --- a/slurm-cluster-chart/templates/login-service.yaml +++ 
b/slurm-cluster-chart/templates/login-service.yaml @@ -11,6 +11,14 @@ spec: - name: ssh port: 22 targetPort: 22 + - name: apache + port: 80 + targetPort: 80 + protocol: TCP + - name: https + port: 443 + targetPort: 443 + protocol: TCP type: LoadBalancer selector: app.kubernetes.io/name: slurm diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login.yaml similarity index 54% rename from slurm-cluster-chart/templates/login-deployment.yaml rename to slurm-cluster-chart/templates/login.yaml index 48f8f17..a02d9e0 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login.yaml @@ -5,9 +5,9 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: login - name: login + name: {{ .Values.login.name }} spec: - replicas: {{ .Values.replicas.login }} + replicas: {{ .Values.login.replicas }} selector: matchLabels: app.kubernetes.io/name: slurm @@ -26,10 +26,15 @@ spec: - login image: {{ .Values.slurmImage }} name: login + env: + - name: ROCKY_OOD_PASS + value: {{ .Values.openOnDemand.password | quote }} ports: - containerPort: 22 + - containerPort: 80 + - containerPort: 443 volumeMounts: - - mountPath: {{ .Values.nfs.mountPath }} + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - mountPath: /etc/slurm/ name: slurm-config-volume @@ -42,16 +47,27 @@ spec: - name: authorized-keys mountPath: /tmp/authorized_keys subPath: authorized_keys + - name: ood-portal + mountPath: /etc/ood/config/ood_portal.yml + subPath: ood_portal.yml + - name: httpd-config + mountPath: /etc/httpd/conf/httpd.conf + subPath: httpd.conf + - name: cluster-config + mountPath: /etc/ood/config/clusters.d/ood-cluster-config.yml + subPath: ood-cluster-config.yml + - name: host-keys + mountPath: /tempmounts/etc/ssh resources: {} hostname: login dnsConfig: searches: - - slurmd.default.svc.cluster.local + - slurmd.{{ .Release.Namespace }}.svc.cluster.local restartPolicy: Always volumes: - name: 
slurm-jobdir persistentVolumeClaim: - claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} @@ -64,4 +80,20 @@ spec: defaultMode: 0400 - name: authorized-keys configMap: - name: {{ .Values.configmaps.authorizedKeys }} + {{ if .Values.sshPublicKey }} + name: helm-authorized-keys-configmap + {{ else }} + name: authorized-keys-configmap + {{ end }} + - name: cluster-config + configMap: + name: cluster-config + - name: ood-portal + configMap: + name: ood-portal-configmap + - name: httpd-config + configMap: + name: httpd-configmap + - name: host-keys + secret: + secretName: host-keys-secret diff --git a/slurm-cluster-chart/templates/munge-key-secret.yaml b/slurm-cluster-chart/templates/munge-key-secret.yaml new file mode 100644 index 0000000..df97e19 --- /dev/null +++ b/slurm-cluster-chart/templates/munge-key-secret.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +metadata: + name: munge-key-secret + annotations: + helm.sh/hook: pre-install +data: + munge.key: {{ randAscii 128 | b64enc }} diff --git a/slurm-cluster-chart/templates/mysql-deployment.yaml b/slurm-cluster-chart/templates/mysql-deployment.yaml index 8ffd49e..96dc88f 100644 --- a/slurm-cluster-chart/templates/mysql-deployment.yaml +++ b/slurm-cluster-chart/templates/mysql-deployment.yaml @@ -34,7 +34,7 @@ spec: value: "yes" - name: MYSQL_USER value: "slurm" - image: {{ .Values.sqlImage }} + image: {{ .Values.database.image }} name: mysql ports: - containerPort: 3306 diff --git a/slurm-cluster-chart/templates/ood-portal-configmap.yaml b/slurm-cluster-chart/templates/ood-portal-configmap.yaml new file mode 100644 index 0000000..6770d82 --- /dev/null +++ b/slurm-cluster-chart/templates/ood-portal-configmap.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: ood-portal-configmap +data: + ood_portal.yml: | + {{- .Files.Get "files/ood_portal.yaml" | nindent 4 -}} + \ No newline at 
end of file diff --git a/slurm-cluster-chart/templates/pvc.yaml b/slurm-cluster-chart/templates/pvc.yaml new file mode 100644 index 0000000..aab0856 --- /dev/null +++ b/slurm-cluster-chart/templates/pvc.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Values.storage.claimName }} +spec: + storageClassName: {{ .Values.storage.storageClassName }} + accessModes: + - ReadWriteMany + resources: + requests: + storage: {{ .Values.storage.capacity }} \ No newline at end of file diff --git a/slurm-cluster-chart/templates/secret-generator-role.yaml b/slurm-cluster-chart/templates/secret-generator-role.yaml new file mode 100644 index 0000000..da914be --- /dev/null +++ b/slurm-cluster-chart/templates/secret-generator-role.yaml @@ -0,0 +1,28 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: secret-generator-role + annotations: + "helm.sh/hook": pre-install + "helm.sh/hook-weight": "1" +rules: +- apiGroups: [""] # "" indicates the core API group + resources: ["secrets"] + verbs: ["get", "create", "patch"] # "apply" is not an RBAC verb; kubectl apply is authorized via get/create/patch + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: secret-generator-rolebinding + annotations: + "helm.sh/hook": pre-install + "helm.sh/hook-weight": "2" +subjects: + - kind: ServiceAccount + name: secret-generator-account +roleRef: + kind: Role + name: secret-generator-role + apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml b/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml new file mode 100644 index 0000000..ce860b0 --- /dev/null +++ b/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: secret-generator-account + annotations: + "kubernetes.io/enforce-mountable-secrets": "true" + "helm.sh/hook": pre-install + "helm.sh/hook-weight": "0" 
+automountServiceAccountToken: true +secrets: + - name: host-keys-secret diff --git a/slurm-cluster-chart/templates/slurm-autoscaler-service-account.yaml b/slurm-cluster-chart/templates/slurm-autoscaler-service-account.yaml new file mode 100644 index 0000000..8bb98c9 --- /dev/null +++ b/slurm-cluster-chart/templates/slurm-autoscaler-service-account.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: slurm-autoscaler-account +automountServiceAccountToken: true + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: slurm-autoscaler-role +rules: +- apiGroups: [""] # "" indicates the core API group + resources: ["pods"] + verbs: ["get", "create", "patch", "delete", "list", "watch"] # "apply" is not an RBAC verb; kubectl apply is authorized via get/create/patch + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: slurm-autoscaler-rolebinding +subjects: + - kind: ServiceAccount + name: slurm-autoscaler-account +roleRef: + kind: Role + name: slurm-autoscaler-role + apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld.yaml similarity index 74% rename from slurm-cluster-chart/templates/slurmctld-statefulset.yaml rename to slurm-cluster-chart/templates/slurmctld.yaml index dc0bf90..bbf8867 100644 --- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ b/slurm-cluster-chart/templates/slurmctld.yaml @@ -5,7 +5,7 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmctld - name: slurmctld + name: {{ .Values.slurmctld.name }} spec: replicas: 1 selector: @@ -19,17 +19,18 @@ spec: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmctld spec: + serviceAccountName: slurm-autoscaler-account containers: - args: - slurmctld - - -vvv + - -vvvvv image: {{ .Values.slurmImage }} name: slurmctld ports: - containerPort: 6817 resources: {} volumeMounts: - - 
mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - mountPath: /etc/slurm/ name: slurm-config-volume @@ -40,18 +41,22 @@ spec: name: slurmctld-state dnsConfig: searches: - - slurmd.default.svc.cluster.local + - slurmd.{{ .Release.Namespace }}.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir persistentVolumeClaim: - claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurmctld-state persistentVolumeClaim: claimName: var-spool-slurmctld - name: slurm-config-volume - configMap: - name: {{ .Values.configmaps.slurmConf }} + projected: + sources: + - configMap: + name: {{ .Values.configmaps.slurmConf }} + - configMap: + name: slurmd-pod-template - name: munge-key-secret secret: secretName: {{ .Values.secrets.mungeKey }} diff --git a/slurm-cluster-chart/templates/slurmd-deployment.yaml b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml similarity index 52% rename from slurm-cluster-chart/templates/slurmd-deployment.yaml rename to slurm-cluster-chart/templates/slurmd-template-configmap.yaml index 4c2396e..ef31192 100644 --- a/slurm-cluster-chart/templates/slurmd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmd-template-configmap.yaml @@ -1,61 +1,57 @@ -apiVersion: apps/v1 -kind: StatefulSet +apiVersion: v1 +kind: ConfigMap metadata: - creationTimestamp: null - labels: - app.kubernetes.io/name: slurm - app.kubernetes.io/component: slurmd - name: slurmd -spec: - replicas: {{ .Values.replicas.slurmd }} - selector: - matchLabels: - app.kubernetes.io/name: slurm - app.kubernetes.io/component: slurmd - serviceName: slurmd - template: + name: slurmd-pod-template +data: + slurmd-pod-template.yml: | + apiVersion: v1 + kind: Pod metadata: - creationTimestamp: null labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmd + name: SLURMD_NODENAME # Irrelevant for DNS but must be unique among existing pods, so using the slurmd node name is convenient spec: - topologySpreadConstraints: - - 
maxSkew: 1 - whenUnsatisfiable: ScheduleAnyway - topologyKey: kubernetes.io/hostname - labelSelector: - matchLabels: - app.kubernetes.io/name: slurm - app.kubernetes.io/component: slurmd + hostname: SLURMD_NODENAME # required to create DNS records for pod + subdomain: slurmd # has to match name of headless service to create DNS records for pod containers: - args: - slurmd - - -F - - -vvv + - -b + - -vvvvv + - -N + - SLURMD_NODENAME + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name image: {{ .Values.slurmImage }} name: slurmd ports: - containerPort: 6818 + hostPort: 6818 resources: {} volumeMounts: - mountPath: /etc/slurm/ name: slurm-config-volume - - mountPath: {{ .Values.nfs.mountPath }} + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - mountPath: /tmp/munge.key name: munge-key-secret subPath: munge.key securityContext: privileged: true + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet dnsConfig: searches: - - slurmd.default.svc.cluster.local + - slurmd.{{ .Release.Namespace }}.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir persistentVolumeClaim: - claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} diff --git a/slurm-cluster-chart/templates/undrain-nodes-hook.yaml b/slurm-cluster-chart/templates/undrain-nodes-hook.yaml new file mode 100644 index 0000000..3c0f189 --- /dev/null +++ b/slurm-cluster-chart/templates/undrain-nodes-hook.yaml @@ -0,0 +1,34 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: undrain-nodes-hook + annotations: + "helm.sh/hook": post-upgrade + "helm.sh/hook-delete-policy": hook-succeeded +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 0 + template: + metadata: + name: undrain-nodes-hook + spec: + restartPolicy: Never + containers: + - name: undrain-nodes-hook + image: {{ .Values.slurmImage }} + args: + - undrain-nodes-hook + volumeMounts: + - 
mountPath: /tmp/munge.key + name: munge-key-secret + subPath: munge.key + - mountPath: /etc/slurm/ + name: slurm-config-volume + volumes: + - name: munge-key-secret + secret: + secretName: {{ .Values.secrets.mungeKey }} + defaultMode: 0400 + - name: slurm-config-volume + configMap: + name: {{ .Values.configmaps.slurmConf }} diff --git a/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml b/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml index 841bb0f..a5f4503 100644 --- a/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml +++ b/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml @@ -11,4 +11,4 @@ spec: - ReadWriteOnce resources: requests: - storage: {{ .Values.databaseStorage }} + storage: {{ .Values.database.storage }} diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 7873e5c..a88f282 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,24 +1,79 @@ -slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7f4d64e +slurmImage: ghcr.io/stackhpc/slurm-k8s-cluster:0602876 -replicas: - slurmd: 2 - login: 1 +login: + # Deployment resource name + name: login + replicas: 1 -nfs: +slurmd: + # Slurmd resource name. NOTE(review): the slurmd StatefulSet template is replaced by the slurmd-pod-template ConfigMap in this change - confirm where name/replicas are still consumed + name: slurmd # NB this must match NodeName= in slurm-cluster-chart/files/slurm.conf + replicas: 2 + +slurmctld: + # StatefulSet resource name + name: slurmctld + # NOTE: We don't include a replicas field here because + # replicas > 1 for slurmctld needs extra Slurm config + +storage: mountPath: /home - claimName: rook-nfs-pv-claim + # The name of a Read-Write-Many StorageClass to use for + # the persistent volume which is shared across Slurm nodes + # Note: If using the default value then you must set + # rooknfs.enabled = true below to ensure that Rook NFS is + # installed on the cluster as a dependency of this Slurm + # chart. 
If you are using a separate RWM StorageClass, then + # set rooknfs.enabled = false + storageClassName: slurm-rook-nfs + # Name for the R-W-M volume to provision + claimName: slurm-shared-storage + # Capacity of the R-W-M volume + capacity: &capacity 10Gi # NB yaml anchor used so this value is also set for `rooknfs.storageCapacity` if necessary. + + +# Values to be passed to the rook-nfs sub-chart +# See rook-nfs sub-chart for full set of available config values +rooknfs: + enabled: true + # Name given to the RWM StorageClass created by Rook + # NB this must match storage.storageClassName when using Rook + storageClassName: slurm-rook-nfs + # Name for the NFSServer resource created by Rook + serverName: rook-nfs + # Capacity for the backing Read-Write-*Once* volume + # that Rook will create to provide the actual storage to + # the NFS server. Since we're using the Rook NFS in a + # slightly unconventional way here, we just want to anchor + # this value to the requested storage capacity for the RWM + # volume specified in storage.capacity + storageCapacity: *capacity + # Storage class to use for the Read-Write-Once backing PVC + # backingStorageClass: -sqlImage: mariadb:10.10 -databaseStorage: 100Mi +# Values for Slurm's database container +database: + #Database image to be used + image: mariadb:10.10 + #Storage requested by the var-lib-mysql volume backing the database + storage: 100Mi + +# Configmap resource names configmaps: - authorizedKeys: authorized-keys-configmap slurmConf: slurm-conf-configmap slurmdbdConf: slurmdbd-conf-configmap sshdConfig: sshd-config-configmap +# Public key used for ssh access to the login node +# If left undefined, assumes you have run the provided publish-keys.sh script to publish your public key prior to deployment +sshPublicKey: + +# Secret resource names secrets: - databaseAuth: database-auth-secret mungeKey: munge-key-secret - \ No newline at end of file + +openOnDemand: + #Password for default Open OnDemand user 'rocky' (insecure placeholder default - override it, the login service is exposed via a LoadBalancer) + password: 
password