diff --git a/.github/workflows/workflows.yml b/.github/workflows/workflows.yml new file mode 100644 index 0000000..0a769a1 --- /dev/null +++ b/.github/workflows/workflows.yml @@ -0,0 +1,60 @@ +name: Release Artifacts + +on: + release: + types: [created] + +jobs: + release-controller-image: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Check out the repo + uses: actions/checkout@v4 + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push Docker image + uses: docker/build-push-action@v3 + with: + push: true + tags: ghcr.io/mathworks-ref-arch/matlab-parallel-server-k8s/mjs-controller-image:${{ github.event.release.tag_name }} + context: ./controller + file: ./controller/Dockerfile + + release-helm-chart: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Check out the repo + uses: actions/checkout@v3 + + - name: Install Helm + uses: azure/setup-helm@v4 + + - name: Lint the chart + run: helm lint mjs --set maxWorkers=4,matlabPVC=test,checkpointPVC=test,logPVC=test,workerLogPVC=test + + - name: Check chart versions + run: grep "version. ${{ github.event.release.tag_name }}" mjs/Chart.yaml && grep "appVersion. ${{ github.event.release.tag_name }}" mjs/Chart.yaml # Use "." (any character) rather than ":", since ":" breaks YAML parser + + - name: Package the chart + run: echo ${{ github.event.release.tag_name }} && helm package mjs --version ${{ github.event.release.tag_name }} --app-version ${{ github.event.release.tag_name }} + + - name: Login to GitHub Container Registry + run: echo ${{ secrets.HELM_TOKEN }} | helm registry login ghcr.io/hannahpullen --username hannahpullen --password-stdin + + - name: Deploy the chart + run: helm push mjs-${GITHUB_REF#refs/tags/}.tgz oci://ghcr.io/mathworks-ref-arch/matlab-parallel-server-k8s + diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..440f91a --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,17 @@ +MATHWORKS CLOUD REFERENCE ARCHITECTURE LICENSE + +The files in this GitHub repository refer to commercial software products and services, virtual machine images, and related materials of The MathWorks, Inc. (“MathWorks Programs”). MathWorks Programs are separately licensed under the MathWorks Software License Agreement, available in the desktop installation of the MathWorks Programs or in the virtual machine image. The files in this GitHub repository may also refer to third-party software licensed under separate terms provided by such third parties. + +The following license terms apply only to the files in this GitHub repository, including files in this folder and its subfolders, and do not apply to MathWorks Programs. References to “software” and “code” in the following license terms refer to the files in this GitHub repository. + +Copyright (c) 2024, The MathWorks, Inc. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. +3. 
In all cases, the software is, and all modifications and derivatives of the software shall be, licensed to you solely for use in conjunction with MathWorks products and service offerings. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..b94d427 --- /dev/null +++ b/README.md @@ -0,0 +1,393 @@ +# MATLAB Parallel Server in Kubernetes + +This repository contains utilities for using MATLAB® Parallel Server in a Kubernetes® cluster. + +## Introduction + +This guide explains how to deploy MATLAB Job Scheduler onto your Kubernetes cluster. +You can then connect to the MATLAB Job Scheduler and use it to run MATLAB Parallel Server jobs on the Kubernetes cluster. + +For more information on MATLAB Job Scheduler and MATLAB Parallel Server, see the MathWorks documentation on [MATLAB Parallel Server](https://www.mathworks.com/help/matlab-parallel-server/index.html). + +## Requirements + +To use MATLAB Job Scheduler in Kubernetes, you must have MATLAB R2024a or later. + +Before you start, you need the following: +- A running Kubernetes cluster that meets the following conditions: + - Uses Kubernetes version 1.21.1 or later. + - Meets the system requirements for running MATLAB Job Scheduler. For details, see the MathWorks documentation for [MATLAB Parallel Server Product Requirements](https://www.mathworks.com/support/requirements/matlab-parallel-server.html). + - Configured to create external load balancers that allow traffic into the cluster. +- Docker® installed on your computer. For help with installing Docker, see [Get Docker](https://docs.docker.com/get-docker/). +- Kubectl installed on your computer and configured to access your Kubernetes cluster. For help with installing Kubectl, see [Install Tools](https://kubernetes.io/docs/tasks/tools/) on the Kubernetes website. +- Helm® version 3.8.0 or later installed on your computer. For help with installing Helm, see [Quickstart Guide](https://helm.sh/docs/intro/quickstart/). +- Network access to the MathWorks Container Registry, `containers.mathworks.com`, and the GitHub® Container registry, `ghcr.io`. +- A MATLAB Parallel Server license. For more information on licensing, see [Determining License Size for MATLAB Parallel Server](https://www.mathworks.com/products/matlab-parallel-server/license-model.html) on the MathWorks website. + +If you do not have a license, submit a request on the MathWorks [Contact Sales](https://www.mathworks.com/company/aboutus/contact_us/contact_sales.html) page. + +## Deployment Steps + +### Create Namespace for MATLAB Job Scheduler + +Kubernetes uses namespaces to separate groups of resources. +To learn more about namespaces, see the Kubernetes documentation for [Namespaces](https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/). 
+To isolate the MATLAB Job Scheduler from other resources on the cluster, you must deploy MATLAB Job Scheduler inside a namespace on your cluster. + +For example, to create a custom namespace with the name `mjs`, run this command: +``` +kubectl create namespace mjs +``` + +The commands in this guide assume that you are using a namespace called `mjs`. +Substitute `mjs` with your namespace when using these commands. + +### Create Persistent Volumes + +MATLAB Job Scheduler uses *PersistentVolumes* to retain data beyond the lifetime of the Kubernetes pods. +Create these volumes using your preferred storage medium. +For instructions, see the [Kubernetes PersistentVolume documentation](https://kubernetes.io/docs/concepts/storage/persistent-volumes/). + +The software requires three PersistentVolumes to retain job data and logs. +You can also use a PersistentVolume to mount your own MATLAB Parallel Server installation onto the MATLAB Job Scheduler pods. +If you do not create a PersistentVolume containing a MATLAB Parallel Server installation, you must use a Docker image that has MATLAB Parallel Server installed. + +Create a PersistentVolume for each of the following applications: +- An empty PersistentVolume with access mode `ReadWriteOnce` for MATLAB Job Scheduler's checkpoint folder, which retains job data after exiting the session +- An empty PersistentVolume with access mode `ReadWriteOnce` to retain logs from the MATLAB Job Scheduler job manager +- An empty PersistentVolume with access mode `ReadWriteMany` to retain logs from the MATLAB Job Scheduler workers +- A PersistentVolume with access mode `ReadOnlyMany` containing a MATLAB Parallel Server installation + +Now create a *PersistentVolumeClaim* for each PersistentVolume. +You can create a PersistentVolumeClaim by using the following example configuration file. +Replace `<namespace>` with the namespace of the MATLAB Job Scheduler, `<name>` with the PersistentVolumeClaim name, and `<size>` with the amount of storage you want to provision for your PersistentVolumeClaim. +For information about the units you can use for storage capacity, see [Resource Management for Pods and Containers](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/) on the Kubernetes website. +To use a PersistentVolume, replace `<pv-name>` with the name of the PersistentVolume and `<storage-class-name>` with `""`. +To use a *StorageClass* for dynamic provisioning, replace `<pv-name>` with `""` and `<storage-class-name>` with the name of the StorageClass. +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: +  name: <name> +  namespace: <namespace> +spec: +  volumeName: <pv-name> +  storageClassName: <storage-class-name> +  accessModes: +    - ReadWriteMany +  resources: +    requests: +      storage: <size> +``` + +### Build MATLAB Parallel Server Docker Image (Optional) + +The MATLAB Job Scheduler pods require a MATLAB Parallel Server installation. +By default, you mount this from a PersistentVolume, as described in the previous step. +If you do not have a MATLAB Parallel Server installation to mount, you can build a Docker image containing a MATLAB Parallel Server installation instead. + +Build a Docker image that contains a MATLAB Parallel Server installation. +- Specify `<release>` as a MATLAB release number with a lowercase `r`. For example, to install MATLAB R2024a, specify `<release>` as `r2024a`. The MATLAB release must be version R2024a or later. +- Specify `<product-list>` as a space-separated list of MATLAB toolboxes you want to install. The toolbox names must match the product names listed on the MathWorks® product page with any spaces replaced by underscores. 
For example, to install Parallel Computing Toolbox and Deep Learning Toolbox, specify `<product-list>` as `Parallel_Computing_Toolbox Deep_Learning_Toolbox`. For a complete list of product names, see [MathWorks Products](https://www.mathworks.com/products.html). +- Specify `<image-tag>` as the Docker tag to use for the image. +``` +docker build https://raw.githubusercontent.com/mathworks-ref-arch/matlab-dockerfile/main/Dockerfile --build-arg MATLAB_INSTALL_LOCATION=/opt/matlab --build-arg MATLAB_RELEASE=<release> --build-arg MATLAB_PRODUCT_LIST="MATLAB MATLAB_Parallel_Server <product-list>" -t <image-tag> +``` + +Push the image to a repository that is visible to your Kubernetes cluster. + +For more information on building a MATLAB Docker image, see [Create a MATLAB Container Image](https://github.com/mathworks-ref-arch/matlab-dockerfile) in the GitHub repository. + + +### Create Administrator Password Secret + +By default, MATLAB Job Scheduler in Kubernetes runs at security level 2. +At security level 2, jobs and tasks are associated with the submitting user and are password protected. +For details about security levels, see [MATLAB Job Scheduler Security](https://www.mathworks.com/help/matlab-parallel-server/set-matlab-job-scheduler-cluster-security.html) in the MathWorks Help Center. + +When you run MATLAB Job Scheduler with security level 2, you must provide an administrator password. +Create a Kubernetes Secret for your administrator password named `mjs-admin-password` and replace `<password>` with a password of your choice. +``` +kubectl create secret generic mjs-admin-password --from-literal=password=<password> --namespace mjs +``` + +To keep your Kubernetes Secrets secure, enable encryption at rest and restrict access to your namespace using role-based access control. +For more information, see the Kubernetes documentation for [Secrets](https://kubernetes.io/docs/concepts/configuration/secret/). + +### Create Helm Values File + +Create a YAML file containing configuration parameters and values for MATLAB Job Scheduler in Kubernetes. +Copy the following lines into a YAML file, `values.yaml`, and modify the values for your cluster configuration. +```yaml +matlabRelease: r2024a +maxWorkers: 100 +matlabPVC: "matlab-pvc" +checkpointPVC: "checkpoint-pvc" +logPVC: "log-pvc" +workerLogPVC: "worker-log-pvc" +jobManagerUserID: 0 +jobManagerGroupID: 0 +matlabImage: "" +``` +Modify the following values: +- `matlabRelease` — Specify the release number of the MATLAB Parallel Server installation. +- `maxWorkers` — Specify the maximum number of MATLAB Parallel Server workers to run in the cluster. The cluster starts with zero workers and automatically scales up to this number as the cluster becomes busy. +- `matlabPVC` — Specify the name of a PersistentVolumeClaim that is bound to the PersistentVolume with a MATLAB Parallel Server installation. +- `checkpointPVC` — Specify the name of a PersistentVolumeClaim that is bound to a PersistentVolume used to retain job data. +- `logPVC` — Specify the name of a PersistentVolumeClaim that is bound to a PersistentVolume used to retain job manager logs. +- `workerLogPVC` — Specify the name of a PersistentVolumeClaim that is bound to a PersistentVolume used to retain worker logs. +- `jobManagerUserID` — Specify the user ID of the user account that MATLAB Job Scheduler should use to run the job manager pod. The user must have write permission for the checkpoint and log PersistentVolumes. To find the user ID, on a Linux machine, run `id -u`. 
+- `jobManagerGroupID` — Specify the group ID of the user account that MATLAB Job Scheduler should use to run the job manager pod. The user must have write permission for the checkpoint and log PersistentVolumes. To find the group ID, on a Linux machine, run `id -g`. +- `matlabImage` — Specify the URI of a Docker image that contains a MATLAB Parallel Server installation. Specify a URI only if you built a Docker image instead of mounting a MATLAB Parallel Server installation from a PersistentVolume. If you specify this parameter, set the `matlabPVC` parameter to an empty string (`""`). + +For a full list of the configurable Helm values that you can set in this file, see the [Helm Values](helm_values.md) page. + +### Install Helm Chart + +Install the MATLAB Job Scheduler Helm chart with your custom values file: +``` +helm install mjs oci://ghcr.io/mathworks-ref-arch/matlab-parallel-server-k8s/mjs --values values.yaml --namespace mjs +``` + +Check the status of the MATLAB Job Scheduler pods: +``` +kubectl get pods --namespace mjs +``` +When all pods display `1/1` in the `READY` field, MATLAB Job Scheduler is ready to use. +The output of the `kubectl get pods` command looks something like this when MATLAB Job Scheduler is ready: +``` +NAME READY STATUS RESTARTS AGE +mjs-controller-7884c9d95d-5wq2g 1/1 Running 0 25s +mjs-job-manager-5576468456-q5klv 1/1 Running 0 22s +mjs-ingress-proxy-56787694fd-ssbd4 1/1 Running 0 25s +``` + +The Helm chart automatically creates a Kubernetes load balancer service for you. +Check the status of the service: +``` +kubectl get services -l app=mjs-ingress-proxy --namespace mjs +``` +The output of the `kubectl get services` command looks something like this when the load balancer service is ready: +``` +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT +mjs-ingress-proxy-ed5e5db8 LoadBalancer 10.233.12.53 192.168.1.200 27356:31387/TCP,27359:31664/TCP,30000:32212/TCP +``` + +Configure your firewall so that MATLAB clients can route to the IP address or hostname under the `EXTERNAL-IP` column through the ports this service exposes. +For a description of the ports the load balancer service exposes, see the [Customize Load Balancer](#customize-load-balancer) section. + +If you want the MATLAB client to route to this load balancer through a different hostname, for example, an intermediate server or a DNS entry, set the value of the `clusterHost` parameter in your Helm values file before you install MATLAB Job Scheduler on your Kubernetes cluster. + +## Download Cluster Profile + +The cluster profile is a JSON-format file that allows a MATLAB client to connect to your MATLAB Job Scheduler cluster. + +Download the cluster profile to a `profile.json` file: +``` +kubectl get secrets mjs-cluster-profile --template="{{.data.profile | base64decode}}" --namespace mjs > profile.json +``` + +Share the cluster profile with MATLAB users that want to connect to the cluster. + +By default, connections between MATLAB clients and MATLAB Job Scheduler in Kubernetes are verified using mutual TLS (mTLS). +The MATLAB client must have a cluster profile with the correct certificate to connect to the cluster. +You must store the cluster profile securely and distribute the cluster profile to trusted users through a secure channel. + +## Connect to MATLAB Job Scheduler in Kubernetes + +To connect to MATLAB Job Scheduler and run MATLAB Parallel Server jobs, open MATLAB using the same version you used for MATLAB Job Scheduler. + +Import the cluster profile. +1. 
On your MATLAB desktop, select **Parallel > Create and Manage Clusters**. +2. Click **Import** in the toolbar. +3. Navigate to the location where you saved the profile you created in the previous step and select it. + +### Validate Cluster + +Cluster validation submits a job of each type to test whether the cluster profile is configured correctly. +In the Cluster Profile Manager, click **Validate**. +If you make a change to the cluster configuration, run cluster validation again to ensure your changes cause no errors. +You do not need to validate the profile each time you use it or each time you start MATLAB. + +## Uninstall MATLAB Job Scheduler + +To uninstall MATLAB Job Scheduler from your Kubernetes cluster, run this command: +``` +helm uninstall mjs --namespace mjs +``` + +Delete the administrator password secret: +``` +kubectl delete secrets mjs-admin-password --namespace mjs +``` + +If you created a custom load balancer service, delete the service: +``` +kubectl delete service mjs-ingress-proxy --namespace mjs +``` + +## Examples + +Create a cluster object using your cluster profile `<profile-name>`: +```matlab +c = parcluster("<profile-name>") +``` + +### Submit Work for Batch Processing + +The `batch` command runs a MATLAB script or function on a worker on the cluster. +For more information about batch processing, see the MathWorks documentation for [`batch`](https://www.mathworks.com/help/parallel-computing/batch.html). + +```matlab +% Create a job and submit it to the cluster +job = batch( ... +    c, ... % Cluster object created using parcluster +    @sqrt, ... % Function or script to run +    1, ... % Number of output arguments +    {[64 100]}); % Input arguments + +% Your MATLAB session is now available to do other work. You can +% continue to create and submit more jobs to the cluster. You can also +% shut down your MATLAB session and come back later. The work +% continues to run on the cluster. After you recreate +% the cluster object using the parcluster function, you can view existing +% jobs using the Jobs property of the cluster object. + +% Wait for the job to complete. If the job is already complete, +% MATLAB does not block the Command Window and this command +% returns the prompt (>>) immediately. +wait(job); + +% Retrieve the output arguments for each task. For this example, +% the output is a 1-by-1 cell array containing the vector [8 10]. +results = fetchOutputs(job) +``` + +### Submit Work for Batch Processing with a Parallel Pool + +You can use the `batch` command to create a parallel pool by using the `'Pool'` name-value argument. + +```matlab +% Create and submit a batch pool job to the cluster +job = batch( ... +    c, ... % Cluster object created using parcluster +    @sqrt, ... % Function/script to run +    1, ... % Number of output arguments +    {[64 100]}, ... % Input arguments +    'Pool', 3); % Use a parallel pool with three workers +``` + +### Open an Interactive Parallel Pool + +A parallel pool is a group of MATLAB workers on which you can interactively run work. +When you run the `parpool` command, MATLAB submits a special job to the cluster to start the workers. +Once the workers start, your MATLAB session connects to them. +For more information about parallel pools, see the MathWorks documentation for [`parpool`](https://www.mathworks.com/help/parallel-computing/parpool.html). + +```matlab +% Open a parallel pool on the cluster. This command +% returns the prompt (>>) when the pool is ready. +pool = parpool(c); + +% List the hosts on which the workers are running. 
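+% parfevalOnAll runs the specified function once on every worker in the pool +% and returns a single future; fetchOutputs then gathers one result per worker.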
+future = parfevalOnAll(pool, @getenv, 1, 'HOSTNAME') +wait(future); +fetchOutputs(future) + +% Output the numbers 1 to 10 in a parallel for-loop. +% Unlike a regular for-loop, the software does not +% execute iterations of the loop in order. +parfor idx = 1:10 + disp(idx) +end + +% Use the pool to calculate the first 500 magic squares. +parfor idx = 1:500 + magicSquare{idx} = magic(idx); +end +``` + +## Advanced Setup Steps + +### Customize Load Balancer + +MATLAB Job Scheduler in Kubernetes uses a Kubernetes load balancer service to expose MATLAB Job Scheduler to MATLAB clients running outside of the Kubernetes cluster. +By default, the Helm chart creates the load balancer for you. +You can also create and customize your own load balancer service before you install the Helm chart. + +Create a Kubernetes load balancer service `mjs-ingress-proxy` to expose MATLAB Job Scheduler to MATLAB clients running outside of the Kubernetes cluster. +This service needs to open the following ports: +- `basePort + 6` and `basePort + 9`, where `basePort` is the MATLAB Job Scheduler base port (default 27350). The MATLAB client connects to the MATLAB Job Scheduler job manager through these ports. +- All ports in range `poolProxyBasePort` to `poolProxyBasePort + maxNumPoolProxies - 1`, where `poolProxyBasePort` is the pool proxy base port (default 30000). Calculate `maxNumPoolProxies` by dividing the maximum number of workers in your cluster by the number of workers per pool proxy (default 32) and rounding up to the nearest integer. The MATLAB client connects to workers in interactive parallel pools through these ports. + +For example, for a MATLAB Job Scheduler cluster with the default base port (27350), default pool proxy base port (30000) and a maximum size of 64 workers, the maximum number of pool proxies is 2. +To create a load balancer for a cluster with this port configuration, create a YAML file, `load-balancer.yaml`, and copy the following lines. + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: mjs-ingress-proxy +spec: + type: LoadBalancer + selector: + app: mjs-ingress-proxy + ports: + - name: job-manager-27356 + port: 27356 + targetPort: 27356 + protocol: TCP + - name: job-manager-27359 + port: 27359 + targetPort: 27359 + protocol: TCP + - name: pool-proxy-30000 + port: 30000 + targetPort: 30000 + protocol: TCP + - name: pool-proxy-30001 + port: 30001 + targetPort: 30001 + protocol: TCP +``` + + +Modify the file to add annotations if needed. +Create the load balancer. +``` +kubectl apply -f load-balancer.yaml --namespace mjs +``` + +Check the status of the load balancer. +``` +kubectl get services -n mjs mjs-ingress-proxy +``` + +The output from the `kubectl get services` command looks something like this: + +``` +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) +mjs-ingress-proxy LoadBalancer 10.233.55.51 192.168.1.200 27356:31186/TCP,27359:30272/TCP,30000:30576/TCP,30001:32290/TCP +``` + +You must ensure that the output of the `kubectl get services` command displays an IP address or hostname under the `EXTERNAL-IP` column before you continue. +If you do not see an external IP address, wait for some time, then run the same command again. + +If you still do not see an external IP address, make sure your Kubernetes cluster is configured to create external load balancers. + +If your Kubernetes cluster runs in the cloud, edit the security settings of the load balancer to apply the security rules you need. 
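+ +For example, one way to restrict which clients can reach the load balancer is the standard Kubernetes `loadBalancerSourceRanges` field. The following is a minimal sketch rather than part of the MATLAB Job Scheduler configuration: the IP range shown is illustrative, and support for source ranges and provider-specific annotations depends on your cloud provider. +```yaml +# Illustrative additions to the spec section of load-balancer.yaml +spec: +  type: LoadBalancer +  loadBalancerSourceRanges: +    - 203.0.113.0/24   # Replace with the address ranges of your MATLAB clients +```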
+ +## License + +The license for the software in this repository is available in the [LICENSE.md](LICENSE.md) file. + +## Community Support + +[MATLAB Central](https://www.mathworks.com/matlabcentral) + +## Technical Support +To request assistance or additional features, contact [MathWorks Technical Support](https://www.mathworks.com/support/contact_us.html). + +--- + +Copyright 2024 The MathWorks, Inc. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..3a1c31e --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,6 @@ +# Reporting Security Vulnerabilities + +If you believe you have discovered a security vulnerability, please report it to +[security@mathworks.com](mailto:security@mathworks.com). Please see +[MathWorks Vulnerability Disclosure Policy for Security Researchers](https://www.mathworks.com/company/aboutus/policies_statements/vulnerability-disclosure-policy.html) +for additional information. diff --git a/controller/Dockerfile b/controller/Dockerfile new file mode 100644 index 0000000..81e3fe8 --- /dev/null +++ b/controller/Dockerfile @@ -0,0 +1,16 @@ +# Copyright 2024 The MathWorks, Inc. + +# Stage 1: Build the controller executable +FROM golang:1.21.6 as builder +WORKDIR /app +COPY src/ /app +RUN go version +RUN go mod tidy +RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o controller /app/cmd/main.go + +# Stage 2: Build the controller image +FROM scratch +LABEL maintainer="The MathWorks" +COPY --from=builder /app/controller /controller + +ENTRYPOINT ["./controller"] diff --git a/controller/src/cmd/main.go b/controller/src/cmd/main.go new file mode 100644 index 0000000..83c66db --- /dev/null +++ b/controller/src/cmd/main.go @@ -0,0 +1,63 @@ +// Package main runs the MJS in Kubernetes controller +// Copyright 2024 The MathWorks, Inc. 
+package main + +import ( + "controller/internal/config" + "controller/internal/controller" + "controller/internal/logging" + "errors" + "flag" + "fmt" + "os" + "os/signal" + "syscall" + + "go.uber.org/zap" +) + +func main() { + config, err := loadConfig() + if err != nil { + fmt.Println(err) + os.Exit(1) + } + + // Create logger + logger, loggerErr := logging.NewLogger(config.ControllerLogfile, config.LogLevel) + if loggerErr != nil { + fmt.Printf("Error creating logger: %v\n", loggerErr) + os.Exit(1) + } + defer logger.Close() + + // Catch SIGTERMs in a channel + cancelChan := make(chan os.Signal, 1) + signal.Notify(cancelChan, syscall.SIGTERM, syscall.SIGINT) + + // Run controller + logger.Info("Starting MJS controller") + scaler, err := controller.NewController(config, logger) + if err != nil { + fmt.Println(err) + logger.Error("Error creating controller", zap.Any("error", err)) + os.Exit(1) + } + go scaler.Run() + + // Block until a cancellation is received + sig := <-cancelChan + logger.Info("Caught signal; shutting down", zap.Any("sig", sig)) + scaler.Stop() +} + +// loadConfig reads the path to a config file from the command line arguments and reads in the config file +func loadConfig() (*config.Config, error) { + var configFile string + flag.StringVar(&configFile, "config", "", "Path to config file") + flag.Parse() + if configFile == "" { + return nil, errors.New("must provide path to config file") + } + return config.LoadConfig(configFile) +} diff --git a/controller/src/go.mod b/controller/src/go.mod new file mode 100644 index 0000000..2138a94 --- /dev/null +++ b/controller/src/go.mod @@ -0,0 +1,62 @@ +module controller + +go 1.21.6 + +require ( + github.com/google/uuid v1.3.0 + github.com/mathworks/mjssetup v1.0.0 + github.com/stretchr/testify v1.8.4 + k8s.io/api v0.29.2 + k8s.io/apimachinery v0.29.2 + k8s.io/client-go v0.29.2 +) + +require ( + github.com/emicklei/go-restful/v3 v3.11.0 // indirect + github.com/evanphx/json-patch v4.12.0+incompatible // indirect + github.com/go-logr/logr v1.3.0 // indirect + github.com/go-openapi/jsonpointer v0.19.6 // indirect + github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/swag v0.22.3 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/protobuf v1.5.3 // indirect + github.com/google/gnostic-models v0.6.8 // indirect + github.com/google/gofuzz v1.2.0 // indirect + github.com/gorilla/websocket v1.5.0 // indirect + github.com/imdario/mergo v0.3.6 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/mailru/easyjson v0.7.7 // indirect + github.com/moby/spdystream v0.2.0 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/spf13/pflag v1.0.5 // indirect + go.uber.org/multierr v1.10.0 // indirect + golang.org/x/net v0.19.0 // indirect + golang.org/x/oauth2 v0.10.0 // indirect + golang.org/x/sys v0.15.0 // indirect + golang.org/x/term v0.15.0 // indirect + golang.org/x/text v0.14.0 // indirect + golang.org/x/time v0.3.0 // indirect + google.golang.org/appengine v1.6.7 // indirect + google.golang.org/protobuf v1.31.0 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect + k8s.io/klog/v2 v2.110.1 
// indirect + k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect + k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect + sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect + sigs.k8s.io/yaml v1.3.0 // indirect +) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/stretchr/objx v0.5.0 // indirect + go.uber.org/zap v1.26.0 + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/controller/src/go.sum b/controller/src/go.sum new file mode 100644 index 0000000..8bb3e09 --- /dev/null +++ b/controller/src/go.sum @@ -0,0 +1,178 @@ +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= +github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch v4.12.0+incompatible h1:4onqiflcdA9EOZ4RxV643DvftH5pOlLGNtQ5lPWQu84= +github.com/evanphx/json-patch v4.12.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/go-logr/logr v1.3.0 h1:2y3SDp0ZXuc6/cjLSZ+Q3ir+QB9T/iG5yYRXqsagWSY= +github.com/go-logr/logr v1.3.0/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= +github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= +github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= +github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= +github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g= +github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= +github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= +github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.6.0 
h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= +github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/imdario/mergo v0.3.6 h1:xTNEAn+kxVO7dTZGu0CegyqKZmoWFI0rF8UxjlB2d28= +github.com/imdario/mergo v0.3.6/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/mathworks/mjssetup v1.0.0 h1:qAwsD4C91ff2PB3NYfqNAuR37xDebYsotbuh54KjNY0= +github.com/mathworks/mjssetup v1.0.0/go.mod h1:/93fwwTcDzRdSPChpbQgrnS6yjwwL5Sb9FqIgk/b1iA= +github.com/moby/spdystream v0.2.0 h1:cjW1zVyyoiM0T7b6UoySUFqzXMoqRckQtXwGPiBhOM8= +github.com/moby/spdystream v0.2.0/go.mod h1:f7i0iNDQJ059oMTcWxx8MA/zKFIuD/lY+0GqbN2Wy8c= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg 
v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= +github.com/onsi/ginkgo/v2 v2.13.0 h1:0jY9lJquiL8fcf3M4LAXN5aMlS/b2BV86HFFPCPMgE4= +github.com/onsi/ginkgo/v2 v2.13.0/go.mod h1:TE309ZR8s5FsKKpuB1YAQYBzCaAfUgatB/xlT/ETL/o= +github.com/onsi/gomega v1.29.0 h1:KIA/t2t5UBzoirT4H9tsML45GEbo3ouUnBHsCfD2tVg= +github.com/onsi/gomega v1.29.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.uber.org/goleak v1.2.0 h1:xqgm/S+aQvhWFTtR0XK3Jvg7z8kGV8P4X14IzwN3Eqk= +go.uber.org/goleak v1.2.0/go.mod h1:XJYK+MuIchqpmGmUSAzotztawfKvYLUIgg7guXrwVUo= +go.uber.org/multierr v1.10.0 h1:S0h4aNzvfcFsC3dRF1jLoaov7oRaKqRGC/pUEJ2yvPQ= +go.uber.org/multierr v1.10.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo= +go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod 
h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c= +golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= +golang.org/x/oauth2 v0.10.0 h1:zHCpF2Khkwy4mMB4bv0U37YtJdTGW8jI0glAApi0Kh8= +golang.org/x/oauth2 v0.10.0/go.mod h1:kTpgurOux7LqtuxjuyZa4Gj2gdezIt/jQtGnNFfypQI= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= +golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.15.0 h1:y/Oo/a/q3IXu26lQgl04j/gjuBDOBlx7X6Om1j2CPW4= +golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= +golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.16.1 h1:TLyB3WofjdOEepBHAU20JdNC1Zbg87elYofWYAY5oZA= +golang.org/x/tools v0.16.1/go.mod h1:kYVVN6I1mBNoB1OX+noeBjbRk4IUEPa7JJ+TJMEooJ0= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= +google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf 
v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= +google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/api v0.29.2 h1:hBC7B9+MU+ptchxEqTNW2DkUosJpp1P+Wn6YncZ474A= +k8s.io/api v0.29.2/go.mod h1:sdIaaKuU7P44aoyyLlikSLayT6Vb7bvJNCX105xZXY0= +k8s.io/apimachinery v0.29.2 h1:EWGpfJ856oj11C52NRCHuU7rFDwxev48z+6DSlGNsV8= +k8s.io/apimachinery v0.29.2/go.mod h1:6HVkd1FwxIagpYrHSwJlQqZI3G9LfYWRPAkUvLnXTKU= +k8s.io/client-go v0.29.2 h1:FEg85el1TeZp+/vYJM7hkDlSTFZ+c5nnK44DJ4FyoRg= +k8s.io/client-go v0.29.2/go.mod h1:knlvFZE58VpqbQpJNbCbctTVXcd35mMyAAwBdpt4jrA= +k8s.io/klog/v2 v2.110.1 h1:U/Af64HJf7FcwMcXyKm2RPM22WZzyR7OSpYj5tg3cL0= +k8s.io/klog/v2 v2.110.1/go.mod h1:YGtd1984u+GgbuZ7e08/yBuAfKLSO0+uR1Fhi6ExXjo= +k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 h1:aVUu9fTY98ivBPKR9Y5w/AuzbMm96cd3YHRTU83I780= +k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00/go.mod h1:AsvuZPBlUDVuCdzJ87iajxtXuR9oktsTctW/R9wwouA= +k8s.io/utils v0.0.0-20230726121419-3b25d923346b h1:sgn3ZU783SCgtaSJjpcVVlRqd6GSnlTLKgpAAttJvpI= +k8s.io/utils v0.0.0-20230726121419-3b25d923346b/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= +sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= +sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= +sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= +sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= +sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= diff --git a/controller/src/internal/config/config.go b/controller/src/internal/config/config.go new file mode 100644 index 0000000..9e207fc --- /dev/null +++ b/controller/src/internal/config/config.go @@ -0,0 +1,110 @@ +// Package config defines configurable controller settings and enables them to be loaded from a JSON file +// Copyright 2024 The MathWorks, Inc. 
+package config + +import ( + "encoding/json" + "fmt" + "io" + "os" +) + +// Config contains configurable controller settings +type Config struct { + ControllerLogfile string + BasePort int + CertFileName string + CheckpointBase string + CheckpointPVC string + ClusterHost string + DeploymentName string + EnableServiceLinks bool + JobManagerUID string + IdleStop int + InternalClientsOnly bool + JobManagerName string + JobManagerCPULimit string + JobManagerCPURequest string + JobManagerMemoryLimit string + JobManagerMemoryRequest string + JobManagerGroupID int64 + JobManagerUserID int64 + KubeConfig string + LivenessProbeFailureThreshold int32 + LivenessProbePeriod int32 + LivenessProbeTimeout int32 + LoadBalancerName string + LocalDebugMode bool + LogBase string + LogLevel int + LogPVC string + MatlabImagePullPolicy string + MatlabImage string + MatlabRoot string + MatlabPVC string + MaxWorkers int + MinWorkers int + MJSDefConfigMap string + MJSDefDir string + Namespace string + NetworkLicenseManager string + OverrideWrapperPhoenix bool + Period int + PortsPerWorker int + PoolProxyBasePort int + PoolProxyCPULimit string + PoolProxyCPURequest string + PoolProxyImage string + PoolProxyImagePullPolicy string + PoolProxyMemoryLimit string + PoolProxyMemoryRequest string + ReadyFile string + ResizePath string + RequireClientCertificate bool + RequireScriptVerification bool + SecretDir string + SecretFileName string + SecurityLevel int + StartupProbeFailureThreshold int32 + StartupProbeInitialDelay int32 + StartupProbePeriod int32 + StopWorkerGracePeriod int64 + WorkerCPURequest string + WorkerCPULimit string + WorkerLogPVC string + WorkerMemoryRequest string + WorkerMemoryLimit string + WorkerPassword string + WorkersPerPoolProxy int + WorkerUsername string + UseSecureCommunication bool +} + +// LoadConfig reads a Config object from a JSON file +func LoadConfig(configFile string) (*Config, error) { + file, err := os.Open(configFile) + if err != nil { + return nil, fmt.Errorf("error opening config file: %v", err) + } + defer file.Close() + data, err := io.ReadAll(file) + if err != nil { + return nil, fmt.Errorf("error reading config file: %v", err) + } + var config Config + err = json.Unmarshal(data, &config) + if err != nil { + return nil, fmt.Errorf("error unmarshaling JSON from config file: %v", err) + } + return &config, nil +} + +// RequiresSecret returns true if the cluster configuration requires a shared secret +func (c *Config) RequiresSecret() bool { + return c.UseSecureCommunication || c.RequireClientCertificate || c.RequireScriptVerification +} + +// UsePoolProxy returns true if we should install pool proxies +func (c *Config) UsePoolProxy() bool { + return !c.InternalClientsOnly +} diff --git a/controller/src/internal/controller/controller.go b/controller/src/internal/controller/controller.go new file mode 100644 index 0000000..e0b64d0 --- /dev/null +++ b/controller/src/internal/controller/controller.go @@ -0,0 +1,422 @@ +// Package controller provides code for setting up and automatically rescaling a cluster +// Copyright 2024 The MathWorks, Inc. 
+package controller + +import ( + "controller/internal/config" + "controller/internal/k8s" + "controller/internal/logging" + "controller/internal/rescaler" + "controller/internal/specs" + "encoding/json" + "fmt" + "math" + "time" + + "github.com/mathworks/mjssetup/pkg/certificate" + "github.com/mathworks/mjssetup/pkg/profile" + "go.uber.org/zap" +) + +// Controller sets up and periodically rescales a cluster +type Controller struct { + config *config.Config + logger *logging.Logger + client k8s.Client + specFactory *specs.SpecFactory + waitForJobManager func() error // Function to wait until the job manager is ready + period time.Duration // Interval between each rescaling operation + rescaler rescaler.Rescaler // Interface to perform the rescaling + stopChan chan bool // Channel to capture stop signals +} + +// NewController constructs a Controller from a given config struct +func NewController(conf *config.Config, logger *logging.Logger) (*Controller, error) { + logger.Debug("Creating controller", zap.Any("config", conf)) + + // Create Kubernetes client + client, err := k8s.NewClient(conf, logger) + if err != nil { + return nil, err + } + + // Get the UID of the deployment in which we are running; we use this to tag all created resources so they are cleaned up by the Kubernetes garbage collector if the controller is removed + uid, err := client.GetControllerDeploymentUID() + if err != nil { + return nil, err + } + + // Create the MJS rescaler + rescaler, err := rescaler.NewMJSRescaler(conf, uid, logger) + if err != nil { + return nil, err + } + + controller := &Controller{ + config: conf, + logger: logger, + specFactory: specs.NewSpecFactory(conf, uid), + period: time.Duration(conf.Period) * time.Second, + rescaler: rescaler, + stopChan: make(chan bool), + client: client, + waitForJobManager: func() error { + return waitForJobManager(client, conf.StartupProbePeriod) + }, + } + + err = controller.setup() + if err != nil { + return nil, err + } + return controller, nil +} + +// Run autoscaling periodically until a stop signal is received +func (c *Controller) Run() { + ticker := time.NewTicker(c.period) + defer ticker.Stop() + for { + select { + case <-c.stopChan: + c.logger.Debug("Stopping controller") + return + case <-ticker.C: + c.rescaler.Rescale() + } + } +} + +// Stop can be called asynchronously to stop the controller from running +func (c *Controller) Stop() { + c.stopChan <- true +} + +// Perform initial MJS cluster setup +func (c *Controller) setup() error { + err := c.checkRequiredResources() + if err != nil { + return err + } + sharedSecret, err := c.createMJSSecrets() + if err != nil { + return err + } + err = c.createJobManager() + if err != nil { + return err + } + return c.createProfile(sharedSecret) +} + +// Check that the prerequiste Kubernetes resources exist +func (c *Controller) checkRequiredResources() error { + checksToRun := []func() error{ + c.checkAdminPassword, + c.checkLoadBalancer, + } + for _, checkFunc := range checksToRun { + err := checkFunc() + if err != nil { + return err + } + } + return nil +} + +// Check that the load balancer service exists and exposes the correct ports +func (c *Controller) checkLoadBalancer() error { + if c.config.InternalClientsOnly { + // There is no load balancer for internal-only mode + return nil + } + + svc, exists, err := c.client.ServiceExists(c.config.LoadBalancerName) + if err != nil { + return err + } + + // Compute the ports we expect this service to expose + requiredPorts := []int{c.config.BasePort + 6, 
c.config.BasePort + 9} + maxPoolProxies := int(math.Ceil(float64(c.config.MaxWorkers) / float64(c.config.WorkersPerPoolProxy))) + for i := 0; i < maxPoolProxies; i++ { + requiredPorts = append(requiredPorts, c.config.PoolProxyBasePort+i) + } + + // Error if the service does not exist + if !exists { + portPairs := "" + for idx, p := range requiredPorts { + if idx > 0 { + portPairs += "," + } + portPairs += fmt.Sprintf("%d:%d", p, p) + } + exampleCmd := fmt.Sprintf("kubectl create service loadbalancer %s --namespace %s --tcp %s", c.config.LoadBalancerName, c.config.Namespace, portPairs) + return fmt.Errorf(`error: Load balancer service "%s" does not exist in namespace "%s". Create a load balancer service configured for MATLAB Job Scheduler with command: "%s"`, c.config.LoadBalancerName, c.config.Namespace, exampleCmd) + } + + // If the service exists, check that all ports are exposed correctly + exposedPorts := map[int]bool{} + for _, p := range svc.Spec.Ports { + port := int(p.Port) + targetPort := p.TargetPort.IntValue() + if port != targetPort { + return fmt.Errorf(`error: Target port %d does not match service port %d in specification for load balancer service "%s". Modify the service specification so that all target ports match service ports`, targetPort, port, c.config.LoadBalancerName) + } + exposedPorts[port] = true + } + foundMissing := false + missingPorts := "" + for _, p := range requiredPorts { + if !exposedPorts[p] { + if foundMissing { + missingPorts += ", " + } + missingPorts += fmt.Sprintf("%d", p) + foundMissing = true + } + } + if foundMissing { + return fmt.Errorf(`error: Load balancer service "%s" does not expose all ports required by MATLAB Job Scheduler. Missing ports: %s. Modify the service specification to expose all required ports`, c.config.LoadBalancerName, missingPorts) + } + return nil +} + +// Check that the administrator password exists, if needed +func (c *Controller) checkAdminPassword() error { + if c.config.SecurityLevel >= 2 { + // Check the admin password secret exists + adminSecretName := specs.AdminPasswordSecretName + secret, adminPasswordExists, err := c.client.SecretExists(specs.AdminPasswordSecretName) + if err != nil { + return err + } + createSecretInstruction := fmt.Sprintf(`To start an MJS cluster at security level %d, create an administrator password secret with command "kubectl create secret generic %s --from-literal=password=<password> --namespace %s", replacing "<password>" with a password of your choice.`, c.config.SecurityLevel, adminSecretName, c.config.Namespace) + if !adminPasswordExists { + return fmt.Errorf(`error: Administrator password secret "%s" does not exist in namespace "%s". %s`, adminSecretName, c.config.Namespace, createSecretInstruction) + } + + // Check that the secret contains the password key + passwordKey := specs.AdminPasswordKey + if _, ok := secret.Data[passwordKey]; !ok { + return fmt.Errorf(`error: Administrator password secret "%s" does not contain the key "%s". 
%s`, specs.AdminPasswordSecretName, passwordKey, createSecretInstruction) + } + + } + return nil +} + +// Create MJS secrets +func (c *Controller) createMJSSecrets() (*certificate.SharedSecret, error) { + var sharedSecret *certificate.SharedSecret + var err error + if c.config.RequiresSecret() { + sharedSecret, err = c.createSharedSecret() + if err != nil { + return nil, err + } + } + return sharedSecret, nil +} + +// Create shared secret and certificate for MJS and return the shared secret +func (c *Controller) createSharedSecret() (*certificate.SharedSecret, error) { + secret, alreadyExists, err := c.getExistingSharedSecret() + if err != nil { + return nil, fmt.Errorf("error checking for shared secret: %v", err) + } + if alreadyExists { + return secret, err + } + + // Generate the shared secret + certCreator := certificate.New() + secret, err = certCreator.CreateSharedSecret() + if err != nil { + return nil, err + } + secretBytes, err := json.Marshal(secret) + if err != nil { + return nil, fmt.Errorf("error marshalling shared secret: %v", err) + } + + // Get spec for Kubernetes secret + secretSpec := c.specFactory.GetSecretSpec(specs.SharedSecretName) + secretSpec.Data[c.config.SecretFileName] = secretBytes + + // Generate a certificate if needed + if c.config.RequireClientCertificate { + cert, err := certCreator.GenerateCertificate(secret) + if err != nil { + return nil, err + } + certBytes, err := json.Marshal(cert) + if err != nil { + return nil, fmt.Errorf("error marshalling certificate: %v", err) + } + secretSpec.Data[c.config.CertFileName] = certBytes + } + + // Create the Kubernetes secret + _, err = c.client.CreateSecret(secretSpec) + if err != nil { + return nil, fmt.Errorf("error creating Kubernetes secret for MJS shared secret: %v", err) + } + return secret, nil +} + +// Create a deployment for the MJS job manager; return when the pod is ready +func (c *Controller) createJobManager() error { + // Check whether the deployment already exists; this can occur if the controller container has restarted + alreadyExists, err := c.client.DeploymentExists(specs.JobManagerHostname) + if err != nil { + return fmt.Errorf("error checking for job manager deployment: %v", err) + } + if alreadyExists { + c.logger.Info("found existing job manager deployment", zap.String("name", specs.JobManagerHostname)) + return nil + } + + // Create deployment + deploymentSpec := c.specFactory.GetJobManagerDeploymentSpec() + deployment, err := c.client.CreateDeployment(deploymentSpec) + if err != nil { + return fmt.Errorf("error creating job manager deployment: %v", err) + } + c.logger.Info("created MJS job manager deployment", zap.String("name", deployment.Name)) + + // Wait for the pod to be ready before returning + c.logger.Info("waiting for job manager pod to be ready") + c.waitForJobManager() + c.logger.Info("found ready job manager pod") + return nil +} + +// Profile secret names +const ( + profileSecretName = "mjs-cluster-profile" + profileKey = "profile" +) + +// Create the cluster profile +func (c *Controller) createProfile(sharedSecret *certificate.SharedSecret) error { + _, alreadyExists, err := c.client.SecretExists(profileSecretName) + if err != nil { + return fmt.Errorf("error checking for cluster profile secret: %v", err) + } + if alreadyExists { + c.logger.Info("found existing cluster profile password secret", zap.String("name", profileSecretName)) + return nil + } + + // Get MJS hostname + var clusterHost = c.config.ClusterHost + if clusterHost == "" { + if c.config.InternalClientsOnly { + // 
Use the job manager hostname if all clients are inside the Kubernetes cluster + clusterHost = c.specFactory.GetServiceHostname(specs.JobManagerHostname) + } else { + // Extract the hostname from the load balancer + var err error + clusterHost, err = c.getExternalAddress() + if err != nil { + return err + } + } + } + + // Generate a certificate for the client if needed + var cert *certificate.Certificate + if c.config.RequireClientCertificate { + cert, err = certificate.New().GenerateCertificate(sharedSecret) + if err != nil { + return fmt.Errorf("error generating certificate for cluster profile: %v", err) + } + } + + // Create the profile + profile := profile.CreateProfile(c.config.JobManagerName, clusterHost, cert) + profBytes, err := json.MarshalIndent(profile, "", " ") + if err != nil { + return fmt.Errorf("error marshaling cluster profile into bytes: %v", err) + } + + // Create Kubernetes secret for profile + secret := c.specFactory.GetSecretSpec(profileSecretName) + secret.Data[profileKey] = profBytes + _, err = c.client.CreateSecret(secret) + if err != nil { + return fmt.Errorf("error creating Kubernetes secret for MJS cluster profile: %v", err) + } + c.logger.Info("created MJS cluster profile secret", zap.String("name", secret.Name)) + + return nil +} + +// Get the external address of MJS +func (c *Controller) getExternalAddress() (string, error) { + addressFound := false + retryPeriod := time.Duration(2 * time.Second) + c.logger.Info("waiting for LoadBalancer service to have external hostname", zap.String("serviceName", c.config.LoadBalancerName)) + address := "" + for !addressFound { + loadBalancer, err := c.client.GetLoadBalancer() + if err != nil { + return "", err + } + for _, ingress := range loadBalancer.Status.LoadBalancer.Ingress { + if ingress.IP != "" { + address = ingress.IP + addressFound = true + } else if ingress.Hostname != "" { + address = ingress.Hostname + addressFound = true + } + } + time.Sleep(retryPeriod) + } + c.logger.Info("found LoadBalancer external hostname", zap.String("hostname", address)) + + // Append the base port + address = fmt.Sprintf("%s:%d", address, c.config.BasePort) + return address, nil +} + +// Extract a shared secret from a Kubernetes secret if one already exists +func (c *Controller) getExistingSharedSecret() (*certificate.SharedSecret, bool, error) { + k8sSecret, alreadyExists, err := c.client.SecretExists(specs.SharedSecretName) + if err != nil { + return nil, false, fmt.Errorf("error checking for shared secret: %v", err) + } + if !alreadyExists { + return nil, false, nil + } + + c.logger.Info("found existing shared secret", zap.String("name", specs.SharedSecretName)) + secretData, hasSecret := k8sSecret.Data[c.config.SecretFileName] + if !hasSecret { + return nil, false, fmt.Errorf("secret file '%s' not found in Kubernetes Secret '%s'", c.config.SecretFileName, k8sSecret.Name) + } + secret, err := certificate.New().LoadSharedSecret(secretData) + if err != nil { + return nil, false, fmt.Errorf("error extracting shared secret from Kubernetes Secret '%s': %v", k8sSecret.Name, err) + } + return secret, true, nil +} + +// Wait for the job manager to be ready +func waitForJobManager(client k8s.Client, retryPeriodSeconds int32) error { + retryPeriod := time.Duration(retryPeriodSeconds) * time.Second + for { + isReady, err := client.IsJobManagerReady() + if err != nil { + return err + } + if isReady { + break + } + time.Sleep(retryPeriod) + } + return nil +} diff --git a/controller/src/internal/controller/controller_test.go 
b/controller/src/internal/controller/controller_test.go new file mode 100644 index 0000000..aad070a --- /dev/null +++ b/controller/src/internal/controller/controller_test.go @@ -0,0 +1,415 @@ +// Copyright 2024 The MathWorks, Inc. +package controller + +import ( + "encoding/json" + "fmt" + "testing" + "time" + + "github.com/mathworks/mjssetup/pkg/certificate" + "github.com/mathworks/mjssetup/pkg/profile" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap/zaptest" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/client-go/kubernetes/fake" + + "controller/internal/config" + "controller/internal/k8s" + "controller/internal/logging" + "controller/internal/specs" + mockClient "controller/mocks/k8s" + mockRescaler "controller/mocks/rescaler" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// Verify that the controller automatically calls the rescaling function +func TestRunAutoscaling(t *testing.T) { + period := 1 + + mockRescaler := mockRescaler.NewRescaler(t) + mockRescaler.EXPECT().Rescale() + + controller := Controller{ + rescaler: mockRescaler, + period: time.Duration(period) * time.Second, + stopChan: make(chan bool), + logger: logging.NewFromZapLogger(zaptest.NewLogger(t)), + } + + // Run for long enough to ensure the scaler gets called at least once + runFor := 5.0 * period + doneChan := make(chan bool) + go func() { + controller.Run() + doneChan <- true + }() + time.Sleep(time.Duration(runFor) * time.Second) + controller.Stop() + + // Wait for controller.Run() to return + <-doneChan +} + +func TestVerifySetup(t *testing.T) { + testCases := []struct { + name string + useSecureCommunication bool + securityLevel int + }{ + {"insecure", false, 0}, + {"secure_sl0", true, 0}, + {"insecure_sl2", false, 2}, + {"secure_sl2", true, 2}, + } + for _, tc := range testCases { + t.Run(tc.name, func(tt *testing.T) { + verifySetup(tt, tc.useSecureCommunication, tc.securityLevel, false) + }) + } +} + +// Test use of a custom cluster host name in the cluster profile +func TestCustomClusterHost(t *testing.T) { + verifySetup(t, false, 0, true) +} + +// Test the full setup workflow, with all secrets and the cluster profile being created +func verifySetup(t *testing.T, useSecureCommunication bool, securityLevel int, useCustomHost bool) { + conf := config.Config{ + JobManagerName: "my-k8s-mjs", + Namespace: "test", + DeploymentName: "my-controller", + LoadBalancerName: "my-mjs-loadbalancer", + SecretFileName: "secret.json", + CertFileName: "cert.json", + BasePort: 5000, + PoolProxyBasePort: 30000, + WorkersPerPoolProxy: 100, + MaxWorkers: 10, + UseSecureCommunication: useSecureCommunication, + SecurityLevel: securityLevel, + } + if useCustomHost { + conf.ClusterHost = "my-custom-host" + } + controller, lbAddress := createControllerWithFakeClient(t, &conf) + if securityLevel >= 2 { + createDummyAdminPassword(t, controller) + } + + err := controller.setup() + require.NoError(t, err, "error running first controller setup") + + var secret *certificate.SharedSecret + if useSecureCommunication { + secret = verifySharedSecretCreated(t, controller) + } else { + verifyNoSecret(t, controller.client, specs.SharedSecretName) + } + + verifyJobManagerCreated(t, controller) + + // Check the profile was created with either the custom host name or the load balancer external address + expectedHost := conf.ClusterHost + if !useCustomHost { + expectedHost = fmt.Sprintf("%s:%d", lbAddress, 
controller.config.BasePort) + } + verifyClusterProfileCreated(t, controller, secret, expectedHost, false) + + // Verify that we can run setup again without erroring + // (this can occur if the controller container restarts, and a previous container already created the resources) + err = controller.setup() + require.NoError(t, err, "error running second controller setup") +} + +// Check we get the expected error when the admin password secret is missing +func TestErrorMissingAdminPassword(t *testing.T) { + testCases := []struct { + securityLevel int + expectError bool + }{ + {1, false}, + {2, true}, + } + for _, tc := range testCases { + t.Run(fmt.Sprintf("sl%d", tc.securityLevel), func(tt *testing.T) { + conf := config.Config{ + SecurityLevel: tc.securityLevel, + } + controller, _ := createControllerWithFakeClient(tt, &conf) + err := controller.setup() + if tc.expectError { + assert.Error(tt, err, "expected error when admin password secret is missing") + assert.Contains(tt, err.Error(), specs.AdminPasswordSecretName, "error message should contain name of admin password secret") + } else { + require.NoError(tt, err, "should not get an error for missing admin password secret with security level < 2") + } + }) + } +} + +func TestWaitForJobManager(t *testing.T) { + client := mockClient.NewClient(t) + + // First return false, then true + client.EXPECT().IsJobManagerReady().Once().Return(false, nil) + client.EXPECT().IsJobManagerReady().Once().Return(true, nil) + + err := waitForJobManager(client, 1) + require.NoError(t, err) +} + +// Verify that the correct profile is created with different settings for RequireClientCertificate +func TestCreateProfile(t *testing.T) { + testCases := []struct { + name string + requireClientCertificate bool + }{ + {"no_client_cert", false}, + {"with_client_cert", true}, + } + for _, tc := range testCases { + t.Run(tc.name, func(tt *testing.T) { + conf := config.Config{ + RequireClientCertificate: tc.requireClientCertificate, + JobManagerName: tc.name, + } + controller, lbAddress := createControllerWithFakeClient(tt, &conf) + var secret *certificate.SharedSecret + var err error + if tc.requireClientCertificate { + secret, err = certificate.New().CreateSharedSecret() + require.NoError(tt, err) + } + controller.createProfile(secret) + expectedHost := fmt.Sprintf("%s:%d", lbAddress, controller.config.BasePort) + verifyClusterProfileCreated(tt, controller, secret, expectedHost, tc.requireClientCertificate) + }) + } +} + +// Verify that we get an error when the load balancer has not been created prior to setup +func TestErrorMissingLoadBalancer(t *testing.T) { + conf := config.Config{ + LoadBalancerName: "missing-lb", + } + controller, _ := createControllerWithFakeClient(t, &conf) + + // Delete the Load Balancer + err := controller.client.DeleteService(conf.LoadBalancerName) + require.NoError(t, err) + + // Check that setup fails with an appropriate error + err = controller.setup() + assert.Error(t, err, "expected an error when attempting to set up controller with missing load balancer") + assert.Contains(t, err.Error(), conf.LoadBalancerName, "expected error message to contain name of missing load balancer") +} + +// Verify that we get an error when the load balancer has a mismatched port/targetport +func TestErrorLoadBalancerMismatch(t *testing.T) { + conf := config.Config{ + LoadBalancerName: "my-lb", + } + controller, _ := createControllerWithFakeClient(t, &conf) + + // Modify the load balancer + svc, err := controller.client.GetService(conf.LoadBalancerName) + 
require.NoError(t, err) + targetPort := svc.Spec.Ports[0].TargetPort.IntVal + newPort := targetPort + 1 + svc.Spec.Ports[0].Port = newPort + err = controller.client.UpdateService(svc) + require.NoError(t, err) + + // Check that setup fails with an appropriate error + err = controller.setup() + assert.Error(t, err, "expected an error when attempting to set up controller with a load balancer where the target port does not match the service port") + assert.Contains(t, err.Error(), fmt.Sprintf("%d", targetPort), "expected error message to contain mismatched target port") + assert.Contains(t, err.Error(), fmt.Sprintf("%d", newPort), "expected error message to contain mismatched service port") +} + +// Verify that we get an error when the load balancer is missing a required port +func TestErrorLoadBalancerMissingPort(t *testing.T) { + conf := config.Config{ + LoadBalancerName: "my-lb", + WorkersPerPoolProxy: 2, + MaxWorkers: 4, + BasePort: 1000, + PoolProxyBasePort: 2000, + } + requiredPorts := []int{conf.BasePort + 6, conf.BasePort + 9, conf.PoolProxyBasePort, conf.PoolProxyBasePort + 1} + + for _, port := range requiredPorts { + controller, _ := createControllerWithFakeClient(t, &conf) + + // Modify the load balancer to remove a required port + svc, err := controller.client.GetService(conf.LoadBalancerName) + require.NoError(t, err) + portsToKeep := []corev1.ServicePort{} + for _, svcPort := range svc.Spec.Ports { + if svcPort.Port != int32(port) { + portsToKeep = append(portsToKeep, svcPort) + } + } + require.Len(t, portsToKeep, len(requiredPorts)-1, "expected one port to be removed from the load balancer") + svc.Spec.Ports = portsToKeep + err = controller.client.UpdateService(svc) + require.NoError(t, err) + + // Check that setup fails with an appropriate error + err = controller.setup() + assert.Error(t, err, "expected an error when attempting to set up controller with a load balancer with a missing port") + assert.Contains(t, err.Error(), fmt.Sprintf("%d", port), "expected error message to contain missing port number") + } +} + +// Test setup for a cluster with internalClientsOnly=true +func TestInternalClientsOnly(t *testing.T) { + conf := config.Config{ + JobManagerName: "my-k8s-mjs", + Namespace: "test", + DeploymentName: "my-controller", + LoadBalancerName: "my-mjs-loadbalancer", + InternalClientsOnly: true, + MaxWorkers: 10, + } + zl := zaptest.NewLogger(t) + fakeK8s := fake.NewSimpleClientset() + specFactory := specs.NewSpecFactory(&conf, types.UID("abc123")) + client := k8s.NewClientWithK8sBackend(&conf, fakeK8s, logging.NewFromZapLogger(zl)) + controller := &Controller{ + client: client, + config: &conf, + logger: logging.NewFromZapLogger(zl), + specFactory: specFactory, + waitForJobManager: func() error { return nil }, + } + + err := controller.setup() + require.NoError(t, err, "error running first controller setup") + verifyJobManagerCreated(t, controller) + + // Check the profile was created with the internal hostname of the job manager + expectedHost := specFactory.GetServiceHostname(specs.JobManagerHostname) + verifyClusterProfileCreated(t, controller, nil, expectedHost, false) + + // Verify that we can run setup again without erroring + // (this can occur if the controller container restarts, and a previous container already created the resources) + err = controller.setup() + require.NoError(t, err, "error running second controller setup") +} + +// Verify that a shared secret was added to the K8s cluster +func verifySharedSecretCreated(t *testing.T, controller *Controller) 
*certificate.SharedSecret { + secret, exists, err := controller.getExistingSharedSecret() + require.NoError(t, err) + require.True(t, exists, "shared secret should exist") + require.NotNil(t, secret, "shared secret should not be nil") + return secret +} + +// Verify that a secret does not exist +func verifyNoSecret(t *testing.T, client k8s.Client, name string) { + _, exists, err := client.SecretExists(name) + require.NoError(t, err) + assert.Falsef(t, exists, "secret %s should not exist", name) +} + +// Verify that the job manager was created +func verifyJobManagerCreated(t *testing.T, controller *Controller) { + expectedDeployment := controller.specFactory.GetJobManagerDeploymentSpec() + _, err := controller.client.GetDeployment(expectedDeployment.Name) + require.NoError(t, err) +} + +// Verify that the cluster profile was created +func verifyClusterProfileCreated(t *testing.T, controller *Controller, secret *certificate.SharedSecret, expectedHost string, expectCertInProfile bool) { + k8sSecret, exists, err := controller.client.SecretExists(profileSecretName) + require.NoError(t, err) + require.True(t, exists, "cluster profile secret should exist") + require.Contains(t, k8sSecret.Data, profileKey, "profile secret should contain profile data key") + + // Extract the profile + profBytes := k8sSecret.Data[profileKey] + var profile profile.Profile + err = json.Unmarshal(profBytes, &profile) + require.NoError(t, err, "error unmarshaling profile from K8s secret") + + // Check the profile contents + assert.Equal(t, controller.config.JobManagerName, profile.Name, "profile name should match job manager name") + assert.Equal(t, expectedHost, profile.SchedulerComponent.Host, "unexpected profile host") + if expectCertInProfile { + assert.Equal(t, secret.CertPEM, profile.SchedulerComponent.Certificate, "profile server certificate should match shared secret certificate") + } else { + assert.Empty(t, profile.SchedulerComponent.Certificate, "profile certificate should be empty when not using a shared secret") + } +} + +// Create controller and mock K8s client with a Load Balancer +func createControllerWithFakeClient(t *testing.T, conf *config.Config) (*Controller, string) { + zl := zaptest.NewLogger(t) + specFactory := specs.NewSpecFactory(conf, types.UID("abcd")) + fakeK8s := fake.NewSimpleClientset() + client := k8s.NewClientWithK8sBackend(conf, fakeK8s, logging.NewFromZapLogger(zl)) + + // Create a dummy LoadBalancer on the cluster + lbAddress := "1.2.3.4" + lb := corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: conf.LoadBalancerName, + }, + Status: corev1.ServiceStatus{ + LoadBalancer: corev1.LoadBalancerStatus{ + Ingress: []corev1.LoadBalancerIngress{ + { + IP: lbAddress, + }, + }, + }, + }, + } + + // Add job manager ports to the load balancer + addPortToService(&lb, conf.BasePort+6) + addPortToService(&lb, conf.BasePort+9) + + // Add pool proxy ports to the load balancer + workersCovered := 0 + idx := 0 + for workersCovered < conf.MaxWorkers { + addPortToService(&lb, conf.PoolProxyBasePort+idx) + idx++ + workersCovered += conf.WorkersPerPoolProxy + } + + _, err := client.CreateService(&lb) + require.NoError(t, err, "error creating dummy load balancer") + + controller := &Controller{ + client: client, + config: conf, + logger: logging.NewFromZapLogger(zl), + specFactory: specFactory, + waitForJobManager: func() error { return nil }, + } + return controller, lbAddress +} + +func createDummyAdminPassword(t *testing.T, controller *Controller) { + secretSpec := 
controller.specFactory.GetSecretSpec(specs.AdminPasswordSecretName) + secretSpec.Data[specs.AdminPasswordKey] = []byte("testpw") + _, err := controller.client.CreateSecret(secretSpec) + require.NoError(t, err) +} + +func addPortToService(svc *corev1.Service, port int) { + svc.Spec.Ports = append(svc.Spec.Ports, corev1.ServicePort{ + Port: int32(port), + TargetPort: intstr.FromInt(port), + }) +} diff --git a/controller/src/internal/k8s/k8s.go b/controller/src/internal/k8s/k8s.go new file mode 100644 index 0000000..ac42597 --- /dev/null +++ b/controller/src/internal/k8s/k8s.go @@ -0,0 +1,373 @@ +// Package k8s contains methods for interacting with MJS resources in a Kubernetes cluster. +// Copyright 2024 The MathWorks, Inc. +package k8s + +import ( + "bytes" + "context" + "controller/internal/config" + "controller/internal/logging" + "controller/internal/specs" + "fmt" + "path/filepath" + "time" + + "go.uber.org/zap" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + "k8s.io/client-go/tools/remotecommand" + "k8s.io/client-go/util/homedir" +) + +// Interface for interacting with Kubernetes resources +type Client interface { + CreateDeployment(*appsv1.Deployment) (*appsv1.Deployment, error) + GetDeployment(string) (*appsv1.Deployment, error) + DeleteDeployment(string) error + DeploymentExists(string) (bool, error) + CreateService(*corev1.Service) (*corev1.Service, error) + GetService(string) (*corev1.Service, error) + UpdateService(*corev1.Service) error + DeleteService(string) error + ServiceExists(string) (*corev1.Service, bool, error) + CreateSecret(*corev1.Secret) (*corev1.Secret, error) + GetSecret(string) (*corev1.Secret, error) + DeleteSecret(string) error + SecretExists(string) (*corev1.Secret, bool, error) + GetLoadBalancer() (*corev1.Service, error) + GetPodsWithLabel(string) (*corev1.PodList, error) + GetDeploymentsWithLabel(string) (*appsv1.DeploymentList, error) + GetServicesWithLabel(string) (*corev1.ServiceList, error) + IsJobManagerReady() (bool, error) + GetJobManagerPod() (*corev1.Pod, error) + ExecOnPod(string, []string) (*bytes.Buffer, error) + GetControllerDeploymentUID() (types.UID, error) +} + +// Implementation of clientImpl +type clientImpl struct { + client kubernetes.Interface + config *config.Config + kubeConfig *rest.Config + logger *logging.Logger +} + +// Create a new client +func NewClient(conf *config.Config, logger *logging.Logger) (Client, error) { + kubeConfig, err := getKubeConfig(conf) + if err != nil { + return nil, err + } + k8sclientImpl, err := kubernetes.NewForConfig(kubeConfig) + if err != nil { + return nil, fmt.Errorf("error creating Kubernetes client: %v", err) + } + return &clientImpl{ + config: conf, + client: k8sclientImpl, + kubeConfig: kubeConfig, + logger: logger, + }, nil +} + +// Create a new client with a given Kubernetes backend client +func NewClientWithK8sBackend(conf *config.Config, k8sClient kubernetes.Interface, logger *logging.Logger) Client { + return &clientImpl{ + config: conf, + client: k8sClient, + logger: logger, + } +} + +func (c *clientImpl) CreateDeployment(spec *appsv1.Deployment) (*appsv1.Deployment, error) { + c.logger.Debug("creating Kubernetes Deployment", zap.String("name", spec.Name)) + ctx, cancelFunc := newContext() + defer cancelFunc() + 
return c.client.AppsV1().Deployments(c.config.Namespace).Create(ctx, spec, metav1.CreateOptions{}) +} + +func (c *clientImpl) GetDeployment(name string) (*appsv1.Deployment, error) { + ctx, cancelFunc := newContext() + defer cancelFunc() + return c.client.AppsV1().Deployments(c.config.Namespace).Get(ctx, name, metav1.GetOptions{}) +} + +func (c *clientImpl) DeleteDeployment(name string) error { + c.logger.Debug("deleting Kubernetes Deployment", zap.String("name", name)) + ctx, cancelFunc := newContext() + defer cancelFunc() + return c.client.AppsV1().Deployments(c.config.Namespace).Delete(ctx, name, metav1.DeleteOptions{}) +} + +func (c *clientImpl) DeploymentExists(name string) (bool, error) { + _, err := c.GetDeployment(name) + if err != nil { + if k8serrors.IsNotFound(err) { + return false, nil + } + return false, err + } + return true, nil +} + +func (c *clientImpl) CreateService(spec *corev1.Service) (*corev1.Service, error) { + c.logger.Debug("creating Kubernetes Service", zap.String("name", spec.Name)) + ctx, cancelFunc := newContext() + defer cancelFunc() + return c.client.CoreV1().Services(c.config.Namespace).Create(ctx, spec, metav1.CreateOptions{}) +} + +func (c *clientImpl) GetService(name string) (*corev1.Service, error) { + ctx, cancelFunc := newContext() + defer cancelFunc() + return c.client.CoreV1().Services(c.config.Namespace).Get(ctx, name, metav1.GetOptions{}) +} + +func (c *clientImpl) UpdateService(spec *corev1.Service) error { + c.logger.Debug("updating Kubernetes Service", zap.String("name", spec.Name)) + ctx, cancelFunc := newContext() + defer cancelFunc() + _, err := c.client.CoreV1().Services(c.config.Namespace).Update(ctx, spec, metav1.UpdateOptions{}) + return err +} + +func (c *clientImpl) DeleteService(name string) error { + c.logger.Debug("deleting Kubernetes Service", zap.String("name", name)) + ctx, cancelFunc := newContext() + defer cancelFunc() + return c.client.CoreV1().Services(c.config.Namespace).Delete(ctx, name, metav1.DeleteOptions{}) +} + +func (c *clientImpl) ServiceExists(name string) (*corev1.Service, bool, error) { + svc, err := c.GetService(name) + if err != nil { + if k8serrors.IsNotFound(err) { + return nil, false, nil + } + return nil, false, err + } + return svc, true, nil +} + +func (c *clientImpl) CreateSecret(spec *corev1.Secret) (*corev1.Secret, error) { + c.logger.Debug("creating Kubernetes Secret", zap.String("name", spec.Name)) + ctx, cancelFunc := newContext() + defer cancelFunc() + return c.client.CoreV1().Secrets(c.config.Namespace).Create(ctx, spec, metav1.CreateOptions{}) +} + +func (c *clientImpl) GetSecret(name string) (*corev1.Secret, error) { + ctx, cancelFunc := newContext() + defer cancelFunc() + return c.client.CoreV1().Secrets(c.config.Namespace).Get(ctx, name, metav1.GetOptions{}) +} + +func (c *clientImpl) DeleteSecret(name string) error { + c.logger.Debug("deleting Kubernetes Secret", zap.String("name", name)) + ctx, cancelFunc := newContext() + defer cancelFunc() + return c.client.CoreV1().Secrets(c.config.Namespace).Delete(ctx, name, metav1.DeleteOptions{}) +} + +func (c *clientImpl) SecretExists(name string) (*corev1.Secret, bool, error) { + secret, err := c.GetSecret(name) + if err != nil { + if k8serrors.IsNotFound(err) { + return nil, false, nil + } + return nil, false, err + } + return secret, true, nil +} + +// GetLoadBalancer gets the spec of the external LoadBalancer service for the MJS cluster +func (c *clientImpl) GetLoadBalancer() (*corev1.Service, error) { + lbName := c.config.LoadBalancerName + 
lbSpec, err := c.GetService(lbName) + if err != nil { + return nil, fmt.Errorf("error getting MJS LoadBalancer %s: %v", lbName, err) + } + return lbSpec, nil +} + +func (c *clientImpl) GetPodsWithLabel(label string) (*corev1.PodList, error) { + ctx, cancelFunc := newContext() + defer cancelFunc() + pods, err := c.client.CoreV1().Pods(c.config.Namespace).List(ctx, metav1.ListOptions{ + LabelSelector: label, + }) + if err != nil { + return nil, fmt.Errorf("error getting pods with label '%s': %v", label, err) + } + return pods, nil +} + +func (c *clientImpl) GetDeploymentsWithLabel(label string) (*appsv1.DeploymentList, error) { + ctx, cancelFunc := newContext() + defer cancelFunc() + deployments, err := c.client.AppsV1().Deployments(c.config.Namespace).List(ctx, metav1.ListOptions{ + LabelSelector: label, + }) + if err != nil { + return nil, fmt.Errorf("error getting deployments with label '%s': %v", label, err) + } + return deployments, nil +} + +func (c *clientImpl) GetServicesWithLabel(label string) (*corev1.ServiceList, error) { + ctx, cancelFunc := newContext() + defer cancelFunc() + services, err := c.client.CoreV1().Services(c.config.Namespace).List(ctx, metav1.ListOptions{ + LabelSelector: label, + }) + if err != nil { + return nil, fmt.Errorf("error getting services with label '%s': %v", label, err) + } + return services, nil +} + +// IsJobManagerReady returns true if the job manager pod is ready +func (c *clientImpl) IsJobManagerReady() (bool, error) { + pods, err := c.GetPodsWithLabel(c.getJobManagerPodLabel()) + if err != nil { + return false, err + } + _, isReady := findReadyPod(pods, specs.JobManagerHostname) + return isReady, nil +} + +// GetJobManagerPod gets the job manager pod and returns an error if no job manager pod is ready +func (c *clientImpl) GetJobManagerPod() (*corev1.Pod, error) { + pods, err := c.GetPodsWithLabel(c.getJobManagerPodLabel()) + if err != nil { + return nil, err + } + readyPod, isReady := findReadyPod(pods, specs.JobManagerHostname) + if !isReady { + return nil, fmt.Errorf("found %d job manager pods, but none were ready", len(pods.Items)) + } + return readyPod, nil +} + +// Execute a command on a pod and return the stdout in a byte buffer +func (c *clientImpl) ExecOnPod(podName string, cmd []string) (*bytes.Buffer, error) { + // Create REST request + req := c.client.CoreV1().RESTClient().Post(). + Resource("pods"). + Name(podName). + Namespace(c.config.Namespace). 
+ SubResource("exec") + req.VersionedParams(&corev1.PodExecOptions{ + Command: cmd, + Stdout: true, + Stderr: true, + Container: specs.JobManagerHostname, + }, scheme.ParameterCodec) + + // Create executor + exec, err := remotecommand.NewSPDYExecutor(c.kubeConfig, "POST", req.URL()) + if err != nil { + return nil, err + } + + // Stream results + stdoutBuf := &bytes.Buffer{} + stderrBuf := &bytes.Buffer{} + ctx, cancelFunc := newContext() + defer cancelFunc() + err = exec.StreamWithContext(ctx, remotecommand.StreamOptions{ + Stdout: stdoutBuf, + Stderr: stderrBuf, + }) + if err != nil { + return nil, fmt.Errorf("error executing remote command: %v, stdout: %s, stderr: %s", err, stdoutBuf.String(), stderrBuf.String()) + } + return stdoutBuf, nil +} + +// Get the UID of the Deployment in which the current controller is running +func (c *clientImpl) GetControllerDeploymentUID() (types.UID, error) { + if c.config.LocalDebugMode { + // In LocalDebugMode, the controller runs outside of the Kubernetes cluster so there is no deployment; return an empty UID + return types.UID(""), nil + } + deployment, err := c.GetDeployment(c.config.DeploymentName) + if err != nil { + return "", fmt.Errorf("error getting controller deployment: %v", err) + } + return deployment.UID, nil +} + +// Return a pod from a list of pods that has the expected container in the "ready" state +func findReadyPod(pods *corev1.PodList, containerName string) (*corev1.Pod, bool) { + numPods := len(pods.Items) + if numPods == 0 { + // No pods yet + return nil, false + } + + // There can be multiple pods if one was recently terminated and another has started; find the pod that is ready + for _, pod := range pods.Items { + isReady, _ := hasReadyContainer(&pod, containerName) + if isReady { + return &pod, true + } + } + return nil, false +} + +// hasReadyContainer returns true if the specified container is found in a pod, and that container is ready +// An error is returned if the specified container is not found +func hasReadyContainer(pod *corev1.Pod, containerName string) (bool, error) { + foundContainer := false + ready := false + for _, c := range pod.Status.ContainerStatuses { + if c.Name == containerName { + foundContainer = true + ready = c.Ready + } + } + if !foundContainer { + return false, fmt.Errorf("container %s not found in pod %s", containerName, pod.Name) + } + return ready, nil +} + +func (c *clientImpl) getJobManagerPodLabel() string { + return fmt.Sprintf("%s=%s", specs.JobManagerUIDKey, c.config.JobManagerUID) +} + +// getKubeConfig returns a Kubernetes REST config object, used to make RESTful Kubernetes API calls +func getKubeConfig(conf *config.Config) (*rest.Config, error) { + var config *rest.Config + var err error + + if conf.LocalDebugMode { + kubeConfig := conf.KubeConfig + if kubeConfig == "" { + kubeConfig = filepath.Join(homedir.HomeDir(), ".kube", "config") + } + config, err = clientcmd.BuildConfigFromFlags("", kubeConfig) + } else { + config, err = rest.InClusterConfig() + } + if err != nil { + return nil, fmt.Errorf("error getting Kubernetes REST config: %v", err) + } + return config, nil +} + +// Timeout to use when calling Kubernetes API +const Timeout = 60 + +// newContext creates a context to use when calling the K8s client +func newContext() (context.Context, context.CancelFunc) { + return context.WithTimeout(context.Background(), Timeout*time.Second) +} diff --git a/controller/src/internal/k8s/k8s_test.go b/controller/src/internal/k8s/k8s_test.go new file mode 100644 index 0000000..2be343b --- 
/dev/null +++ b/controller/src/internal/k8s/k8s_test.go @@ -0,0 +1,353 @@ +// Copyright 2024 The MathWorks, Inc. +package k8s + +import ( + "context" + "controller/internal/config" + "controller/internal/logging" + "controller/internal/specs" + "fmt" + "testing" + + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap/zaptest" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes/fake" +) + +const ( + namespace = "test-namespace" + jobManagerUID = "jm-1234" +) + +// Test the deployment methods +func TestDeployments(t *testing.T) { + client, _ := newFakeClient(t) + depToCreate := appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "deployment1", + }, + } + + // Test deployment creation + createdDep, err := client.CreateDeployment(&depToCreate) + require.NoError(t, err) + assert.Equal(t, depToCreate.Name, createdDep.Name) + assert.Equal(t, namespace, createdDep.Namespace, "deployment should have been created in test namespace") + + // Test getting deployment + gotDep, err := client.GetDeployment(depToCreate.Name) + require.NoError(t, err) + assert.Equal(t, createdDep, gotDep) + + // Test the DeploymentExists method + exists, err := client.DeploymentExists(createdDep.Name) + require.NoError(t, err) + assert.True(t, exists, "deployment should exist") + existsFalse, err := client.DeploymentExists("not-a-deployment") + require.NoError(t, err) + assert.False(t, existsFalse, "deployment should not exist") + + // Create a deployment with a label + labelKey := "myKey" + labelVal := "myVal" + depWithLabels := appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "deployment-labelled", + Labels: map[string]string{ + labelKey: labelVal, + }, + }, + } + _, err = client.CreateDeployment(&depWithLabels) + require.NoError(t, err) + + // Verify that we can get the labelled deployment + labelSelector := fmt.Sprintf("%s=%s", labelKey, labelVal) + gotDeps, err := client.GetDeploymentsWithLabel(labelSelector) + require.NoError(t, err) + assert.Len(t, gotDeps.Items, 1, "should have found 1 deployment with label") + assert.Equal(t, depWithLabels.Name, gotDeps.Items[0].Name, "should have found deployment with label") + + // Test deployment deletion + err = client.DeleteDeployment(depWithLabels.Name) + require.NoError(t, err) + gotDeps, err = client.GetDeploymentsWithLabel(labelSelector) + require.NoError(t, err) + assert.Empty(t, gotDeps.Items, "should not have found any deployments with label after deletion") + err = client.DeleteDeployment(depToCreate.Name) + require.NoError(t, err) + _, err = client.GetDeployment(depToCreate.Name) + assert.Error(t, err, "should get an error when attempting to get deleted deployment") +} + +// Test the service methods +func TestServices(t *testing.T) { + // Test service creation + client, _ := newFakeClient(t) + svcSpec := corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-service", + }, + } + createdSvc, err := client.CreateService(&svcSpec) + require.NoError(t, err) + assert.Equal(t, svcSpec.Name, createdSvc.Name) + assert.Equal(t, namespace, createdSvc.Namespace, "service should have been created in test namespace") + + // Test getting the service + gotSvc, err := client.GetService(svcSpec.Name) + require.NoError(t, err) + assert.Equal(t, createdSvc, gotSvc) + + // Test checking that the service exists + gotSvc2, exists, err := client.ServiceExists(svcSpec.Name) + 
require.NoError(t, err) + assert.True(t, exists, "service should exist") + assert.Equal(t, gotSvc, gotSvc2, "ServiceExists should return the service if it exists") + + // Test checking a nonexistant service + _, exists, err = client.ServiceExists("not-real") + require.NoError(t, err) + assert.False(t, exists, "service should not exist") + + // Test updating the service + svcSpec.Spec.Ports = append(svcSpec.Spec.Ports, corev1.ServicePort{Port: 8080}) + err = client.UpdateService(&svcSpec) + require.NoError(t, err) + gotSvcAfterUpdate, err := client.GetService(svcSpec.Name) + require.NoError(t, err) + assert.NotEqual(t, gotSvc, gotSvcAfterUpdate) + assert.Equal(t, svcSpec.Spec, gotSvcAfterUpdate.Spec) + + // Create a service with a label + labelKey := "myKey" + labelVal := "myVal" + svcWithLabel := corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "svc-labelled", + Labels: map[string]string{ + labelKey: labelVal, + }, + }, + } + _, err = client.CreateService(&svcWithLabel) + require.NoError(t, err) + + // Verify that we can get the labelled service + labelSelector := fmt.Sprintf("%s=%s", labelKey, labelVal) + gotDeps, err := client.GetServicesWithLabel(labelSelector) + require.NoError(t, err) + assert.Len(t, gotDeps.Items, 1, "should have found 1 service with label") + assert.Equal(t, svcWithLabel.Name, gotDeps.Items[0].Name, "should have found service with label") + + // Test service deletion + err = client.DeleteService(svcSpec.Name) + require.NoError(t, err) + _, err = client.GetService(svcSpec.Name) + assert.Error(t, err, "should get an error when attempting to get deleted service") +} + +// Test the secret methods +func TestSecrets(t *testing.T) { + // Test secret creation + client, _ := newFakeClient(t) + secret := corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-secret", + }, + } + createdSecret, err := client.CreateSecret(&secret) + require.NoError(t, err) + assert.Equal(t, secret.Name, createdSecret.Name) + assert.Equal(t, namespace, createdSecret.Namespace, "secret should have been created in test namespace") + + // Test getting a secret + gotSecret, err := client.GetSecret(secret.Name) + require.NoError(t, err) + assert.Equal(t, createdSecret, gotSecret) + + // Test checking whether secret exists + existingSecret, existsTrue, err := client.SecretExists(secret.Name) + require.NoError(t, err) + assert.True(t, existsTrue, "secret should exist") + assert.NotNil(t, existingSecret) + notExistingSecret, existsFalse, err := client.SecretExists("not-a-secret") + require.NoError(t, err) + assert.False(t, existsFalse, "secret should not exist") + assert.Nil(t, notExistingSecret) + + // Test deleting a secret + err = client.DeleteSecret(secret.Name) + require.NoError(t, err) + _, exists, err := client.SecretExists(secret.Name) + require.NoError(t, err) + assert.False(t, exists, "secret should no longer exist after deletion") +} + +func TestGetPodsWithLabel(t *testing.T) { + podNoLabel := corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "no-label", + }, + } + labelKey := "myPod" + labelVal := "labelled" + podWithLabel := corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "with-label", + Labels: map[string]string{ + labelKey: labelVal, + }, + }, + } + + // Create the pods on a fake K8s backend + client, k8sBackend := newFakeClient(t) + _, err := k8sBackend.CoreV1().Pods(client.config.Namespace).Create(context.Background(), &podNoLabel, metav1.CreateOptions{}) + require.NoError(t, err) + _, err = 
k8sBackend.CoreV1().Pods(client.config.Namespace).Create(context.Background(), &podWithLabel, metav1.CreateOptions{}) + require.NoError(t, err) + + // Check we can get the expected pod + labelSelector := fmt.Sprintf("%s=%s", labelKey, labelVal) + gotPods, err := client.GetPodsWithLabel(labelSelector) + require.NoError(t, err) + assert.Len(t, gotPods.Items, 1, "should have found 1 pod with label") + assert.Equal(t, podWithLabel.Name, gotPods.Items[0].Name, "did not find expected pod with label") +} + +func TestGetLoadBalancer(t *testing.T) { + // Create a mock MJS LoadBalancer + client, _ := newFakeClient(t) + lbName := "mjs-load-balancer" + client.config.LoadBalancerName = lbName + lbSpec := corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: lbName, + }, + } + lb, err := client.CreateService(&lbSpec) + require.NoError(t, err) + + // Verify that we can get the load balancer + gotLB, err := client.GetLoadBalancer() + require.NoError(t, err) + assert.Equal(t, lb, gotLB) +} + +// Test the job manager pod readiness functions +func TestJobManagerReady(t *testing.T) { + client, fakeK8s := newFakeClient(t) + jmPod := createFakeJobManagerPod(t, fakeK8s) + jmPod = updateJobManagerPodReadiness(t, fakeK8s, jmPod, true) + verifyJobManagerFuncs(t, client, true, jmPod) +} + +// Test the job manager pod functions when the container is not ready +func TestJobManagerContainerNotReady(t *testing.T) { + client, fakeK8s := newFakeClient(t) + jmPod := createFakeJobManagerPod(t, fakeK8s) + jmPod = updateJobManagerPodReadiness(t, fakeK8s, jmPod, false) + verifyJobManagerFuncs(t, client, false, jmPod) +} + +// Test the job manager pod functions when there are multiple pods +// (this can occur if one pod hasn't been cleaned up yet after a restart) +func TestJobManagerMultiplePods(t *testing.T) { + client, fakeK8s := newFakeClient(t) + oldJmPod := createFakeJobManagerPod(t, fakeK8s) + newJmPod := createFakeJobManagerPod(t, fakeK8s) + + // Set the new pod to be ready + updateJobManagerPodReadiness(t, fakeK8s, oldJmPod, false) + newJmPod = updateJobManagerPodReadiness(t, fakeK8s, newJmPod, true) + + // Verify that the new pod shows as ready + verifyJobManagerFuncs(t, client, true, newJmPod) +} + +// Test getting the UID of the controller deployment +func TestGetDeploymentUID(t *testing.T) { + depName := "my-controller" + client, _ := newFakeClient(t) + client.config.DeploymentName = depName + + // Create mock controller deployment + depUID := types.UID("abcd") + spec := appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: depName, + UID: depUID, + }, + } + _, err := client.CreateDeployment(&spec) + require.NoError(t, err) + + // Check we can retrieve the UID + gotUID, err := client.GetControllerDeploymentUID() + require.NoError(t, err) + assert.Equal(t, depUID, gotUID, "did not retrieve expected controller deployment UID") +} + +// Create a Client with a fake underlying Kubernetes client +func newFakeClient(t *testing.T) (*clientImpl, *fake.Clientset) { + fakeK8s := fake.NewSimpleClientset() + conf := config.Config{ + Namespace: namespace, + JobManagerUID: jobManagerUID, + } + return &clientImpl{ + client: fakeK8s, + config: &conf, + logger: logging.NewFromZapLogger(zaptest.NewLogger(t)), + }, fakeK8s +} + +// Create a fake job manager node pod and install it onto the fake K8s backend +func createFakeJobManagerPod(t *testing.T, fakeK8s *fake.Clientset) *corev1.Pod { + podUID := uuid.New().String() + podSpec := corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("mjs-job-manager-%s", podUID), 
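+ // Give the fake pod the job manager UID label: the client finds job manager pods with a label
+ // selector on specs.JobManagerUIDKey (see getJobManagerPodLabel in k8s.go), so this label is what
+ // makes the fake pod visible to IsJobManagerReady and GetJobManagerPod in these tests.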
+ Labels: map[string]string{ + specs.JobManagerUIDKey: jobManagerUID, + }, + }, + } + jmPod, err := fakeK8s.CoreV1().Pods(namespace).Create(context.Background(), &podSpec, metav1.CreateOptions{}) + require.NoError(t, err) + return jmPod +} + +func verifyJobManagerFuncs(t *testing.T, client *clientImpl, expectReady bool, expectedPod *corev1.Pod) { + ready, err := client.IsJobManagerReady() + require.NoError(t, err) + assert.Equal(t, expectReady, ready) + + pod, err := client.GetJobManagerPod() + if expectReady { + require.NoError(t, err) + assert.Equal(t, expectedPod, pod, "GetJobManagerPod did not return the expected pod") + } else { + assert.Error(t, err, "expect error from GetJobManagerPod when the pod is not ready") + } +} + +// Update the job manager pod with a given readiness status +func updateJobManagerPodReadiness(t *testing.T, fakeK8s *fake.Clientset, podSpec *corev1.Pod, ready bool) *corev1.Pod { + podSpec.Status.ContainerStatuses = []corev1.ContainerStatus{ + { + Name: specs.JobManagerHostname, + Ready: ready, + }, + } + newPod, err := fakeK8s.CoreV1().Pods(namespace).Update(context.Background(), podSpec, metav1.UpdateOptions{}) + require.NoError(t, err) + return newPod +} diff --git a/controller/src/internal/logging/logging.go b/controller/src/internal/logging/logging.go new file mode 100644 index 0000000..f78f169 --- /dev/null +++ b/controller/src/internal/logging/logging.go @@ -0,0 +1,92 @@ +// Package logging contains functions to create a logger +// Copyright 2024 The MathWorks, Inc. +package logging + +import ( + "os" + "time" + + "go.uber.org/zap" + "go.uber.org/zap/zapcore" +) + +// Wrapper around a zap logger that allows the log file to be closed +type Logger struct { + *zap.Logger + logFile *os.File +} + +// Gracefully close a logger +func (l *Logger) Close() { + l.Logger.Sync() + if l.logFile != nil { + l.logFile.Sync() + l.logFile.Close() + } +} + +// Create a new logger. If logfile is empty, the logger writes to stdout; otherwise, it writes to a file. 
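+// A minimal usage sketch (illustrative only; the file path and log level are hypothetical). A logLevel of 2
+// maps to zap's info level, so Debug messages would be suppressed:
+//
+//	logger, err := NewLogger("/tmp/mjs-controller.log", 2)
+//	if err != nil {
+//		// handle the error
+//	}
+//	defer logger.Close()
+//	logger.Info("controller starting")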
+func NewLogger(logfile string, logLevel int) (*Logger, error) { + return createLogger(logfile, getZapLevel(logLevel)) +} + +// Create a logger wrapping an existing zap logger +func NewFromZapLogger(zl *zap.Logger) *Logger { + return &Logger{zl, nil} +} + +const ( + warnLevelThreshold = 1 + infoLevelThreshold = 2 + debugLevelThreshold = 5 +) + +// Convert a log level number to a zap logging level +func getZapLevel(logLevel int) zapcore.Level { + if logLevel >= debugLevelThreshold { + return zapcore.DebugLevel + } else if logLevel >= infoLevelThreshold { + return zapcore.InfoLevel + } else if logLevel >= warnLevelThreshold { + return zapcore.WarnLevel + } + return zapcore.ErrorLevel +} + +const timeFormat = "2006 01 02 15:04:05.000 MST" + +func createLogger(logFile string, level zapcore.Level) (*Logger, error) { + var file *os.File + var err error + useLogFile := logFile != "" + if useLogFile { + file, err = os.OpenFile(logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + } + if err != nil { + return nil, err + } + + timeEncoder := func(t time.Time, enc zapcore.PrimitiveArrayEncoder) { + enc.AppendString(t.UTC().Format(timeFormat)) + } + + encoder := zapcore.NewConsoleEncoder(zapcore.EncoderConfig{ + EncodeTime: timeEncoder, + EncodeLevel: zapcore.CapitalLevelEncoder, + ConsoleSeparator: " | ", + TimeKey: "ts", + LevelKey: "level", + MessageKey: "msg", + }) + var core zapcore.Core + if useLogFile { + core = zapcore.NewCore(encoder, zapcore.AddSync(file), level) + } else { + core = zapcore.NewCore(encoder, zapcore.AddSync(os.Stdout), level) + } + + return &Logger{ + Logger: zap.New(core), + logFile: file, + }, nil +} diff --git a/controller/src/internal/logging/logging_test.go b/controller/src/internal/logging/logging_test.go new file mode 100644 index 0000000..3737acc --- /dev/null +++ b/controller/src/internal/logging/logging_test.go @@ -0,0 +1,120 @@ +package logging + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap/zapcore" +) + +// Test logging to an output file +func TestLoggerWithOutfile(t *testing.T) { + testCases := []struct { + name string + logLevel int + expectedZapLevel zapcore.Level + }{ + {"level0", 0, zapcore.ErrorLevel}, + {"level1", 1, zapcore.WarnLevel}, + {"level2", 2, zapcore.InfoLevel}, + {"level3", 3, zapcore.InfoLevel}, + {"level4", 4, zapcore.InfoLevel}, + {"level5", 5, zapcore.DebugLevel}, + {"level6", 6, zapcore.DebugLevel}, + } + for _, tc := range testCases { + t.Run(tc.name, func(tt *testing.T) { + verifyLogFile(tt, tc.logLevel, tc.expectedZapLevel) + }) + } +} + +// Test that we can log to stdout without errors +func TestLoggerStdout(t *testing.T) { + logCloser, err := NewLogger("", 1) + require.NoError(t, err, "Error creating logger without a log file") + require.NotNil(t, logCloser.Logger, "Zap logger should not be nil") + require.Nil(t, logCloser.logFile, "Log file should be nil for logger without an output file") + logCloser.Logger.Info("test message") + logCloser.Close() +} + +// Check we get an error when we cannot open the log file +func TestLoggerError(t *testing.T) { + badFilePath := "/this/does/not/exist.log" + _, err := NewLogger(badFilePath, 1) + assert.Error(t, err, "Should get an error when attempting to create logger with invalid file path") +} + +func verifyLogFile(t *testing.T, logLevel int, expectedZapLevel zapcore.Level) { + outdir := t.TempDir() + outfile := filepath.Join(outdir, "test.log") + var logger *Logger + 
var err error + logger, err = NewLogger(outfile, logLevel) + require.NoError(t, err, "Error creating logger") + require.NotNil(t, logger.logFile, "Log file should not be nil") + require.NotNil(t, logger.Logger, "Zap logger should not be nil") + + // Write some messages + logMessages := map[zapcore.Level]string{ + zapcore.DebugLevel: "this is a debug message", + zapcore.InfoLevel: "this is an info message", + zapcore.WarnLevel: "this is a warning message", + zapcore.ErrorLevel: "this is an error message", + } + logger.Debug(logMessages[zapcore.DebugLevel]) + logger.Info(logMessages[zapcore.InfoLevel]) + logger.Warn(logMessages[zapcore.WarnLevel]) + logger.Error(logMessages[zapcore.ErrorLevel]) + + // Close the logger + logger.Close() + + // Check the file contents + fileBytes, err := os.ReadFile(outfile) + fileContent := string(fileBytes) + require.NoError(t, err, "Error reading log file") + verifyTimestamps(t, fileContent) + + // Check the correct level of messages are logged + for zapLevel, msg := range logMessages { + zapLevelStr := zapLevel.CapitalString() + if zapLevel >= expectedZapLevel { + verifyLogMessage(t, fileContent, msg, zapLevelStr) + } else { + assert.NotContainsf(t, fileContent, msg, "Log file should not contain message for level %s", zapLevelStr) + } + } +} + +// Check that the log file contents contain a given message +func verifyLogMessage(t *testing.T, fileContent, expectedMsg, level string) { + msgWithLevel := fmt.Sprintf("%s | %s", level, expectedMsg) + assert.Contains(t, fileContent, msgWithLevel, "Log file should contain log level and message") +} + +// Check that the log file contents contain the expected timestamps +func verifyTimestamps(t *testing.T, fileContent string) { + lines := strings.Split(fileContent, "\n") + for _, line := range lines { + if len(line) == 0 { + continue + } + sections := strings.Split(line, " | ") + timestamp := sections[0] + + // Check the timestamp is in UTC + assert.Contains(t, timestamp, "UTC", "Timestamp should be in UTC") + + // Check we can convert the timestamp back into the expected time + _, err := time.Parse(timeFormat, timestamp) + assert.NoErrorf(t, err, "Should be able to convert timestamp \"%s\" using the time format \"%s\"", timestamp, timeFormat) + } +} diff --git a/controller/src/internal/request/request.go b/controller/src/internal/request/request.go new file mode 100644 index 0000000..f5dae54 --- /dev/null +++ b/controller/src/internal/request/request.go @@ -0,0 +1,143 @@ +// Package request contains code for getting a cluster's resize request +// Copyright 2024 The MathWorks, Inc.
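+// Putting the pieces together (summary of the code below): GetRequest execs the MJS resize script on the
+// job manager pod through the Kubernetes API and captures its stdout; processRequest then unmarshals that
+// JSON into a ResizeRequest holding the desired worker count, maximum worker count, and per-worker status.
+// A sketch of the output this package expects (field values are illustrative only):
+//
+//	{"jobManagers": [{"name": "mjs", "desiredWorkers": {"linux": 4}, "maxWorkers": {"linux": 16}, "workers": []}]}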
+package request + +import ( + "bytes" + "controller/internal/config" + "controller/internal/k8s" + "controller/internal/logging" + "encoding/json" + "errors" + "fmt" + "os" + "path/filepath" +) + +// Getter is an interface for getting a cluster's resize request +type Getter interface { + GetRequest() (*ResizeRequest, error) +} + +// MJSRequestGetter implements the Getter interface for an MJS cluster in Kubernetes +type MJSRequestGetter struct { + config *config.Config + logger *logging.Logger + client k8s.Client + exitFunc func() +} + +// NewMJSRequestGetter constructs an MJSRequestGetter +func NewMJSRequestGetter(conf *config.Config, logger *logging.Logger) (*MJSRequestGetter, error) { + client, err := k8s.NewClient(conf, logger) + if err != nil { + return nil, err + } + + m := &MJSRequestGetter{ + config: conf, + logger: logger, + client: client, + exitFunc: func() { os.Exit(1) }, + } + return m, nil +} + +// GetRequest excutes the resize script on the job manager to obtain the cluster's resize request +func (m *MJSRequestGetter) GetRequest() (*ResizeRequest, error) { + rawStatus, err := m.execResizeOnJobManager() + if err != nil { + return nil, fmt.Errorf("error getting resize status via Kubernetes exec: %v", err) + } + return m.processRequest(rawStatus.Bytes()) +} + +// execResizeOnJobManager uses the Kubernetes RESTful interface to execute the resize script on the job manager +func (m *MJSRequestGetter) execResizeOnJobManager() (*bytes.Buffer, error) { + // Create command to run + cmd := getResizeStatusCommand(m.config) + + // Find the job manager pod so we can extract its name; note that the name may change over time if the pod is restarted + pod, err := m.client.GetJobManagerPod() + if err != nil { + return nil, err + } + + // Execute the command + return m.client.ExecOnPod(pod.Name, cmd) +} + +// processRequest converts the raw output of "./resize status" into a ResizeRequest struct +func (m *MJSRequestGetter) processRequest(input []byte) (*ResizeRequest, error) { + // Convert the status bytes to a struct + rawStatus := resizeStatus{} + err := json.Unmarshal(input, &rawStatus) + if err != nil { + return nil, fmt.Errorf("error unmarshaling JSON from resize status output. Error: %v. Raw output: \"%s\"", err, string(input)) + } + + // Extract the resize request from the status struct + numJobManagers := len(rawStatus.JobManagers) + if numJobManagers == 0 { + return nil, errors.New("no job managers found running in the job manager pod") + } + if numJobManagers > 1 { + // A previous job manager was drooled in the checkpoint base, so we should error + jobManagerNames := []string{} + for _, jm := range rawStatus.JobManagers { + jobManagerNames = append(jobManagerNames, jm.Name) + } + msg := fmt.Sprintf("Multiple job managers were found running in the job manager pod: %v. This happens when a process from a previous job manager remains in the checkpoint base folder. 
Uninstall MATLAB Job Scheduler from the Kubernetes cluster, delete the contents of the checkpoint base folder, then try again.", jobManagerNames) + m.logger.Error(msg) + fmt.Println(msg) + m.exitFunc() + } + jobManagerStatus := rawStatus.JobManagers[0] + req := ResizeRequest{} + req.DesiredWorkers = jobManagerStatus.DesiredWorkers.Linux + req.MaxWorkers = jobManagerStatus.MaxWorkers.Linux + req.Workers = jobManagerStatus.Workers + return &req, nil +} + +// ResizeRequest represents a cluster's current and requested number of workers +type ResizeRequest struct { + DesiredWorkers int + MaxWorkers int + Workers []WorkerStatus +} + +// resizeStatus is a struct matching the format of the output of "./resize status" +type resizeStatus struct { + JobManagers []jobManagerStatus +} + +// jobManagerStatus is a struct matching the "JobManagers" field in the output of "./resize status" +type jobManagerStatus struct { + Name string + DesiredWorkers workersPerOS + MaxWorkers workersPerOS + Workers []WorkerStatus +} + +// WorkerStatus is a struct matching the "Workers" field in the job manager output of "./resize status" +type WorkerStatus struct { + Name string + SecondsIdle int + State string +} + +// workersPerOS is a struct matching the output of the DesiredWorkers and MaxWorkers fields in the job manager output of "./resize status" +type workersPerOS struct { + Linux int + Windows int +} + +func getResizeStatusCommand(conf *config.Config) []string { + timeout := k8s.Timeout - 5 // Use a timeout shorter than the Kubernetes client timeout so we don't leave an orphaned process on the pod + cmd := []string{"timeout", fmt.Sprintf("%d", timeout), conf.ResizePath, "status", "-baseport", fmt.Sprintf("%d", conf.BasePort)} + if conf.RequireScriptVerification { + cmd = append(cmd, "-secretfile", filepath.Join(conf.SecretDir, conf.SecretFileName)) + } + return cmd +} diff --git a/controller/src/internal/request/request_test.go b/controller/src/internal/request/request_test.go new file mode 100644 index 0000000..3fc636e --- /dev/null +++ b/controller/src/internal/request/request_test.go @@ -0,0 +1,190 @@ +// Copyright 2024 The MathWorks, Inc. 
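+// These tests exercise MJSRequestGetter against a mocked k8s.Client, so no real cluster or MJS installation
+// is needed: the mock returns a canned job manager pod and canned "resize status" output. For reference, the
+// command built by getResizeStatusCommand and checked below has the form (values illustrative):
+//
+//	timeout 55 <resizePath> status -baseport <basePort> [-secretfile <secretDir>/<secretFileName>]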
+package request + +import ( + "bytes" + "errors" + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" + "go.uber.org/zap/zaptest" + + "controller/internal/config" + "controller/internal/logging" + mockClient "controller/mocks/k8s" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// Test the GetRequest method +func TestGetRequest(t *testing.T) { + verifyGetRequest(t, false) +} + +// Test the GetRequest method when requireScriptVerification=trye +func TestGetRequestWithScriptVerification(t *testing.T) { + verifyGetRequest(t, true) +} + +func verifyGetRequest(t *testing.T, requireScriptVerification bool) { + conf := config.Config{ + ResizePath: "/path/to/resize/script", + RequireScriptVerification: requireScriptVerification, + SecretDir: "/my/secret", + SecretFileName: "secret.json", + } + requestGetter, client := createRequestGetterWithMockClient(t, &conf) + + // Construct expected response + wantReq := ResizeRequest{ + DesiredWorkers: 10, + MaxWorkers: 50, + Workers: []WorkerStatus{ + { + Name: "worker1", + State: "busy", + SecondsIdle: 0, + }, + { + Name: "worker2", + State: "idle", + SecondsIdle: 30, + }, + }, + } + + // Set up mock client to return the job manager pod + jmPodName := "jmPod" + client.EXPECT().GetJobManagerPod().Once().Return(&corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: jmPodName}, + }, nil) + + // Check the command we are going to run + expectedCmd := getResizeStatusCommand(requestGetter.config) + secretFileArg := "-secretfile" + assert.Contains(t, expectedCmd, conf.ResizePath, "command should contain path to resize executable") + if requireScriptVerification { + assert.Containsf(t, expectedCmd, secretFileArg, "resize status command should contain %s when requireScriptVerification is true", secretFileArg) + } else { + assert.NotContainsf(t, expectedCmd, secretFileArg, "resize status command should not contain %s when requireScriptVerification is false", secretFileArg) + } + + // Create the raw string needed to get the expected request + rawReq := fmt.Sprintf(` +{ + "jobManagers": [ + { + "name": "myJobManager", + "host": "myhostname", + "desiredWorkers": { + "linux": %d, + "windows": 0 + }, + "maxWorkers": { + "linux": %d, + "windows": 8 + }, + "workers": [ + { + "name": "%s", + "host": "myhostname", + "operatingSystem": "linux", + "state": "%s", + "secondsIdle": %d + }, + { + "name": "%s", + "host": "myhostname", + "operatingSystem": "linux", + "state": "%s", + "secondsIdle": %d + } + ] + } + ] +}`, wantReq.DesiredWorkers, wantReq.MaxWorkers, wantReq.Workers[0].Name, wantReq.Workers[0].State, wantReq.Workers[0].SecondsIdle, wantReq.Workers[1].Name, wantReq.Workers[1].State, wantReq.Workers[1].SecondsIdle) + + // Set up the mock client to return this string + stdOut := bytes.NewBuffer([]byte(rawReq)) + client.EXPECT().ExecOnPod(jmPodName, expectedCmd).Once().Return(stdOut, nil) + + // Check we get the expected request + gotReq, err := requestGetter.GetRequest() + require.NoError(t, err) + assert.Equal(t, wantReq, *gotReq, "unexpectected resize request returned") +} + +func TestGetJobManagerPodErr(t *testing.T) { + requestGetter, client := createRequestGetterWithMockClient(t, &config.Config{}) + errMsg := "could not get job manager pod" + client.EXPECT().GetJobManagerPod().Once().Return(nil, errors.New(errMsg)) + _, err := requestGetter.GetRequest() + assert.Error(t, err, "should get error when GetJobManagerPod errors") + assert.Contains(t, 
err.Error(), errMsg, "error should contain original error message") +} + +func TestExecError(t *testing.T) { + requestGetter, client := createRequestGetterWithMockClient(t, &config.Config{}) + podName := "jm-pod" + client.EXPECT().GetJobManagerPod().Once().Return(&corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: podName}}, nil) + errMsg := "could not exec the command" + client.EXPECT().ExecOnPod(podName, mock.Anything).Once().Return(nil, errors.New(errMsg)) + _, err := requestGetter.GetRequest() + assert.Error(t, err, "should get error when ExecOnPod errors") + assert.Contains(t, err.Error(), errMsg, "error should contain original error message") +} + +// Check for errors when processing a request that is not valid JSON +func TestProcessStatusInvalidJSON(t *testing.T) { + invalidJSON := "this-is-not-valid" + m, _ := createRequestGetterWithMockClient(t, &config.Config{}) + status, err := m.processRequest([]byte(invalidJSON)) + assert.Error(t, err, "processRequest should error when JSON cannot be unmarshaled") + assert.Nil(t, status, "processRequest should return nil status when JSON cannot be unmarshaled") +} + +// Check that we get an error when there is no job manager +func TestNoJobManagers(t *testing.T) { + rawReq := ` +{ + "jobManagers": [ + ] +}` + m, _ := createRequestGetterWithMockClient(t, &config.Config{}) + status, err := m.processRequest([]byte(rawReq)) + assert.Error(t, err, "processRequest should error when there are no job managers") + assert.Nil(t, status, "processRequest should return nil status when there are no job managers") +} + +// Check that we exit when the request contains multiple job managers, which is not allowed +func TestProcessJSONMultipleJobManagers(t *testing.T) { + rawReq := ` +{ + "jobManagers": [ + { + "name": "manager1" + }, + { + "name": "manager2" + } + ] +}` + m, _ := createRequestGetterWithMockClient(t, &config.Config{}) + didExit := false + m.exitFunc = func() { didExit = true } + m.processRequest([]byte(rawReq)) + assert.True(t, didExit, "Process should have exited when multiple job managers were found") +} + +func createRequestGetterWithMockClient(t *testing.T, conf *config.Config) (*MJSRequestGetter, *mockClient.Client) { + client := mockClient.NewClient(t) + return &MJSRequestGetter{ + client: client, + logger: logging.NewFromZapLogger(zaptest.NewLogger(t)), + config: conf, + }, client +} diff --git a/controller/src/internal/rescaler/rescaler.go b/controller/src/internal/rescaler/rescaler.go new file mode 100644 index 0000000..fe68812 --- /dev/null +++ b/controller/src/internal/rescaler/rescaler.go @@ -0,0 +1,177 @@ +// Package rescaler contains logic for rescaling an MJS cluster in Kubernetes based on its resize status. +// Copyright 2024 The MathWorks, Inc.
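+// The MJSRescaler compares the number of workers requested by MJS with the worker deployments currently present in Kubernetes, then adds or deletes workers to reconcile the two, respecting the configured minimum worker count and idle-stop threshold.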
+package rescaler + +import ( + "controller/internal/config" + "controller/internal/logging" + "controller/internal/request" + "controller/internal/resize" + "controller/internal/specs" + "fmt" + + "go.uber.org/zap" + "k8s.io/apimachinery/pkg/types" +) + +// Rescaler is an interface for performing cluster rescaling +type Rescaler interface { + Rescale() +} + +// MJSRescaler implements the Rescaler interface to rescale an MJS cluster in Kubernetes +type MJSRescaler struct { + logger *logging.Logger + idleStopThreshold int // Time after which idle workers can be stopped + minWorkers int // Minimum number of workers in the cluster at any time + requestGetter request.Getter // Interface to get the cluster's resize request + resizer resize.Resizer // Interface to perform cluster resizing +} + +// NewMJSRescaler constructs an MJSRescaler from a given config struct +func NewMJSRescaler(config *config.Config, ownerUID types.UID, logger *logging.Logger) (*MJSRescaler, error) { + requestGetter, err := request.NewMJSRequestGetter(config, logger) + if err != nil { + return nil, fmt.Errorf("error creating K8s resize status getter: %v", err) + } + + resizer, err := resize.NewMJSResizer(config, ownerUID, logger) + if err != nil { + return nil, fmt.Errorf("error creating K8s cluster resizer: %v", err) + } + + scaler := &MJSRescaler{ + idleStopThreshold: config.IdleStop, + logger: logger, + minWorkers: config.MinWorkers, + requestGetter: requestGetter, + resizer: resizer, + } + return scaler, nil +} + +// Rescale gets the MJS cluster's resize status and resizes the cluster if needed +func (m *MJSRescaler) Rescale() { + // Get the cluster's resize request + status, err := m.requestGetter.GetRequest() + if err != nil { + m.logger.Error("Error getting resize status", zap.Error(err)) + return + } + + // Calculate the desired number of workers + desiredWorkers := status.DesiredWorkers + if desiredWorkers > status.MaxWorkers { + // Note that this scenario should never happen with MJS, so log an error + m.logger.Error("Desired workers should not be larger than max workers", zap.Int("maxWorkers", status.MaxWorkers), zap.Int("desiredWorkers", desiredWorkers)) + desiredWorkers = status.MaxWorkers + } else if desiredWorkers < m.minWorkers { + desiredWorkers = m.minWorkers + } + + // Get a list of the workers currently deployed into K8s. + // This gives us the "true" worker count - these workers have either already connected to MJS, or will eventually connect. + // Any MJS worker not in this list must have already had its deployment deleted, so will soon leave the MJS cluster. 
+ workersInK8s, err := m.resizer.GetWorkers() + numWorkers := len(workersInK8s) + if err != nil { + m.logger.Error("Error getting existing workers from Kubernetes cluster", zap.Error(err)) + return + } + if numWorkers == desiredWorkers { + return + } + + if desiredWorkers < numWorkers { + m.logger.Debug("Reducing number of workers", zap.Int("currentWorkers", numWorkers), zap.Int("desiredWorkers", desiredWorkers)) + toDelete := getWorkersToDelete(numWorkers-desiredWorkers, status.Workers, workersInK8s, m.idleStopThreshold) + if len(toDelete) == 0 { + m.logger.Debug("Did not find any workers available to delete") + return + } + err = m.resizer.DeleteWorkers(toDelete) + if err != nil { + m.logger.Error("Error scaling down cluster", zap.Error(err)) + } + } else { + m.logger.Debug("Increasing number of workers", zap.Int("currentWorkers", numWorkers), zap.Int("desiredWorkers", desiredWorkers)) + toAdd := getWorkersToAdd(desiredWorkers, workersInK8s) + err = m.resizer.AddWorkers(toAdd) + if err != nil { + m.logger.Error("Error scaling up cluster", zap.Error(err)) + } + } +} + +// getWorkersToDelete returns a list of workers that should be deleted +func getWorkersToDelete(numToDelete int, connectedWorkers []request.WorkerStatus, workersInK8s []resize.Worker, idleStopThreshold int) []string { + toDelete := []string{} + inBothLists := getWorkerOverlap(connectedWorkers, workersInK8s) + + // Look for connected workers that have been idle for longer than the idleStopThreshold + for i := len(connectedWorkers) - 1; i >= 0; i-- { + w := connectedWorkers[i] + + // If a worker is no longer present in K8s, it must already be in the process of terminating, so don't try to delete it again + if !inBothLists[w.Name] { + continue + } + + shouldDelete := w.State == "idle" && w.SecondsIdle >= idleStopThreshold + if shouldDelete { + toDelete = append(toDelete, w.Name) + if len(toDelete) == numToDelete { + return toDelete + } + } + } + return toDelete +} + +// getWorkerOverlap returns a map of booleans indicating whether a worker appears in both the list of workers connected to MJS and the list of workers connected to the cluster +func getWorkerOverlap(connectedWorkers []request.WorkerStatus, workersInK8s []resize.Worker) map[string]bool { + isInBoth := map[string]bool{} + + // Insert all of the connected worker names into the map + for _, w := range connectedWorkers { + isInBoth[w.Name] = false + } + + // Insert all of the K8s workers, flipping the value to "true" if the worker is already in the map + for _, w := range workersInK8s { + _, inFirstList := isInBoth[w.Info.Name] + isInBoth[w.Info.Name] = inFirstList + } + + return isInBoth +} + +// getWorkersToAdd computes which workers should be added, starting from the lowest available worker ID, such that we have the desired number of workers +func getWorkersToAdd(desiredWorkers int, workers []resize.Worker) []specs.WorkerInfo { + newWorkers := []specs.WorkerInfo{} + id := 1 + numToAdd := desiredWorkers - len(workers) + + // Create hash map of existing worker names + existingNames := map[string]bool{} + for _, w := range workers { + existingNames[w.Info.Name] = true + } + + for len(newWorkers) < numToAdd { + name := workerIDToName(id) + if !existingNames[name] { + newWorkers = append(newWorkers, specs.WorkerInfo{ + Name: name, + ID: id, + }) + } + id++ + } + return newWorkers +} + +// workerIDToName converts a worker ID to a worker name +func workerIDToName(id int) string { + return fmt.Sprintf("mjs-worker-%d", id) +} diff --git 
a/controller/src/internal/rescaler/rescaler_test.go b/controller/src/internal/rescaler/rescaler_test.go new file mode 100644 index 0000000..059771d --- /dev/null +++ b/controller/src/internal/rescaler/rescaler_test.go @@ -0,0 +1,280 @@ +// Copyright 2024 The MathWorks, Inc. +package rescaler + +import ( + "controller/internal/logging" + "controller/internal/request" + "controller/internal/resize" + "controller/internal/specs" + "errors" + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap/zaptest" + + mockRequest "controller/mocks/request" + mockResize "controller/mocks/resize" +) + +// Test the Rescale method against mock backends +func TestRescale(t *testing.T) { + idleStopThreshold := 10 + testCases := []struct { + desc string + existingWorkers []int + idleTimes []int + desiredWorkers int + maxWorkers int + shouldDelete []string + shouldAdd []int + }{ + { + "increase_workers", + []int{1}, + []int{idleStopThreshold}, + 2, + 10, + []string{}, + []int{2}, + }, { + "decrease_workers", + []int{1, 2}, + []int{idleStopThreshold, idleStopThreshold}, + 1, + 10, + []string{"mjs-worker-2"}, + []int{}, + }, { + "workers_already_max", + []int{1, 2}, + []int{idleStopThreshold, idleStopThreshold}, + 10, + 2, + []string{}, + []int{}, + }, { + "increase_up_to_max", + []int{1}, + []int{idleStopThreshold}, + 10, + 2, + []string{}, + []int{2}, + }, { + "decrease_idle_too_short", + []int{1}, + []int{idleStopThreshold - 1}, + 0, + 10, + []string{}, + []int{}, + }, + } + for _, testCase := range testCases { + t.Run(testCase.desc, func(t *testing.T) { + // Set up mock resize status + states := []string{} + for i := 0; i < len(testCase.existingWorkers); i++ { + states = append(states, "idle") + } + infos := createWorkerInfos(testCase.existingWorkers) + workers := createWorkerStatuses(t, infos, states, testCase.idleTimes) + resizeOutput := request.ResizeRequest{ + DesiredWorkers: testCase.desiredWorkers, + MaxWorkers: testCase.maxWorkers, + Workers: workers, + } + mockRequestGetter := mockRequest.NewGetter(t) + mockRequestGetter.EXPECT().GetRequest().Return(&resizeOutput, nil).Once() + + // Set up mock resizer + mockResizer := mockResize.NewResizer(t) + mockResizer.EXPECT().GetWorkers().Return(infos, nil).Once() + if len(testCase.shouldDelete) > 0 { + mockResizer.EXPECT().DeleteWorkers(testCase.shouldDelete).Return(nil).Once() + } + if len(testCase.shouldAdd) > 0 { + toAdd := []specs.WorkerInfo{} + for _, id := range testCase.shouldAdd { + toAdd = append(toAdd, specs.WorkerInfo{ + Name: fmt.Sprintf("mjs-worker-%d", id), + ID: id, + }) + } + mockResizer.EXPECT().AddWorkers(toAdd).Return(nil).Once() + } + + // Create scaler with mocks + scaler := MJSRescaler{ + requestGetter: mockRequestGetter, + resizer: mockResizer, + idleStopThreshold: idleStopThreshold, + logger: logging.NewFromZapLogger(zaptest.NewLogger(t)), + } + scaler.Rescale() + }) + } +} + +func TestRescaleError(t *testing.T) { + statusWithError := mockRequest.NewGetter(t) + statusWithError.EXPECT().GetRequest().Return(nil, errors.New("error")).Once() + mockResizer := mockResize.NewResizer(t) + scaler := MJSRescaler{ + requestGetter: statusWithError, + resizer: mockResizer, + idleStopThreshold: 10, + logger: logging.NewFromZapLogger(zaptest.NewLogger(t)), + } + scaler.Rescale() +} + +func TestRescaleResizerError(t *testing.T) { + resizerWithError := mockResize.NewResizer(t) + resizerWithError.EXPECT().GetWorkers().Return([]resize.Worker{}, fmt.Errorf("some error")).Once() + + 
mockRequestGetter := mockRequest.NewGetter(t) + clusterStatus := request.ResizeRequest{ + DesiredWorkers: 2, + MaxWorkers: 3, + Workers: []request.WorkerStatus{ + { + Name: "mjs-worker-2", + State: "idle", + SecondsIdle: 0, + }, + }, + } + mockRequestGetter.EXPECT().GetRequest().Return(&clusterStatus, nil).Once() + + scaler := MJSRescaler{ + requestGetter: mockRequestGetter, + resizer: resizerWithError, + idleStopThreshold: 10, + logger: logging.NewFromZapLogger(zaptest.NewLogger(t)), + } + scaler.Rescale() +} + +// Test the logic of the getWorkersToDelete function in various scenarios +func TestGetWorkersToDelete(t *testing.T) { + threshold := 30 + testCases := []struct { + name string + numToDelete int + mjsWorkers []request.WorkerStatus + k8sWorkers []resize.Worker + shouldDelete []string + }{ + { + // Case where we cannot delete any workers because they're all busy + name: "all_workers_busy", + numToDelete: 2, + mjsWorkers: []request.WorkerStatus{ + {Name: "worker1", State: "busy", SecondsIdle: 0}, + {Name: "worker2", State: "busy", SecondsIdle: 0}, + }, + k8sWorkers: []resize.Worker{ + {Info: specs.WorkerInfo{Name: "worker1"}, IsRunning: true}, + {Info: specs.WorkerInfo{Name: "worker2"}, IsRunning: true}, + }, + shouldDelete: []string{}, + }, { + // Case where we cannot delete any workers because none have been idle for long enough + name: "all_workers_below_idle_threshold", + numToDelete: 1, + mjsWorkers: []request.WorkerStatus{ + {Name: "worker1", State: "idle", SecondsIdle: threshold - 1}, + {Name: "worker2", State: "idle", SecondsIdle: threshold - 1}, + }, + k8sWorkers: []resize.Worker{ + {Info: specs.WorkerInfo{Name: "worker1"}, IsRunning: true}, + {Info: specs.WorkerInfo{Name: "worker2"}, IsRunning: true}, + }, + shouldDelete: []string{}, + }, { + // Case where we delete some idle workers + name: "delete_some_idle_workers", + numToDelete: 3, + mjsWorkers: []request.WorkerStatus{ + {Name: "worker1", State: "idle", SecondsIdle: threshold + 1}, + {Name: "worker2", State: "idle", SecondsIdle: threshold - 1}, + {Name: "worker3", State: "idle", SecondsIdle: threshold + 1}, + {Name: "worker4", State: "busy", SecondsIdle: 0}, + }, + k8sWorkers: []resize.Worker{ + {Info: specs.WorkerInfo{Name: "worker1"}, IsRunning: true}, + {Info: specs.WorkerInfo{Name: "worker2"}, IsRunning: true}, + {Info: specs.WorkerInfo{Name: "worker3"}, IsRunning: true}, + {Info: specs.WorkerInfo{Name: "worker4"}, IsRunning: true}, + }, + shouldDelete: []string{"worker1", "worker3"}, + }, { + // Case where we cannot delete workers that haven't connected yet (see g3278906) + name: "do_not_delete_unconnected_workers", + numToDelete: 5, + mjsWorkers: []request.WorkerStatus{ + {Name: "worker3", State: "busy", SecondsIdle: 0}, + {Name: "worker4", State: "idle", SecondsIdle: threshold + 1}, + }, + k8sWorkers: []resize.Worker{ + {Info: specs.WorkerInfo{Name: "worker1"}, IsRunning: false}, // Pod not running + {Info: specs.WorkerInfo{Name: "worker2"}, IsRunning: true}, // Running but not connected to MJS yet + {Info: specs.WorkerInfo{Name: "worker3"}, IsRunning: true}, // Running and connected to MJS + {Info: specs.WorkerInfo{Name: "worker4"}, IsRunning: true}, // Running and connected to MJS + {Info: specs.WorkerInfo{Name: "worker5"}, IsRunning: false}, // Pod not running + }, + shouldDelete: []string{"worker4"}, + }, { + // Case where some workers are still connected to MJS but have already been deleted from K8s, so we should not try to delete them again + name: "workers_already_removed_from_k8s", + numToDelete: 2, + 
mjsWorkers: []request.WorkerStatus{ + {Name: "worker3", State: "idle", SecondsIdle: threshold + 1}, + {Name: "worker4", State: "idle", SecondsIdle: threshold + 1}, + }, + k8sWorkers: []resize.Worker{ + {Info: specs.WorkerInfo{Name: "worker3"}, IsRunning: true}, + // No K8s entry for worker4 - it is already being terminated + }, + shouldDelete: []string{"worker3"}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + toDelete := getWorkersToDelete(tc.numToDelete, tc.mjsWorkers, tc.k8sWorkers, threshold) + assert.LessOrEqual(t, len(toDelete), tc.numToDelete, "Number of workers to delete should never be greater than the requested number to delete") + require.ElementsMatch(t, tc.shouldDelete, toDelete, "Unexpected result from getWorkersToDelete") + }) + } +} + +func createWorkerStatuses(t *testing.T, inputWorkers []resize.Worker, states []string, idleTimes []int) []request.WorkerStatus { + assert.Equal(t, len(inputWorkers), len(states), "states must be same length as input workers") + assert.Equal(t, len(inputWorkers), len(idleTimes), "idleTimes must be same length as input workers") + statuses := []request.WorkerStatus{} + for idx, w := range inputWorkers { + workerStatus := request.WorkerStatus{ + Name: w.Info.Name, + State: states[idx], + SecondsIdle: idleTimes[idx], + } + statuses = append(statuses, workerStatus) + } + return statuses +} + +func createWorkerInfos(ids []int) []resize.Worker { + workers := []resize.Worker{} + for _, id := range ids { + workers = append(workers, resize.Worker{ + Info: specs.WorkerInfo{ + Name: fmt.Sprintf("mjs-worker-%d", id), + ID: id, + }}) + } + return workers +} diff --git a/controller/src/internal/resize/resize.go b/controller/src/internal/resize/resize.go new file mode 100644 index 0000000..26bf7bc --- /dev/null +++ b/controller/src/internal/resize/resize.go @@ -0,0 +1,404 @@ +// Package resize contains code for resizing an MJS cluster in Kubernetes +// Copyright 2024 The MathWorks, Inc.
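+// Workers and parallel pool proxies are managed as Kubernetes deployments; the controller stores their metadata in deployment labels so that the current cluster state can be reconstructed from the Kubernetes API.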
+package resize + +import ( + "controller/internal/config" + "controller/internal/k8s" + "controller/internal/logging" + "controller/internal/specs" + "encoding/json" + "errors" + "fmt" + "strconv" + + "github.com/mathworks/mjssetup/pkg/certificate" + "go.uber.org/zap" + appsv1 "k8s.io/api/apps/v1" + "k8s.io/apimachinery/pkg/types" +) + +// Resize is an interface for resizing a cluster +type Resizer interface { + GetWorkers() ([]Worker, error) + AddWorkers([]specs.WorkerInfo) error + DeleteWorkers([]string) error +} + +// Worker represents a worker currently deployed in the Kubernetes cluster +type Worker struct { + Info specs.WorkerInfo // Metadata for this worker + IsRunning bool // Whether a worker deployment has started running a pod +} + +// MJSResizer implements resizing of an MJS cluster in Kubernetes +type MJSResizer struct { + config *config.Config + logger *logging.Logger + client k8s.Client + specFactory *specs.SpecFactory +} + +// NewMJSResizer constructs an MJSResizer +func NewMJSResizer(conf *config.Config, uid types.UID, logger *logging.Logger) (*MJSResizer, error) { + client, err := k8s.NewClient(conf, logger) + if err != nil { + return nil, err + } + m := MJSResizer{ + config: conf, + logger: logger, + client: client, + } + m.specFactory = specs.NewSpecFactory(conf, uid) + return &m, nil +} + +// AddWorkers adds workers to the Kubernetes cluster +func (m *MJSResizer) AddWorkers(workers []specs.WorkerInfo) error { + m.logger.Info("Adding workers", zap.Any("workers", workers)) + if m.config.UsePoolProxy() { + return m.addWorkersAndProxies(workers) + } + m.addWorkersForInternalCluster(workers) + return nil +} + +// GetWorkers returns a list of the MJS workers currently running on the Kubernetes cluster; this list is determined by examining the worker deployments present on the cluster +func (m *MJSResizer) GetWorkers() ([]Worker, error) { + deployments, err := m.client.GetDeploymentsWithLabel(fmt.Sprintf("%s=%s", specs.AppKey, specs.WorkerLabels.AppLabel)) + if err != nil { + return []Worker{}, err + } + workers := getWorkersFromDeployments(deployments, m.logger) + return workers, nil +} + +// DeleteWorkers deletes a list of MJS workers from a Kubernetes cluster +func (m *MJSResizer) DeleteWorkers(names []string) error { + m.logger.Info("Deleting workers", zap.Any("workers", names)) + existingWorkers, err := m.GetWorkers() + if err != nil { + return fmt.Errorf("error getting existing workers: %v", err) + } + + // Create a hash map of existing workers for efficient lookup + existingWorkerMap := map[string]specs.WorkerInfo{} + for _, w := range existingWorkers { + existingWorkerMap[w.Info.Name] = w.Info + } + + // Attempt to delete each worker in the list + deletedWorkerIDMap := map[int]bool{} + for _, name := range names { + workerInfo, exists := existingWorkerMap[name] + if exists { + err := m.deleteWorker(workerInfo.HostName) + if err != nil { + m.logger.Error("Error deleting worker", zap.String("hostname", workerInfo.HostName), zap.Error(err)) + } else if m.config.UsePoolProxy() { + // Track successfully deleted workers so we know which pool proxies to delete later + deletedWorkerIDMap[workerInfo.ID] = true + } + } + } + + // Clean up resources associated with the workers we successfully deleted + if m.config.UsePoolProxy() { + m.deleteProxiesIfNotNeeded(existingWorkers, deletedWorkerIDMap) + } + return nil +} + +// addWorkersAndProxies adds a list of workers plus any parallel pool proxies they need +func (m *MJSResizer) addWorkersAndProxies(workers []specs.WorkerInfo) 
error { + newProxiesNeeded, noProxyNeeded, err := m.getPoolProxiesNeededForWorkers(workers) + if err != nil { + return err + } + + // First, add workers that do not need a new parallel pool proxy + for _, w := range noProxyNeeded { + err := m.addWorker(&w) + if err != nil { + logAddWorkerError(m.logger, &w, err) + } + } + + // Next, add each parallel pool proxy and then its associated workers + for p, workers := range newProxiesNeeded { + proxy := specs.NewPoolProxyInfo(p, m.config.PoolProxyBasePort) + err := m.addPoolProxy(proxy) + if err != nil { + m.logger.Error("error adding parallel pool proxy", zap.String("name", proxy.Name), zap.Int("ID", proxy.ID), zap.Error(err)) + // We should not add the workers associated with this proxy, since proxy creation failed + continue + } + + workersWereAdded := false + for _, w := range workers { + err := m.addWorker(&w) + if err != nil { + logAddWorkerError(m.logger, &w, err) + } else { + workersWereAdded = true + } + } + + // Clean up the proxy if none of its associated workers were successfully added + if !workersWereAdded { + m.deletePoolProxy(proxy.Name) + } + } + return nil +} + +// getPoolProxiesNeededForWorkers returns a map of parallel pool proxies to create and the workers dependent on them, plus a list of workers that do not need a new proxy +func (m *MJSResizer) getPoolProxiesNeededForWorkers(workers []specs.WorkerInfo) (map[int][]specs.WorkerInfo, []specs.WorkerInfo, error) { + existingProxies, err := m.getPoolProxies() + if err != nil { + return nil, nil, err + } + + // Create hash map of existing proxies + existingProxyIDs := map[int]bool{} + for _, p := range existingProxies { + existingProxyIDs[p.ID] = true + } + + // Check which proxy is needed by each worker + newProxiesNeeded := map[int][]specs.WorkerInfo{} + noProxyNeeded := []specs.WorkerInfo{} + for _, w := range workers { + p := m.specFactory.CalculatePoolProxyForWorker(w.ID) + if existingProxyIDs[p] { + noProxyNeeded = append(noProxyNeeded, w) + } else { + newProxiesNeeded[p] = append(newProxiesNeeded[p], w) + } + } + return newProxiesNeeded, noProxyNeeded, nil +} + +// addWorkers adds a list of workers, without adding pool proxies or exposing worker ports +func (m *MJSResizer) addWorkersForInternalCluster(workers []specs.WorkerInfo) { + for _, w := range workers { + err := m.addWorker(&w) + if err != nil { + logAddWorkerError(m.logger, &w, err) + } + } +} + +// addWorker adds a single MJS worker to the Kubernetes cluster +func (m *MJSResizer) addWorker(w *specs.WorkerInfo) error { + w.GenerateUniqueHostName() + svcSpec := m.specFactory.GetWorkerServiceSpec(w) + _, err := m.client.CreateService(svcSpec) + if err != nil { + return err + } + _, err = m.client.CreateDeployment(m.specFactory.GetWorkerDeploymentSpec(w)) + if err != nil { + m.logger.Error("Error creating pod", zap.Error(err)) + m.logger.Debug("Cleaning up worker service since pod creation failed") + errDeleteSvc := m.client.DeleteService(svcSpec.Name) + if errDeleteSvc != nil { + m.logger.Error("Error cleaning up service after failed pod creation", zap.Error(err)) + } + } + return err +} + +// deleteWorker removes a single MJS worker from the Kubernetes cluster +func (m *MJSResizer) deleteWorker(name string) error { + err := m.client.DeleteDeployment(name) + if err != nil { + // If we failed to delete the deployment, we should not proceed to deleting the service, as this will leave an orphaned deployment + return err + } + err = m.client.DeleteService(name) + if err != nil { + return err + } + return nil +} + +// 
getPoolProxies gets a list of parallel pool proxies currently running on the cluster; this list is determined by examining the deployments present on the cluster +func (m *MJSResizer) getPoolProxies() ([]specs.PoolProxyInfo, error) { + deployments, err := m.client.GetDeploymentsWithLabel(fmt.Sprintf("%s=%s", specs.AppKey, specs.PoolProxyLabels.AppLabel)) + if err != nil { + return []specs.PoolProxyInfo{}, err + } + existingProxies := getProxiesFromDeployments(deployments, m.logger) + return existingProxies, nil +} + +// addPoolProxy adds a parallel pool proxy to the Kubernetes cluster +func (m *MJSResizer) addPoolProxy(proxy specs.PoolProxyInfo) error { + if m.config.UseSecureCommunication { + err := m.createProxyCertificate(proxy.Name) + if err != nil { + m.logger.Error("Error creating certificate for pool proxy", zap.Error(err)) + return err + } + } + + _, err := m.client.CreateDeployment(m.specFactory.GetPoolProxyDeploymentSpec(&proxy)) + if err != nil { + m.logger.Error("Error creating parallel pool proxy deployment", zap.Error(err)) + return err + } + + return err +} + +// createProxyCertificate creates a Kubernetes secret containing a certificate for workers to use when connecting to the proxy +func (m *MJSResizer) createProxyCertificate(name string) error { + // Generate the certificate + certCreator := certificate.New() + sharedSecret, err := certCreator.CreateSharedSecret() + if err != nil { + return err + } + certificate, err := certCreator.GenerateCertificate(sharedSecret) + if err != nil { + return err + } + + // Create spec for Kubernetes secret containing this certificate + certBytes, err := json.Marshal(certificate) + if err != nil { + return fmt.Errorf("error marshaling certificate: %v", err) + } + secretSpec := m.specFactory.GetSecretSpec(name) + secretSpec.Data[specs.ProxyCertFileName] = certBytes + + // Create the Kubernetes secret + _, err = m.client.CreateSecret(secretSpec) + if err != nil { + return err + } + return nil +} + +// deleteProxiesIfNotNeeded deletes any proxies that are no longer needed after worker deletion +func (m *MJSResizer) deleteProxiesIfNotNeeded(originalWorkers []Worker, deletedIDs map[int]bool) error { + toKeep := map[int]bool{} + for _, worker := range originalWorkers { + wasDeleted := deletedIDs[worker.Info.ID] + if !wasDeleted { + toKeep[m.specFactory.CalculatePoolProxyForWorker(worker.Info.ID)] = true + } + } + + existingProxies, err := m.getPoolProxies() + if err != nil { + return err + } + + for _, proxy := range existingProxies { + shouldKeep := toKeep[proxy.ID] + if !shouldKeep { + m.deletePoolProxy(proxy.Name) + } + } + return nil +} + +// deletePoolProxy removes a parallel pool proxy from the Kubernetes cluster +func (m *MJSResizer) deletePoolProxy(proxyName string) error { + m.logger.Info("Deleting parallel pool proxy", zap.String("name", proxyName)) + if m.config.UseSecureCommunication { + err := m.client.DeleteSecret(proxyName) + if err != nil { + return err + } + } + return m.client.DeleteDeployment(proxyName) +} + +// getWorkersFromDeployments converts a list of worker deployments to a list of worker details +func getWorkersFromDeployments(deployments *appsv1.DeploymentList, logger *logging.Logger) []Worker { + var workers []Worker + for _, d := range deployments.Items { + // Make sure the required labels are present + err := checkLabelsExist(d.Labels, []string{specs.WorkerLabels.Name, specs.WorkerLabels.ID, specs.WorkerLabels.HostName}) + if err != nil { + logger.Error("Worker deployment has missing label", zap.Error(err), 
zap.String("name", d.Name)) + } + + // Convert the worker ID label to an int + workerID, err := getIntFromLabel(d.Labels, specs.WorkerLabels.ID, logger) + if err != nil { + continue + } + + // Populate worker information from labels + workers = append(workers, Worker{ + Info: specs.WorkerInfo{ + Name: d.Labels[specs.WorkerLabels.Name], + ID: workerID, + HostName: d.Labels[specs.WorkerLabels.HostName], + }, + IsRunning: d.Status.ReadyReplicas == 1, + }) + } + return workers +} + +// getProxiesFromDeployments converts a list of proxy deployments to a list of proxy details +func getProxiesFromDeployments(deployments *appsv1.DeploymentList, logger *logging.Logger) []specs.PoolProxyInfo { + var proxies []specs.PoolProxyInfo + for _, d := range deployments.Items { + // Make sure the required labels are present + err := checkLabelsExist(d.Labels, []string{specs.PoolProxyLabels.Name, specs.PoolProxyLabels.ID, specs.PoolProxyLabels.Port}) + if err != nil { + logger.Error("Proxy deployment has missing label", zap.Error(err), zap.String("name", d.Name)) + } + + // Convert labels to ints + proxyID, err := getIntFromLabel(d.Labels, specs.PoolProxyLabels.ID, logger) + if err != nil { + continue + } + port, err := getIntFromLabel(d.Labels, specs.PoolProxyLabels.Port, logger) + if err != nil { + continue + } + + // Populate proxy information from labels + proxies = append(proxies, specs.PoolProxyInfo{ + Name: d.Labels[specs.PoolProxyLabels.Name], + ID: proxyID, + Port: port, + }) + } + return proxies +} + +// checkLabelsExist errors if every element in a list of labels is not present in a label map +func checkLabelsExist(labels map[string]string, checkFor []string) error { + for _, label := range checkFor { + _, ok := labels[label] + if !ok { + return fmt.Errorf("missing label %s", label) + } + } + return nil +} + +// getIntFromLabel extracts an int from a label map and returns an error if the conversion fails +func getIntFromLabel(labels map[string]string, key string, logger *logging.Logger) (int, error) { + resultStr := labels[key] + result, err := strconv.Atoi(resultStr) + if err != nil { + logger.Error("label could not be converted to int", zap.String("label", key), zap.String("value", resultStr), zap.Error(err)) + return 0, errors.New("invalid label") + } + return result, nil +} + +func logAddWorkerError(logger *logging.Logger, w *specs.WorkerInfo, err error) { + logger.Error("error adding worker", zap.String("name", w.Name), zap.Int("ID", w.ID), zap.String("hostname", w.HostName), zap.Error(err)) +} diff --git a/controller/src/internal/resize/resize_test.go b/controller/src/internal/resize/resize_test.go new file mode 100644 index 0000000..6f06392 --- /dev/null +++ b/controller/src/internal/resize/resize_test.go @@ -0,0 +1,684 @@ +// Copyright 2024 The MathWorks, Inc. 
+package resize + +import ( + "controller/internal/config" + "controller/internal/k8s" + "controller/internal/logging" + "controller/internal/specs" + "encoding/json" + "errors" + "fmt" + "testing" + + "github.com/mathworks/mjssetup/pkg/certificate" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap/zaptest" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" + k8sTesting "k8s.io/client-go/testing" +) + +// Test the functions to add, get and delete workers +func TestWorkerFuncs(t *testing.T) { + resizer, _ := getResizerWithFakeClient(t) + verifyGetWorkersResponse(t, resizer, []Worker{}) + + // Add a worker + workerInfo := specs.WorkerInfo{Name: "worker1", ID: 1} + worker := Worker{Info: workerInfo, IsRunning: false} + err := resizer.addWorker(&workerInfo) + assert.NoError(t, err, "error adding worker") + verifyNumWorkersInK8s(t, resizer, 1) + verifyDeploymentAndService(t, resizer, workerInfo.HostName) + verifyGetWorkersResponse(t, resizer, []Worker{worker}) + + // Add a second worker + workerInfo2 := specs.WorkerInfo{Name: "worker2", ID: 2} + worker2 := Worker{Info: workerInfo2, IsRunning: false} + err = resizer.addWorker(&workerInfo2) + assert.NoError(t, err, "error adding worker") + verifyNumWorkersInK8s(t, resizer, 2) + verifyDeploymentAndService(t, resizer, workerInfo2.HostName) + verifyGetWorkersResponse(t, resizer, []Worker{worker, worker2}) + + // Check we can delete the first worker + err = resizer.deleteWorker(workerInfo.HostName) + assert.NoError(t, err, "error deleting worker") + verifyNumWorkersInK8s(t, resizer, 1) + verifyDeploymentAndService(t, resizer, workerInfo2.HostName) + verifyGetWorkersResponse(t, resizer, []Worker{worker2}) +} + +// verify GetWorkers returns the expected list of workers +func verifyGetWorkersResponse(t *testing.T, resizer *MJSResizer, expectedWorkers []Worker) { + gotWorkers, err := resizer.GetWorkers() + assert.NoError(t, err, "error getting workers") + require.Equal(t, len(expectedWorkers), len(gotWorkers), "unexpected number of workers") + + // Create hash map of found workers + foundWorkers := map[string]bool{} + for _, w := range gotWorkers { + foundWorkers[w.Info.Name] = true + } + + // Check we found all expected workers + if len(expectedWorkers) > 0 { + for _, w := range expectedWorkers { + found := foundWorkers[w.Info.Name] + require.Truef(t, found, "worker %s not found in GetWorkers response", w.Info.Name) + } + } +} + +// verify that K8s has the expected number of worker resources +func verifyNumWorkersInK8s(t *testing.T, resizer *MJSResizer, n int) { + workerMatch := fmt.Sprintf("%s=%s", specs.AppKey, specs.WorkerLabels.AppLabel) + verifyNumDeployments(t, resizer, n, workerMatch) + verifyNumServices(t, resizer, n, workerMatch) +} + +// verify that a K8s deployment and service with a given name exist +func verifyDeploymentAndService(t *testing.T, resizer *MJSResizer, name string) { + verifyDeploymentExists(t, resizer, name) + verifyServiceExists(t, resizer, name) +} + +// verify that a K8s deployment and secret for a given proxy exist +func verifyProxyResources(t *testing.T, resizer *MJSResizer, name string, useSecureCommunication bool) { + verifyDeploymentExists(t, resizer, name) + if useSecureCommunication { + verifyProxySecret(t, resizer, name) + } else { + verifySecretDoesNotExist(t, resizer, name) + } +} + +// Test the functions to add, get and delete proxies 
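+// The test runs once with secure communication enabled and once without, since the proxy certificate secret is only created in the secure case.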
+func TestProxyFuncs(t *testing.T) { + testCases := []struct { + name string + useSecureCommunication bool + }{ + {"secure", true}, + {"insecure", false}, + } + for _, tc := range testCases { + t.Run(tc.name, func(tt *testing.T) { + verifyProxyFuncs(tt, tc.useSecureCommunication) + }) + } +} + +// Test proxy functions with or without secure communication +func verifyProxyFuncs(t *testing.T, useSecureCommunication bool) { + resizer, _ := getResizerWithFakeClient(t) + resizer.config.UseSecureCommunication = useSecureCommunication + verifyGetProxiesResponse(t, resizer, []specs.PoolProxyInfo{}) + + // Add a proxy + proxy := specs.NewPoolProxyInfo(5, resizer.config.PoolProxyBasePort) + err := resizer.addPoolProxy(proxy) + assert.NoError(t, err, "error adding proxy") + verifyNumProxiesInK8s(t, resizer, 1) + verifyProxyResources(t, resizer, proxy.Name, useSecureCommunication) + verifyGetProxiesResponse(t, resizer, []specs.PoolProxyInfo{proxy}) + + // Add a second proxy + proxy2 := specs.NewPoolProxyInfo(2, resizer.config.PoolProxyBasePort) + err = resizer.addPoolProxy(proxy2) + assert.NoError(t, err, "error adding proxy") + verifyNumProxiesInK8s(t, resizer, 2) + verifyProxyResources(t, resizer, proxy2.Name, useSecureCommunication) + verifyGetProxiesResponse(t, resizer, []specs.PoolProxyInfo{proxy, proxy2}) + + // Check we can delete the first proxy + err = resizer.deletePoolProxy(proxy.Name) + assert.NoError(t, err, "error deleting proxy") + verifySecretDoesNotExist(t, resizer, proxy.Name) + verifyNumProxiesInK8s(t, resizer, 1) + verifyProxyResources(t, resizer, proxy2.Name, useSecureCommunication) + verifyGetProxiesResponse(t, resizer, []specs.PoolProxyInfo{proxy2}) +} + +// verify getProxies returns the expected list of proxies +func verifyGetProxiesResponse(t *testing.T, resizer *MJSResizer, expectedProxies []specs.PoolProxyInfo) { + proxies, err := resizer.getPoolProxies() + assert.NoError(t, err, "error getting proxies") + require.Equal(t, len(expectedProxies), len(proxies), "unexpected number of workers") + if len(expectedProxies) > 0 { + for _, proxy := range expectedProxies { + found := false + for _, gotProxy := range proxies { + if proxy == gotProxy { + found = true + break + } + } + require.Truef(t, found, "proxy %s not found in getProxies response", proxy.Name) + } + } +} + +// verify that K8s has the expected number of proxy resources +func verifyNumProxiesInK8s(t *testing.T, resizer *MJSResizer, n int) { + proxyMatch := fmt.Sprintf("%s=%s", specs.AppKey, specs.PoolProxyLabels.AppLabel) + verifyNumDeployments(t, resizer, n, proxyMatch) +} + +func TestGetProxiesNeededForWorkers(t *testing.T) { + testCases := []struct { + name string + addedWorkerIDs []int + workersPerPoolProxy int + existingProxies []int + expectedNewProxies []int + }{ + { + "oneWorker", []int{1}, 1, []int{}, []int{1}, + }, { + "manyWorkersOneProxy", []int{1, 2, 3}, 3, []int{}, []int{1}, + }, { + "manyWorkersManyProxies", []int{2, 4, 6}, 2, []int{}, []int{1, 2, 3}, + }, { + "noNewProxiesNeeded", []int{1, 2}, 2, []int{1}, []int{}, + }, { + "existingProxiesMoreNeeded", []int{2, 4, 6}, 2, []int{2}, []int{1, 3}, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + conf := config.Config{ + WorkersPerPoolProxy: tc.workersPerPoolProxy, + } + resizer, _ := getResizerWithFakeClientAndConfig(t, &conf) + + // add pre-existing proxies + proxies := []specs.PoolProxyInfo{} + for _, p := range tc.existingProxies { + proxy := specs.NewPoolProxyInfo(p, resizer.config.PoolProxyBasePort) + 
resizer.addPoolProxy(proxy) + proxies = append(proxies, proxy) + } + verifyGetProxiesResponse(t, resizer, proxies) + + // test the getProxiesNeededForWorkers method + addedWorkerInfos := []specs.WorkerInfo{} + for _, w := range tc.addedWorkerIDs { + addedWorkerInfos = append(addedWorkerInfos, specs.WorkerInfo{ID: w}) + } + newProxies, noProxyNeeded, err := resizer.getPoolProxiesNeededForWorkers(addedWorkerInfos) + assert.NoError(t, err) + + // Check we get the expected list of proxies to add + newProxyIDs := []int{} + for p := range newProxies { + newProxyIDs = append(newProxyIDs, p) + } + assert.ElementsMatch(t, tc.expectedNewProxies, newProxyIDs, "Did not get expected proxies to add") + + // Check that the lists together contain all of the added workers + numWorkersWithProxies := 0 + for _, workers := range newProxies { + numWorkersWithProxies += len(workers) + } + assert.Equal(t, len(tc.addedWorkerIDs), numWorkersWithProxies+len(noProxyNeeded), "Number of workers with proxies + number of workers that do not need a proxy should equal the number of added workers") + }) + } +} + +// check that deleteProxiesIfNotNeeded deletes the expected proxies when workers are removed +func TestDeleteProxiesIfNotNeeded(t *testing.T) { + testCases := []struct { + name string + workersPerPoolProxy int + origWorkerIDs []int + deletedWorkerIDs []int + existingProxies []int + expectedFinalProxies []int + }{ + {"allProxiesStillNeeded", 2, []int{1, 2, 3, 4}, []int{2, 3}, []int{1, 2}, []int{1, 2}}, + {"removeSomeProxies", 2, []int{1, 2, 3, 4, 5, 6, 7, 8}, []int{1, 3, 4, 6, 7, 8}, []int{1, 2, 3, 4}, []int{1, 3}}, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + conf := config.Config{ + WorkersPerPoolProxy: tc.workersPerPoolProxy, + } + resizer, _ := getResizerWithFakeClientAndConfig(t, &conf) + + // Create the pre-existing proxies + proxies := []specs.PoolProxyInfo{} + for _, p := range tc.existingProxies { + proxy := specs.NewPoolProxyInfo(p, resizer.config.PoolProxyBasePort) + resizer.addPoolProxy(proxy) + proxies = append(proxies, proxy) + } + verifyGetProxiesResponse(t, resizer, proxies) + + // Trigger proxies to be deleted + origWorkers := []Worker{} + for _, w := range tc.origWorkerIDs { + origWorkers = append(origWorkers, Worker{Info: specs.WorkerInfo{Name: "testworker", ID: w}}) + } + deletedIDsMap := map[int]bool{} + for _, w := range tc.deletedWorkerIDs { + deletedIDsMap[w] = true + } + resizer.deleteProxiesIfNotNeeded(origWorkers, deletedIDsMap) + + // Check the remaining proxies are as expected + finalProxies := []specs.PoolProxyInfo{} + for _, ep := range tc.expectedFinalProxies { + finalProxies = append(finalProxies, specs.NewPoolProxyInfo(ep, resizer.config.PoolProxyBasePort)) + } + verifyGetProxiesResponse(t, resizer, finalProxies) + }) + } +} + +// end-to-end test of cluster scaling up and down +func TestClusterScaling(t *testing.T) { + testCases := []struct { + name string + useProxy bool + }{ + {"use_proxy", true}, + {"no_proxy", false}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + verifyEndToEndScaling(t, tc.useProxy) + }) + } +} + +func verifyEndToEndScaling(t *testing.T, useProxy bool) { + conf := config.Config{ + InternalClientsOnly: !useProxy, + WorkersPerPoolProxy: 2, + } + resizer, _ := getResizerWithFakeClientAndConfig(t, &conf) + + // Check there are no workers initially + verifyClusterState(t, resizer, []specs.WorkerInfo{}, useProxy) + + // Add some workers + workers1 := []specs.WorkerInfo{ + {Name: "worker1", ID: 1}, + 
{Name: "worker2", ID: 2}, + {Name: "worker3", ID: 3}, + } + err := resizer.AddWorkers(workers1) + assert.NoError(t, err) + verifyClusterState(t, resizer, workers1, useProxy) + + // Add some more workers + workers2 := []specs.WorkerInfo{ + {Name: "worker4", ID: 4}, + {Name: "worker5", ID: 5}, + } + resizer.AddWorkers(workers2) + allWorkers := append(workers1, workers2...) + verifyClusterState(t, resizer, allWorkers, useProxy) + + // Delete some workers + toDelete := []string{"worker2", "worker4"} + resizer.DeleteWorkers(toDelete) + + // Check remaining workers + remainingWorkers := []specs.WorkerInfo{} + remainingWorkerNames := []string{} + for _, w := range allWorkers { + keeping := true + for _, d := range toDelete { + if d == w.Name { + keeping = false + break + } + } + if keeping { + remainingWorkers = append(remainingWorkers, w) + remainingWorkerNames = append(remainingWorkerNames, w.Name) + } + } + assert.Equal(t, len(allWorkers)-len(toDelete), len(remainingWorkers), "remaining workers list has incorrect length") + verifyClusterState(t, resizer, remainingWorkers, useProxy) + + // Clean up remaining workers + resizer.DeleteWorkers(remainingWorkerNames) + verifyClusterState(t, resizer, []specs.WorkerInfo{}, useProxy) +} + +// verify that K8s cluster has resources corresponding to a set of workers, including the required proxies if using proxies +func verifyClusterState(t *testing.T, resizer *MJSResizer, workers []specs.WorkerInfo, usingProxy bool) { + + // verify workers + verifyNumWorkersInK8s(t, resizer, len(workers)) + proxies := []specs.PoolProxyInfo{} + for _, w := range workers { + // note that AddWorkers generates unique names for the worker deployments, so we should check using the worker name label rather than the deployment name + verifyDeploymentAndServiceForWorkerName(t, resizer, w.Name) + + if usingProxy { + // Verify that we have the required proxy for this worker + proxy := specs.NewPoolProxyInfo(resizer.specFactory.CalculatePoolProxyForWorker(w.ID), resizer.config.PoolProxyBasePort) + verifyDeploymentExists(t, resizer, proxy.Name) + + // add to list of proxies to check on load balancer + isInList := false + for _, p := range proxies { + if p.ID == proxy.ID { + isInList = true + break + } + } + if !isInList { + proxies = append(proxies, proxy) + } + } + } +} + +// Get an MJSResizer with a mocked-out Kubernetes backend +func getResizerWithFakeClient(t *testing.T) (*MJSResizer, *fake.Clientset) { + conf := config.Config{} + return getResizerWithFakeClientAndConfig(t, &conf) +} + +// Get an MJSResizer with a mocked-out Kubernetes backend for a given config struct +func getResizerWithFakeClientAndConfig(t *testing.T, conf *config.Config) (*MJSResizer, *fake.Clientset) { + fakeK8s := fake.NewSimpleClientset() + + // Add some standard config settings + lbName := "test-proxy" + namespace := "test-ns" + conf.BasePort = 27350 + conf.PortsPerWorker = 2 + conf.LoadBalancerName = lbName + conf.Namespace = namespace + + // Add dummy LoadBalancer service + logger := logging.NewFromZapLogger(zaptest.NewLogger(t)) + fakeClient := k8s.NewClientWithK8sBackend(conf, fakeK8s, logger) + lb := corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: lbName, + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeLoadBalancer, + }, + } + _, err := fakeClient.CreateService(&lb) + require.NoError(t, err, "Error creating load balancer service against fake K8s client") + + return &MJSResizer{ + client: fakeClient, + logger: logger, + config: conf, + specFactory: specs.NewSpecFactory(conf, 
"abcd"), + }, fakeK8s +} + +// verify that a deployment of a given name exists +func verifyDeploymentExists(t *testing.T, resizer *MJSResizer, name string) { + dep, err := resizer.client.GetDeployment(name) + assert.NoError(t, err, "error getting deployment") + assert.NotNil(t, dep, "deployment should not be nil") +} + +// verify that a service of a given name exists +func verifyServiceExists(t *testing.T, resizer *MJSResizer, name string) { + svc, err := resizer.client.GetService(name) + assert.NoError(t, err, "error getting service") + assert.NotNil(t, svc, "service should not be nil") +} + +// verify that a proxy secret exists and contains the expected data +func verifyProxySecret(t *testing.T, resizer *MJSResizer, name string) { + secret, exists, err := resizer.client.SecretExists(name) + require.NoError(t, err, "error getting secret") + require.Truef(t, exists, "secret %s should exist", name) + require.NotNil(t, secret, "secret should not be nil") + + // Check the secret's contents + assert.Contains(t, secret.Data, specs.ProxyCertFileName, "proxy secret should contain certificate file") + certBytes := secret.Data[specs.ProxyCertFileName] + assert.NotEmpty(t, certBytes, "proxy secret should not be empty") + gotCert := certificate.Certificate{} + err = json.Unmarshal(certBytes, &gotCert) + assert.NoError(t, err, "error unmarshaling proxy certificate") + assert.NotEmpty(t, gotCert.ClientCert) + assert.NotEmpty(t, gotCert.ClientKey) + assert.NotEmpty(t, gotCert.ServerCert) +} + +// verify that a secret of a given name does not exist +func verifySecretDoesNotExist(t *testing.T, resizer *MJSResizer, name string) { + _, exists, err := resizer.client.SecretExists(name) + require.NoError(t, err, "error getting secret") + assert.Falsef(t, exists, "secret %s should not exist", name) +} + +// verify that a deployment and a service exist for a given worker name +// note that the deployment name will be a unique, auto-generated name +func verifyDeploymentAndServiceForWorkerName(t *testing.T, resizer *MJSResizer, name string) { + deps, err := resizer.client.GetDeploymentsWithLabel(fmt.Sprintf("workerName=%s", name)) + assert.NoError(t, err, "error getting deployments") + require.Equalf(t, 1, len(deps.Items), "expected 1 deployment for worker name %s", name) + + svcs, err := resizer.client.GetServicesWithLabel(fmt.Sprintf("workerName=%s", name)) + assert.NoError(t, err, "error getting service") + require.Equalf(t, 1, len(svcs.Items), "expected 1 service for worker name %s", name) +} + +// verify that a K8s cluster has a given number of deployments with a given label selector +func verifyNumDeployments(t *testing.T, resizer *MJSResizer, n int, labelSelector string) { + deps, err := resizer.client.GetDeploymentsWithLabel(labelSelector) + assert.NoError(t, err, "error getting deployments") + require.Equal(t, n, len(deps.Items), "did not find expected number of deployments") +} + +// verify that a K8s cluster has a given number of services with a given label selector +func verifyNumServices(t *testing.T, resizer *MJSResizer, n int, labelSelector string) { + svcs, err := resizer.client.GetServicesWithLabel(labelSelector) + assert.NoError(t, err, "error getting services") + require.Equal(t, n, len(svcs.Items), "did not find expected number of services") +} + +// Test the extraction of worker info from a list of deployments +func TestGetWorkersFromDeployments(t *testing.T) { + // Create list of workers + workers := []Worker{ + { + Info: specs.WorkerInfo{ + Name: "runningWorker", + ID: 1, + HostName: 
"worker1-uid", + }, + IsRunning: true, + }, + { + Info: specs.WorkerInfo{ + Name: "pendingWorker", + ID: 2, + HostName: "worker2-uid", + }, + IsRunning: false, + }, + } + + // Create deployments for these workers + specFactory := specs.NewSpecFactory(&config.Config{}, "abcd") + depList := appsv1.DeploymentList{} + for _, w := range workers { + deployment := specFactory.GetWorkerDeploymentSpec(&w.Info) + if w.IsRunning { + deployment.Status.ReadyReplicas = 1 + } + depList.Items = append(depList.Items, *deployment) + } + + gotWorkers := getWorkersFromDeployments(&depList, logging.NewFromZapLogger(zaptest.NewLogger(t))) + assert.Equal(t, workers, gotWorkers, "Incorrect worker list returned by GetWorkersFromDeployments") +} + +func TestGetProxiesFromDeployments(t *testing.T) { + // Create list of proxies + proxies := []specs.PoolProxyInfo{ + { + ID: 1, + Name: "proxy1", + Port: 30000, + }, { + ID: 4, + Name: "proxy4", + Port: 30005, + }, { + ID: 6, + Name: "proxy6", + Port: 30009, + }, + } + + // Create deployments for these proxies + specFactory := specs.NewSpecFactory(&config.Config{}, "abcd") + depList := appsv1.DeploymentList{} + for _, p := range proxies { + depList.Items = append(depList.Items, *specFactory.GetPoolProxyDeploymentSpec(&p)) + } + + gotProxies := getProxiesFromDeployments(&depList, logging.NewFromZapLogger(zaptest.NewLogger(t))) + assert.Equal(t, proxies, gotProxies, "Incorrect proxies list returned by GetProxiesFromDeployments") +} + +// Verify that we don't get drooled resources when adding a worker fails +func TestAddWorkerNegative(t *testing.T) { + t.Run("deployment_fails", func(tt *testing.T) { + resizer, fakeK8s := getResizerWithFakeClient(t) + setupDeploymentFailure(fakeK8s, "", "") + w := specs.WorkerInfo{Name: "test", ID: 10} + err := resizer.addWorker(&w) + assert.Error(tt, err, "Should get error when worker deployment creation fails") + verifyNumWorkersInK8s(tt, resizer, 0) + }) + + t.Run("service_fails", func(tt *testing.T) { + resizer, client := getResizerWithFakeClient(t) + setupServiceFailure(client, "", "") + w := specs.WorkerInfo{Name: "test", ID: 10} + err := resizer.addWorker(&w) + assert.Error(tt, err, "Should get error when worker service creation fails") + verifyNumWorkersInK8s(tt, resizer, 0) + }) +} + +// Verify that we don't get drooled resources when adding a proxy fails +func TestAddProxyNegative(t *testing.T) { + resizer, client := getResizerWithFakeClient(t) + setupDeploymentFailure(client, "", "") + p := specs.PoolProxyInfo{Name: "test", ID: 3, Port: 2000} + err := resizer.addPoolProxy(p) + assert.Error(t, err, "Should get error when proxy deployment creation fails") + verifyNumProxiesInK8s(t, resizer, 0) +} + +// Verify that workers associated with a proxy do not get started if we fail to add that proxy +func TestAddWorkersProxyFailure(t *testing.T) { + conf := config.Config{ + WorkersPerPoolProxy: 4, + } + resizer, client := getResizerWithFakeClientAndConfig(t, &conf) + + // Add some pre-existing workers + initWorkers := []specs.WorkerInfo{ + {Name: "worker1", ID: 1}, + {Name: "worker2", ID: 2}, + } + err := resizer.addWorkersAndProxies(initWorkers) + assert.NoError(t, err) + verifyNumWorkersInK8s(t, resizer, len(initWorkers)) + verifyNumProxiesInK8s(t, resizer, 1) + + // Add some more workers, but this time error when creating proxy #2 + // All workers associated with that proxy should not be created, whereas all other workers should be successfully created + setupDeploymentFailure(client, specs.PoolProxyLabels.ID, "2") + toAddExistingProxy 
:= []specs.WorkerInfo{ // Workers that use the already-existing proxy #1 + {Name: "worker3", ID: 3}, + {Name: "worker4", ID: 4}, + } + toAddProxyError := []specs.WorkerInfo{ // Workers that will trigger creation of proxy #2, which fails + {Name: "worker5", ID: 5}, + {Name: "worker6", ID: 6}, + } + toAddNoProxyError := []specs.WorkerInfo{ // Workers that will trigger creation of proxy #3, which succeeds + {Name: "worker10", ID: 10}, + } + toAddSuccess := append(toAddExistingProxy, toAddNoProxyError...) + toAdd := append(toAddSuccess, toAddProxyError...) + err = resizer.addWorkersAndProxies(toAdd) + assert.NoError(t, err) // Note we don't expect an error, since some parts of the creation succeeded + + // Check the cluster only contains the original workers + the workers that did not need a new proxy + verifyNumProxiesInK8s(t, resizer, 2) + verifyNumWorkersInK8s(t, resizer, len(initWorkers)+len(toAddSuccess)) + for _, w := range toAddSuccess { + verifyDeploymentAndServiceForWorkerName(t, resizer, w.Name) + } +} + +// Verify that a proxy is torn down if we fail to add any associated workers +func TestAddProxyWorkersAllFail(t *testing.T) { + conf := config.Config{ + WorkersPerPoolProxy: 2, + } + resizer, client := getResizerWithFakeClientAndConfig(t, &conf) + setupDeploymentFailure(client, specs.AppKey, specs.WorkerLabels.AppLabel) + toAdd := []specs.WorkerInfo{ + {Name: "worker1", ID: 1}, + {Name: "worker2", ID: 2}, + } + err := resizer.addWorkersAndProxies(toAdd) + assert.NoError(t, err) + verifyNumWorkersInK8s(t, resizer, 0) + verifyNumProxiesInK8s(t, resizer, 0) +} + +// Return a fake Kubernetes client that errors when trying to create a deployment, optionally only for deployments with a given label +func setupDeploymentFailure(fakeK8s *fake.Clientset, labelKey, labelVal string) { + useCustomLabel := labelKey != "" + fakeK8s.PrependReactor("create", "deployments", func(action k8sTesting.Action) (bool, runtime.Object, error) { + if useCustomLabel { + createAction := action.(k8sTesting.CreateAction) + dep := createAction.GetObject().(*appsv1.Deployment) + hasLabel := dep.Labels[labelKey] == labelVal + if hasLabel { + return true, nil, fmt.Errorf("failed to create deployment with label %s=%s", labelKey, labelVal) + } + return false, nil, nil // Let the action progress to the fake clientset + } + return true, nil, errors.New("failed to create a deployment") + }) +} + +// Return a fake Kubernetes client that errors when trying to create a service, optionally only for services with a given label +func setupServiceFailure(client *fake.Clientset, labelKey, labelVal string) { + useCustomLabel := labelKey != "" + client.PrependReactor("create", "services", func(action k8sTesting.Action) (bool, runtime.Object, error) { + if useCustomLabel { + createAction := action.(k8sTesting.CreateAction) + svc := createAction.GetObject().(*corev1.Service) + hasLabel := svc.Labels[labelKey] == labelVal + if hasLabel { + return true, nil, fmt.Errorf("failed to create service with label %s=%s", labelKey, labelVal) + } + return false, nil, nil // Let the action progress to the fake clientset + } + return true, nil, errors.New("failed to create a service") + }) +} diff --git a/controller/src/internal/specs/specs.go b/controller/src/internal/specs/specs.go new file mode 100644 index 0000000..2332d95 --- /dev/null +++ b/controller/src/internal/specs/specs.go @@ -0,0 +1,644 @@ +// Package specs contains functions for creating Kubernetes resource specs +// Copyright 2024 The MathWorks, Inc. 
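+// The SpecFactory translates controller configuration into the deployment, service, and secret specs needed for the job manager, workers, and parallel pool proxies.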
+package specs + +import ( + "controller/internal/config" + "fmt" + "math" + "path/filepath" + "strings" + + "github.com/google/uuid" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" +) + +// SpecFactory generates Kubernetes resource specs +type SpecFactory struct { + config *config.Config + ownerRefs []metav1.OwnerReference +} + +// Volume names +const ( + matlabVolumeName = "matlab-volume" + logVolumeName = "log-volume" + secretVolumeName = "secret-volume" + mjsDefVolumeName = "mjs-volume" + checkpointVolumeName = "checkpoint-volume" + proxyCertVolumeName = "proxy-cert-volume" +) + +// Secret names +const ( + SharedSecretName = "mjs-shared-secret" + AdminPasswordSecretName = "mjs-admin-password" + AdminPasswordKey = "password" +) + +// File names +const ( + ProxyCertFileName = "certificate.json" + proxyCertDir = "/proxy-cert" +) + +// NewSpecFactory constructs a SpecFactory +func NewSpecFactory(conf *config.Config, ownerUID types.UID) *SpecFactory { + // Store owner reference for all created resources + ownerRefs := []metav1.OwnerReference{} + if !conf.LocalDebugMode { + ownerRefs = []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: conf.DeploymentName, + UID: ownerUID, + }, + } + } + return &SpecFactory{ + config: conf, + ownerRefs: ownerRefs, + } +} + +// GetWorkerDeploymentSpec creates a spec for a deployment running an MJS worker pod +func (s *SpecFactory) GetWorkerDeploymentSpec(w *WorkerInfo) *appsv1.Deployment { + // Create container to run a worker + workerScript := filepath.Join(s.config.MJSDefDir, "worker.sh") + container := corev1.Container{ + Name: "mjs-worker", + Image: s.config.MatlabImage, + ImagePullPolicy: corev1.PullPolicy(s.config.MatlabImagePullPolicy), + Command: []string{"/bin/sh"}, + Args: []string{workerScript}, + } + addResourceRequests(&container, s.config.WorkerCPURequest, s.config.WorkerMemoryRequest) + addResourceLimits(&container, s.config.WorkerCPULimit, s.config.WorkerMemoryLimit) + + // Add environment variables + minMPIPort, maxMPIPort := s.getMPIPorts() + workerEnv := map[string]string{ + "MJS_WORKER_USERNAME": s.config.WorkerUsername, + "MJS_WORKER_PASSWORD": s.config.WorkerPassword, + "USER": s.config.WorkerUsername, + "MLM_LICENSE_FILE": s.config.NetworkLicenseManager, + "SHELL": "/bin/sh", + "WORKER_NAME": w.Name, + "HOSTNAME": w.HostName, + "MDCE_OVERRIDE_INTERNAL_HOSTNAME": w.HostName, + "MPICH_PORT_RANGE": fmt.Sprintf("%d:%d", minMPIPort, maxMPIPort), + } + proxyID := s.CalculatePoolProxyForWorker(w.ID) + proxy := NewPoolProxyInfo(proxyID, s.config.PoolProxyBasePort) + if s.config.UsePoolProxy() { + workerEnv["PARALLEL_SERVER_POOL_PROXY_HOST"] = proxy.Name + workerEnv["PARALLEL_SERVER_POOL_PROXY_PORT"] = fmt.Sprintf("%d", proxy.Port) + workerEnv["PARALLEL_SERVER_POOL_PROXY_EXTERNAL_HOST"] = "$CLIENT_OVERRIDE" + if s.config.UseSecureCommunication { + workerEnv["PARALLEL_SERVER_POOL_PROXY_CERTIFICATE"] = filepath.Join(proxyCertDir, ProxyCertFileName) + } + } else { + workerEnv["MDCE_OVERRIDE_EXTERNAL_HOSTNAME"] = s.GetServiceHostname(w.HostName) // Hostname for clients inside the K8s cluster + } + addEnv(&container, workerEnv) + + // Add pre-stop hook to stop the worker cleanly + stopWorkerPath := filepath.Join(s.config.MJSDefDir, "stopWorker.sh") + container.Lifecycle = &corev1.Lifecycle{ + PreStop: &corev1.LifecycleHandler{ + Exec: 
&corev1.ExecAction{ + Command: []string{"/bin/bash", stopWorkerPath}, + }, + }, + } + + // Create pod spec + var rootUserID int64 = 0 //nolint + pod := corev1.PodSpec{ + Containers: []corev1.Container{container}, + TerminationGracePeriodSeconds: &s.config.StopWorkerGracePeriod, + Hostname: w.HostName, + + // The MJS process must run as root in order to start MATLAB processes as another user + SecurityContext: &corev1.PodSecurityContext{ + RunAsUser: &rootUserID, + }, + } + s.setEnableServiceLinks(&pod) + + // Add volumes + addVolumeFromConfigMap(&pod, s.config.MJSDefConfigMap, mjsDefVolumeName, s.config.MJSDefDir) + if s.config.MatlabPVC != "" { + addVolumeFromPVC(&pod, s.config.MatlabPVC, matlabVolumeName, s.config.MatlabRoot, true) + } + if s.config.WorkerLogPVC != "" { + addVolumeFromPVC(&pod, s.config.WorkerLogPVC, logVolumeName, s.config.LogBase, false) + } + if s.config.RequiresSecret() { + addVolumeFromSecret(&pod, SharedSecretName, secretVolumeName, s.config.SecretDir, true) + } + if s.config.UsePoolProxy() && s.config.UseSecureCommunication { + addVolumeFromSecret(&pod, proxy.Name, proxyCertVolumeName, proxyCertDir, false) + } + + return s.wrapPod(&pod, w.HostName, getLabelsForWorker(w)) +} + +// GetWorkerServiceSpec creates a spec for an internal Kubernetes service that points to an MJS worker pod; this service is used by other pods to communicate with the worker pod +func (s *SpecFactory) GetWorkerServiceSpec(w *WorkerInfo) *corev1.Service { + // All workers need to expose their MJS ports to allow the job manager to connect + workerPorts := []int{} + for p := 0; p < 10; p++ { + workerPorts = append(workerPorts, s.config.BasePort+p) + } + + // All workers must expose their MPI ports + minMPIPort, maxMPIPort := s.getMPIPorts() + for p := minMPIPort; p <= maxMPIPort; p++ { + workerPorts = append(workerPorts, p) + } + + // Add parpool ports for this worker + minPort, maxPort := s.CalculateWorkerPorts() + for p := minPort; p <= maxPort; p++ { + workerPorts = append(workerPorts, p) + } + + // Create the service spec + svc := corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: w.HostName, + Labels: getLabelsForWorker(w), + OwnerReferences: s.ownerRefs, + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeClusterIP, + Selector: getLabelsForWorker(w), + }, + } + AddPorts(&svc, workerPorts) + return &svc +} + +// GetPoolProxyDeploymentSpec creates a spec for a deployment to run a parallel pool proxy pod +func (s *SpecFactory) GetPoolProxyDeploymentSpec(proxy *PoolProxyInfo) *appsv1.Deployment { + // Input arguments for the pool proxy + logfile := filepath.Join(s.config.LogBase, fmt.Sprintf("%s.log", proxy.Name)) + proxyArgs := []string{ + "--loglevel", + fmt.Sprintf("%d", s.config.LogLevel), + "--port", + fmt.Sprintf("%d", proxy.Port), + "--logfile", + logfile, + } + if s.config.UseSecureCommunication { + proxyArgs = append(proxyArgs, "--certificate", filepath.Join(proxyCertDir, ProxyCertFileName)) + } + + // Create container to run a proxy + container := corev1.Container{ + Name: "pool-proxy", + Image: s.config.PoolProxyImage, + ImagePullPolicy: corev1.PullPolicy(s.config.PoolProxyImagePullPolicy), + Args: proxyArgs, + } + addResourceRequests(&container, s.config.PoolProxyCPURequest, s.config.PoolProxyMemoryRequest) + addResourceLimits(&container, s.config.PoolProxyCPULimit, s.config.PoolProxyMemoryLimit) + + // Create pod spec + var proxyGracePeriod int64 = 5 + pod := corev1.PodSpec{ + Containers: []corev1.Container{container}, + TerminationGracePeriodSeconds: 
&proxyGracePeriod, + } + s.setEnableServiceLinks(&pod) + + // Add volumes + if s.config.MatlabPVC != "" { + addVolumeFromPVC(&pod, s.config.MatlabPVC, matlabVolumeName, s.config.MatlabRoot, true) + } + if s.config.WorkerLogPVC != "" { + addVolumeFromPVC(&pod, s.config.WorkerLogPVC, logVolumeName, s.config.LogBase, false) + } + if s.config.UseSecureCommunication { + addVolumeFromSecret(&pod, proxy.Name, proxyCertVolumeName, proxyCertDir, true) + } + + labels := map[string]string{ + AppKey: PoolProxyLabels.AppLabel, // Common label for all pool proxy pods to allow them to be easily listed + PoolProxyLabels.Name: proxy.Name, + PoolProxyLabels.ID: fmt.Sprintf("%d", proxy.ID), + PoolProxyLabels.Port: fmt.Sprintf("%d", proxy.Port), + } + return s.wrapPod(&pod, proxy.Name, labels) +} + +const ( + JobManagerUIDKey = "job-manager-uid" + JobManagerHostname = "mjs-job-manager" + AppKey = "app" +) + +// GetJobManagerDeploymentSpec creates a spec for a deployment to run the MJS job manager pod +func (s *SpecFactory) GetJobManagerDeploymentSpec() *appsv1.Deployment { + // Create container to run the MJS job manager + jobManagerScript := filepath.Join(s.config.MJSDefDir, "jobManager.sh") + container := corev1.Container{ + Name: JobManagerHostname, + Image: s.config.MatlabImage, + ImagePullPolicy: corev1.PullPolicy(s.config.MatlabImagePullPolicy), + Command: []string{"/bin/sh", jobManagerScript}, + } + addResourceRequests(&container, s.config.JobManagerCPURequest, s.config.JobManagerMemoryRequest) + addResourceLimits(&container, s.config.JobManagerCPULimit, s.config.JobManagerMemoryLimit) + + // Include admin password if using Security Level >= 2 + if s.config.SecurityLevel >= 2 { + addEnvFromSecret(&container, "PARALLEL_SERVER_JOBMANAGER_ADMIN_PASSWORD", AdminPasswordSecretName, AdminPasswordKey) + } + + // Add startup and liveness probes + binDir := filepath.Join(s.config.MatlabRoot, "toolbox", "parallel", "bin") + healthcheckPath := filepath.Join(binDir, "glnxa64", "mjshealthcheck") + probeHandler := &corev1.ProbeHandler{Exec: &corev1.ExecAction{ + Command: []string{healthcheckPath, + "-jobmanager", + s.config.JobManagerName, + "-matlabroot", + s.config.MatlabRoot, + "-baseport", + fmt.Sprintf("%d", s.config.BasePort), + "-timeout", + fmt.Sprintf("%d", s.config.LivenessProbeTimeout), + }, + }} + container.StartupProbe = &corev1.Probe{ + InitialDelaySeconds: s.config.StartupProbeInitialDelay, + PeriodSeconds: s.config.StartupProbePeriod, + TimeoutSeconds: s.config.LivenessProbeTimeout, // Use same timeout as liveness probe, since the probe handler is the same + FailureThreshold: s.config.StartupProbeFailureThreshold, + ProbeHandler: *probeHandler, + } + container.LivenessProbe = &corev1.Probe{ + InitialDelaySeconds: s.config.LivenessProbePeriod, + PeriodSeconds: s.config.LivenessProbePeriod, + TimeoutSeconds: s.config.LivenessProbeTimeout, + FailureThreshold: s.config.LivenessProbeFailureThreshold, + ProbeHandler: *probeHandler, + } + + // Add pre-stop hook to stop the job manager cleanly + stopJobManagerCmd := []string{ + filepath.Join(binDir, "stopjobmanager"), + "-name", + s.config.JobManagerName, + "-cleanPreserveJobs", + } + if s.config.RequireScriptVerification { + stopJobManagerCmd = append(stopJobManagerCmd, "-secretfile", filepath.Join(s.config.SecretDir, s.config.SecretFileName)) + } + container.Lifecycle = &corev1.Lifecycle{ + PreStop: &corev1.LifecycleHandler{ + Exec: &corev1.ExecAction{ + Command: stopJobManagerCmd, + }, + }, + } + // Create pod spec + pod := corev1.PodSpec{ + Containers: 
[]corev1.Container{container}, + SecurityContext: &corev1.PodSecurityContext{ + RunAsUser: &s.config.JobManagerUserID, + RunAsGroup: &s.config.JobManagerGroupID, + FSGroup: &s.config.JobManagerGroupID, + }, + } + s.setEnableServiceLinks(&pod) + + // Add volumes + addVolumeFromConfigMap(&pod, s.config.MJSDefConfigMap, mjsDefVolumeName, s.config.MJSDefDir) + if s.config.MatlabPVC != "" { + addVolumeFromPVC(&pod, s.config.MatlabPVC, matlabVolumeName, s.config.MatlabRoot, true) + } + if s.config.LogPVC != "" { + addVolumeFromPVC(&pod, s.config.LogPVC, logVolumeName, s.config.LogBase, false) + } + if s.config.CheckpointPVC != "" { + addVolumeFromPVC(&pod, s.config.CheckpointPVC, checkpointVolumeName, s.config.CheckpointBase, false) + } + if s.config.RequiresSecret() { + addVolumeFromSecret(&pod, SharedSecretName, secretVolumeName, s.config.SecretDir, true) + } + + // Mount custom phoenix config file + if s.config.OverrideWrapperPhoenix { + const wrapperPhoenixFile = "wrapper-phoenix.config" + configPath := filepath.Join(s.config.MatlabRoot, "toolbox", "parallel", "config", wrapperPhoenixFile) + addVolumeFromConfigMapFile(&pod, "mjs-phoenix-config", "phoenix-volume", wrapperPhoenixFile, configPath) + } + + // Ensure this pod can resolve itself via its service name without having to use the Kubernetes service; this ensures it can resolve its own MJS service even if the Kubernetes service does not map to this pod + pod.HostAliases = []corev1.HostAlias{{ + IP: "127.0.0.1", + Hostnames: []string{JobManagerHostname}, + }} + + return s.wrapPod(&pod, JobManagerHostname, s.getLabelsForJobManager()) +} + +// GetSecretSpec creates a spec for an empty Kubernetes secret +func (s *SpecFactory) GetSecretSpec(name string) *corev1.Secret { + return &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + OwnerReferences: s.ownerRefs, + }, + Data: map[string][]byte{}, + } +} + +// CalculateWorkerPorts returns the min and max port to use for parpools on worker pods +func (s *SpecFactory) CalculateWorkerPorts() (int, int) { + minPort := s.config.BasePort + 10 + maxPort := minPort + s.config.PortsPerWorker - 1 + return minPort, maxPort +} + +// CalculateProxyForWorker calculates the proxy that a worker should use based on its ID +func (s *SpecFactory) CalculatePoolProxyForWorker(workerID int) int { + ratio := float64(workerID) / float64(s.config.WorkersPerPoolProxy) + return int(math.Ceil(ratio)) +} + +// PoolProxyInfo is a struct containing proxy metadata +type PoolProxyInfo struct { + ID int // Proxy ID; used to relate workers to proxies + Name string // Proxy name; this is also the host name + Port int // Proxy port +} + +// NewPoolProxyInfo creates a PoolProxyInfo struct for a given proxy ID +func NewPoolProxyInfo(id int, basePort int) PoolProxyInfo { + return PoolProxyInfo{ + ID: id, + Name: fmt.Sprintf("mjs-pool-proxy-%d", id), + Port: basePort + id - 1, // note we are using 1-based indexing + } +} + +// addVolumeFromPVC adds a volume mounted from a PersistentVolumeClaim to a pod spec +func addVolumeFromPVC(pod *corev1.PodSpec, pvcName, volumeName, mountPath string, readOnly bool) { + source := corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ + ClaimName: pvcName, + }, + } + addVolume(pod, source, volumeName, mountPath, readOnly) +} + +// addVolumeFromConfigMap adds a volume mounted from a ConfigMap to a pod spec +func addVolumeFromConfigMap(pod *corev1.PodSpec, configMapName, volumeName, mountPath string) { + source := corev1.VolumeSource{ + ConfigMap: 
&corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: configMapName, + }, + }, + } + addVolume(pod, source, volumeName, mountPath, true) +} + +// addVolumeFromConfigMapFile adds a volume mounted from a ConfigMap file to a single file on the pod +func addVolumeFromConfigMapFile(pod *corev1.PodSpec, configMapName, volumeName, fileName, mountPath string) { + source := corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: configMapName, + }, + Items: []corev1.KeyToPath{ + { + Key: fileName, + Path: fileName, + }, + }, + }, + } + pod.Containers[0].VolumeMounts = append(pod.Containers[0].VolumeMounts, corev1.VolumeMount{ + Name: volumeName, + MountPath: mountPath, + SubPath: fileName, + ReadOnly: true, + }) + pod.Volumes = append(pod.Volumes, corev1.Volume{ + Name: volumeName, + VolumeSource: source, + }) +} + +// addVolumeFromSecret adds a volume mounted from a Kubernetes Secret to a pod spec +func addVolumeFromSecret(pod *corev1.PodSpec, secretName, volumeName, mountPath string, restrictReadAccess bool) { + source := corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: secretName, + }, + } + if restrictReadAccess { + var accessMode int32 = 400 // only readable by owner + source.Secret.DefaultMode = &accessMode + } + addVolume(pod, source, volumeName, mountPath, true) +} + +// addVolume adds a generic volume source to a pod spec +func addVolume(pod *corev1.PodSpec, source corev1.VolumeSource, volumeName, mountPath string, readOnly bool) { + pod.Containers[0].VolumeMounts = append(pod.Containers[0].VolumeMounts, corev1.VolumeMount{ + Name: volumeName, + MountPath: mountPath, + ReadOnly: readOnly, + }) + pod.Volumes = append(pod.Volumes, corev1.Volume{ + Name: volumeName, + VolumeSource: source, + }) +} + +// addEnv appends environment variables from a map to a container +func addEnv(container *corev1.Container, toAdd map[string]string) { + env := container.Env + for key, val := range toAdd { + env = append(env, corev1.EnvVar{Name: key, Value: val}) + } + container.Env = env +} + +// addEnvFromSecret appends an environment variable from a secret to a container +func addEnvFromSecret(container *corev1.Container, varName, secretName, secretKey string) { + container.Env = append(container.Env, corev1.EnvVar{ + Name: varName, + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: secretName, + }, + Key: secretKey, + }, + }, + }) +} + +// addResourceLimits adds resource limits to a container +func addResourceLimits(container *corev1.Container, cpu, memory string) { + container.Resources.Limits = getResourceList(cpu, memory) +} + +// addResourceRequests adds resource requests to a container +func addResourceRequests(container *corev1.Container, cpu, memory string) { + container.Resources.Requests = getResourceList(cpu, memory) +} + +// getResourceList converts CPU and memory strings to a pod ResourceList +func getResourceList(cpu, memory string) corev1.ResourceList { + resourceList := corev1.ResourceList{} + if cpu != "" { + resourceList[corev1.ResourceCPU] = resource.MustParse(cpu) + } + if memory != "" { + resourceList[corev1.ResourceMemory] = resource.MustParse(memory) + } + return resourceList +} + +// AddPorts appends ports to a service spec +func AddPorts(svc *corev1.Service, toAdd []int) { + ports := svc.Spec.Ports + existingPorts := map[int]bool{} + for _, p := range ports { + 
existingPorts[int(p.Port)] = true + } + for _, p := range toAdd { + if existingPorts[p] { + continue + } + ports = append(ports, corev1.ServicePort{ + Name: fmt.Sprintf("tcp-%d", p), + Port: int32(p), + TargetPort: intstr.FromInt(p), + Protocol: "TCP", + }) + } + svc.Spec.Ports = ports +} + +// WorkerLabels defines labels to apply to worker pods +var WorkerLabels = struct { + Name string + ID string + HostName string + AppLabel string +}{ + Name: "workerName", + ID: "workerID", + HostName: "hostName", + AppLabel: "mjs-worker", +} + +// getLabelsForWorker returns a map of labels to apply to a worker resource +func getLabelsForWorker(w *WorkerInfo) map[string]string { + return map[string]string{ + AppKey: WorkerLabels.AppLabel, // Common label for all worker pods to allow them to be easily listed + WorkerLabels.Name: w.Name, + WorkerLabels.ID: fmt.Sprintf("%d", w.ID), + WorkerLabels.HostName: w.HostName, + } +} + +// getLabelsForJobManager returns a map of labels to apply to a job manager pod or service +func (s *SpecFactory) getLabelsForJobManager() map[string]string { + return map[string]string{ + AppKey: JobManagerHostname, + JobManagerUIDKey: s.config.JobManagerUID, + } +} + +// PoolProxyLabels defines labels to apply to pool proxy pods +var PoolProxyLabels = struct { + Name string + ID string + Port string + AppLabel string +}{ + Name: "proxyName", + ID: "proxyID", + Port: "port", + AppLabel: "mjs-pool-proxy", +} + +// wrapPod wraps a pod spec into a deployment spec +func (s *SpecFactory) wrapPod(pod *corev1.PodSpec, name string, labels map[string]string) *appsv1.Deployment { + var numReplicas int32 = 1 // Always want only one pod per deployment + return &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: labels, + OwnerReferences: s.ownerRefs, + }, + Spec: appsv1.DeploymentSpec{ + Replicas: &numReplicas, + Selector: &metav1.LabelSelector{ + MatchLabels: labels, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: labels, + }, + Spec: *pod, + }, + }, + } +} + +// WorkerInfo is a struct containing a worker's name, ID, and unique host name +type WorkerInfo struct { + Name string + ID int + HostName string +} + +// GenerateUniqueHostName generates a unique host name for a worker +func (w *WorkerInfo) GenerateUniqueHostName() { + guidWithHyphens := uuid.New() + guidNoHyphens := strings.Replace(guidWithHyphens.String(), "-", "", -1) + uniqueHostName := fmt.Sprintf("%s-%s", w.Name, guidNoHyphens) + maxLen := 64 + if len(uniqueHostName) > maxLen { + uniqueHostName = uniqueHostName[:maxLen] + } + w.HostName = uniqueHostName +} + +// Number of MPI ports to open on each worker pod +const mpiPortsPerWorker = 10 + +// Get the minimum and maximum MPI ports that workers should use +func (s *SpecFactory) getMPIPorts() (int, int) { + minPort := s.config.BasePort + 1000 + maxPort := minPort + mpiPortsPerWorker - 1 + return minPort, maxPort +} + +// Set enableServiceLinks; setting this to false prevents large numbers of environment variables from being created for pods in large clusters +func (s *SpecFactory) setEnableServiceLinks(pod *corev1.PodSpec) { + pod.EnableServiceLinks = &s.config.EnableServiceLinks +} + +// Convert a service hostname to a hostname that can be resolved by pods in other namespaces +func (s *SpecFactory) GetServiceHostname(svcName string) string { + return fmt.Sprintf("%s.%s.svc.cluster.local", svcName, s.config.Namespace) +} diff --git a/controller/src/internal/specs/specs_test.go
b/controller/src/internal/specs/specs_test.go new file mode 100644 index 0000000..13029f7 --- /dev/null +++ b/controller/src/internal/specs/specs_test.go @@ -0,0 +1,465 @@ +// Copyright 2024 The MathWorks, Inc. +package specs + +import ( + "controller/internal/config" + "fmt" + "path/filepath" + "strconv" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" +) + +// Test the creation and use of a SpecFactory +func TestGetWorkerDeploymentSpec(t *testing.T) { + testCases := []struct { + name string + usePoolProxy bool + useSecureCommunication bool + }{ + {"no_proxy_no_secure", false, false}, + {"proxy_no_secure", true, false}, + {"no_proxy_secure", false, true}, + {"proxy_secure", true, true}, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ownerUID := types.UID("abcd1234") + conf := createTestConfig() + conf.InternalClientsOnly = !tc.usePoolProxy + conf.UseSecureCommunication = tc.useSecureCommunication + + // Create a SpecFactory + specFactory := NewSpecFactory(conf, ownerUID) + assert.NotNil(t, specFactory) + assert.Equal(t, conf, specFactory.config) + + // Verify the specs it creates + verifyWorkerSpecs(t, specFactory, conf, ownerUID) + verifyProxySpecs(t, specFactory, conf, ownerUID) + }) + } +} + +// Test the calculation of which proxy to use for each worker +func TestCalculateProxyForWorker(t *testing.T) { + conf := &config.Config{ + WorkersPerPoolProxy: 10, + } + specFactory := NewSpecFactory(conf, "abcd") + assert.Equal(t, 1, specFactory.CalculatePoolProxyForWorker(1)) + assert.Equal(t, 1, specFactory.CalculatePoolProxyForWorker(10)) + assert.Equal(t, 2, specFactory.CalculatePoolProxyForWorker(11)) +} + +// Test generation of unique hostnames for workers +func TestGenerateUniqueHostname(t *testing.T) { + assert := assert.New(t) + + worker1 := WorkerInfo{ + Name: "myworker", + ID: 10, + } + assert.Empty(worker1.HostName, "Worker hostname should initially be empty") + + worker1.GenerateUniqueHostName() + assert.NotEmpty(worker1.HostName, "Worker hostname should be filled after GenerateUniqueHostName") + assert.Contains(worker1.HostName, worker1.Name, "Worker hostname should contain worker name") + + // Check that a second worker with the same name gets a unique host name + worker2 := WorkerInfo{ + Name: worker1.Name, + ID: worker1.ID, + } + worker2.GenerateUniqueHostName() + assert.NotEmpty(worker2.HostName, "Worker hostname should be filled after GenerateUniqueHostName") + assert.NotEqual(worker1.HostName, worker2.HostName, "Worker hostnames should be unique") +} + +// Check that all of the allowed MPI ports are exposed by the worker service (g3221764) +func TestMPIPorts(t *testing.T) { + ownerUID := types.UID("abcd1234") + conf := createTestConfig() + conf.BasePort = 30000 + specFactory := NewSpecFactory(conf, ownerUID) + + worker := WorkerInfo{ + Name: "worker", + ID: 5, + HostName: "test", + } + deployment := specFactory.GetWorkerDeploymentSpec(&worker) + service := specFactory.GetWorkerServiceSpec(&worker) + + // Check that the MPI port range environment variable is set on the worker + expectedEnvVar := "MPICH_PORT_RANGE" + gotEnv := deployment.Spec.Template.Spec.Containers[0].Env + found := false + var gotVal string + for _, e := range gotEnv { + if e.Name == expectedEnvVar { + found = true + gotVal = e.Value + break + } + } + assert.Truef(t, found, "Worker environment variable %s should be set", expectedEnvVar) + + // Extract the ports from the
environment variable + ports := strings.Split(gotVal, ":") + assert.Lenf(t, ports, 2, "%s should have format minPort:maxPort", expectedEnvVar) + minPort, err := strconv.Atoi(ports[0]) + assert.NoErrorf(t, err, "Failed to convert min MPI port '%s' to an int", ports[0]) + maxPort, err := strconv.Atoi(ports[1]) + assert.NoErrorf(t, err, "Failed to convert max MPI port '%s' to an int", ports[1]) + + // Check the port numbers + assert.Equal(t, conf.BasePort+1000, minPort, "Unexpected minimum MPI port") + assert.Equal(t, minPort+mpiPortsPerWorker-1, maxPort, "Unexpected maximum MPI port") + + // Check that each port was exposed on the worker service + exposedPorts := map[int]bool{} + for _, p := range service.Spec.Ports { + exposedPorts[int(p.Port)] = true + } + for p := minPort; p <= maxPort; p++ { + assert.Contains(t, exposedPorts, p, "MPI port should be exposed by the worker service") + } + + // Ensure the pod's hostname matches the service name + assert.Equal(t, service.Name, deployment.Spec.Template.Spec.Hostname, "Worker pod hostname must match the service name in order for MPI to work") +} + +func TestGetJobManagerSpecs(t *testing.T) { + testCases := []struct { + name string + useSecureCommunication bool + }{ + {"insecure", false}, + {"secure_communication", true}, + } + for _, tc := range testCases { + t.Run(tc.name, func(tt *testing.T) { + ownerUID := types.UID("abcd1234") + conf := createTestConfig() + conf.UseSecureCommunication = tc.useSecureCommunication + conf.JobManagerUID = "jm123" + + // Create a SpecFactory + specFactory := NewSpecFactory(conf, ownerUID) + assert.NotNil(tt, specFactory) + assert.Equal(tt, conf, specFactory.config) + + // Verify the job manager spec it creates + verifyJobManagerSpec(tt, specFactory, conf, ownerUID) + }) + } +} + +func verifyWorkerSpecs(t *testing.T, specFactory *SpecFactory, conf *config.Config, ownerUID types.UID) { + assert := assert.New(t) + testWorker := &WorkerInfo{ + Name: "worker1", + ID: 3, + HostName: "my-worker-host", + } + + // Create a worker deployment + deployment := specFactory.GetWorkerDeploymentSpec(testWorker) + assert.NotNil(deployment) + + // Check basic properties + verifyDeployment(t, deployment, ownerUID) + assert.Equalf(WorkerLabels.AppLabel, deployment.Labels[AppKey], "Missing deployment label") + assert.Equal(testWorker.HostName, deployment.ObjectMeta.Name, "Deployment name should match worker hostname") + + // Check the underlying pod spec + pod := deployment.Spec.Template.Spec + assert.Len(pod.Containers, 1, "Worker pod should have 1 container") + container := pod.Containers[0] + assert.Equal(conf.MatlabImage, container.Image, "Worker container has incorrect image") + assert.Equal(conf.MatlabImagePullPolicy, string(container.ImagePullPolicy), "Worker container has incorrect image pull policy") + assert.Equal(conf.WorkerCPULimit, container.Resources.Limits.Cpu().String(), "Worker container has incorrect CPU limit") + assert.Equal(conf.WorkerCPURequest, container.Resources.Requests.Cpu().String(), "Worker container has incorrect CPU request") + assert.Equal(conf.WorkerMemoryLimit, container.Resources.Limits.Memory().String(), "Worker container has incorrect memory limit") + assert.Equal(conf.WorkerMemoryRequest, container.Resources.Requests.Memory().String(), "Worker container has incorrect memory request") + assert.Contains(container.Args[0], conf.MJSDefDir, "Worker container arg should point to worker script under the mjs_def mount directory") + assert.NotNil(container.Lifecycle.PreStop, "Worker container should
have pre-stop hook set") + assert.Equal(int64(0), *pod.SecurityContext.RunAsUser, "Worker pod should run as root") + verifyEnableServiceLinksFalse(t, &pod) + + // Check volumes + verifyPodHasVolume(t, &pod, mjsDefVolumeName) + if conf.UseSecureCommunication { + verifyPodHasVolume(t, &pod, secretVolumeName) + } else { + verifyPodDoesNotHaveVolume(t, &pod, secretVolumeName) + } + if conf.UsePoolProxy() && conf.UseSecureCommunication { + verifyPodHasVolume(t, &pod, proxyCertVolumeName) + } else { + verifyPodDoesNotHaveVolume(t, &pod, proxyCertVolumeName) + } + + // Check environment + verifyEnvVar(t, &pod, "MLM_LICENSE_FILE", conf.NetworkLicenseManager) + verifyEnvVar(t, &pod, "MJS_WORKER_USERNAME", conf.WorkerUsername) + verifyEnvVar(t, &pod, "MJS_WORKER_PASSWORD", conf.WorkerPassword) + verifyEnvVar(t, &pod, "USER", conf.WorkerUsername) // USER should be set (g3299166) + verifyEnvVar(t, &pod, "SHELL", "/bin/sh") + verifyEnvVar(t, &pod, "WORKER_NAME", testWorker.Name) + verifyEnvVar(t, &pod, "HOSTNAME", testWorker.HostName) + verifyEnvVar(t, &pod, "MDCE_OVERRIDE_INTERNAL_HOSTNAME", testWorker.HostName) + if conf.UsePoolProxy() { + // Make sure the proxy environment variables are set + proxyID := specFactory.CalculatePoolProxyForWorker(testWorker.ID) + proxy := NewPoolProxyInfo(proxyID, conf.PoolProxyBasePort) + verifyEnvVar(t, &pod, "PARALLEL_SERVER_POOL_PROXY_PORT", fmt.Sprintf("%d", proxy.Port)) + verifyEnvVar(t, &pod, "PARALLEL_SERVER_POOL_PROXY_HOST", proxy.Name) + verifyEnvVar(t, &pod, "PARALLEL_SERVER_POOL_PROXY_EXTERNAL_HOST", "$CLIENT_OVERRIDE") + + // Check the proxy certificate variable; this should only be set if USE_SECURE_COMMUNICATION=true + if conf.UseSecureCommunication { + verifyEnvVar(t, &pod, "PARALLEL_SERVER_POOL_PROXY_CERTIFICATE", filepath.Join(proxyCertDir, ProxyCertFileName)) + } else { + verifyEnvUnset(t, &pod, "PARALLEL_SERVER_POOL_PROXY_CERTIFICATE") + } + + // Make sure the port range override is not set; this is only needed for the many-ports configuration + verifyEnvUnset(t, &pod, "PARALLEL_SERVER_OVERRIDE_PORT_RANGE") + } else { + // Make sure none of the proxy environment variables are set + verifyEnvUnset(t, &pod, "PARALLEL_SERVER_POOL_PROXY_PORT") + verifyEnvUnset(t, &pod, "PARALLEL_SERVER_POOL_PROXY_HOST") + verifyEnvUnset(t, &pod, "PARALLEL_SERVER_POOL_PROXY_EXTERNAL_HOST") + verifyEnvUnset(t, &pod, "PARALLEL_SERVER_POOL_PROXY_CERTIFICATE") + + // Check we are using the full DNS name of the service for internal clients + expectedHostname := specFactory.GetServiceHostname(testWorker.HostName) + verifyEnvVar(t, &pod, "MDCE_OVERRIDE_EXTERNAL_HOSTNAME", expectedHostname) + } + + // Create a service spec for the same worker + svc := specFactory.GetWorkerServiceSpec(testWorker) + assert.Equal(deployment.Spec.Template.ObjectMeta.Labels, svc.Spec.Selector, "Service selector should match worker pod's labels") + ownerRefs := svc.ObjectMeta.OwnerReferences + assert.Len(ownerRefs, 1, "Service should have 1 owner reference") + assert.Equal(ownerUID, ownerRefs[0].UID, "Service has incorrect owner UID") + + // Check ports + minPort, maxPort := specFactory.CalculateWorkerPorts() + for p := minPort; p <= maxPort; p++ { + assert.Truef(serviceHasPort(svc, p), "Worker parpool port %d missing from service", p) + } +} + +func verifyProxySpecs(t *testing.T, specFactory *SpecFactory, conf *config.Config, ownerUID types.UID) { + assert := assert.New(t) + testProxy := &PoolProxyInfo{ + Name: "myproxy", + ID: 10, + Port: 40000, + } + + // Create proxy deployment spec + deployment := 
specFactory.GetPoolProxyDeploymentSpec(testProxy) + verifyDeployment(t, deployment, ownerUID) + assert.Equal(testProxy.Name, deployment.ObjectMeta.Name, "Deployment name should match proxy name") + + // Check the pod has the label used to match it to the corresponding service created in the Helm template + assert.Equal(testProxy.Name, deployment.Spec.Template.ObjectMeta.Labels[PoolProxyLabels.Name], "Proxy pod spec is missing the label required to match it to its corresponding service") + + // Check resource limits/requests + assert.Len(deployment.Spec.Template.Spec.Containers, 1, "Proxy pod should have one container") + container := deployment.Spec.Template.Spec.Containers[0] + assert.Equal(conf.PoolProxyCPULimit, container.Resources.Limits.Cpu().String(), "Proxy container has incorrect CPU limit") + assert.Equal(conf.PoolProxyCPURequest, container.Resources.Requests.Cpu().String(), "Proxy container has incorrect CPU request") + assert.Equal(conf.PoolProxyMemoryLimit, container.Resources.Limits.Memory().String(), "Proxy container has incorrect memory limit") + assert.Equal(conf.PoolProxyMemoryRequest, container.Resources.Requests.Memory().String(), "Proxy container has incorrect memory request") + assert.Equal(conf.PoolProxyImage, container.Image, "Proxy container has unexpected image") + assert.Equal(conf.PoolProxyImagePullPolicy, string(container.ImagePullPolicy), "Proxy container has unexpected image pull policy") + verifyEnableServiceLinksFalse(t, &deployment.Spec.Template.Spec) + + // Check the proxy input args + args := container.Args + assert.Contains(args, fmt.Sprintf("%d", testProxy.Port), "Proxy port should appear in container args") + if conf.UseSecureCommunication { + assert.Contains(args, "--certificate", "Container args should include --certificate flag when UseSecureCommunication is true") + } else { + assert.NotContains(args, "--certificate", "Container args should not include --certificate flag when UseSecureCommunication is false") + } + + // Check volumes + pod := deployment.Spec.Template.Spec + if conf.UseSecureCommunication { + verifyPodHasVolume(t, &pod, proxyCertVolumeName) + } else { + verifyPodDoesNotHaveVolume(t, &pod, proxyCertVolumeName) + } +} + +func verifyJobManagerSpec(t *testing.T, specFactory *SpecFactory, conf *config.Config, ownerUID types.UID) { + assert := assert.New(t) + + // Create the job manager deployment spec + deployment := specFactory.GetJobManagerDeploymentSpec() + assert.NotNil(deployment) + + // Check basic properties + verifyDeployment(t, deployment, ownerUID) + assert.Equal(JobManagerHostname, deployment.Labels[AppKey], "Deployment should have the job manager app label") + assert.Equal(conf.JobManagerUID, deployment.Labels[JobManagerUIDKey], "Deployment should have the job manager UID label") + assert.Equal(JobManagerHostname, deployment.ObjectMeta.Name, "Deployment should have job manager name") + + // Check the underlying pod spec + pod := deployment.Spec.Template.Spec + assert.Len(pod.Containers, 1, "Job manager pod should have 1 container") + container := pod.Containers[0] + assert.Equal(conf.MatlabImage, container.Image, "Job manager container has incorrect image") + assert.Equal(conf.MatlabImagePullPolicy, string(container.ImagePullPolicy), "Job manager container has incorrect image pull policy") + assert.Equal(conf.JobManagerCPULimit, container.Resources.Limits.Cpu().String(), "Job manager container has incorrect CPU limit") + assert.Equal(conf.JobManagerCPURequest, container.Resources.Requests.Cpu().String(), "Job manager 
container has incorrect CPU request") + assert.Equal(conf.JobManagerMemoryLimit, container.Resources.Limits.Memory().String(), "Job manager container has incorrect memory limit") + assert.Equal(conf.JobManagerMemoryRequest, container.Resources.Requests.Memory().String(), "Job manager container has incorrect memory request") + assert.Contains(container.Command[1], conf.MJSDefDir, "Job manager container arg should point to job manager script under the mjs_def mount directory") + assert.NotNil(container.Lifecycle.PreStop, "Job manager container should have pre-stop hook set") + assert.NotNil(container.LivenessProbe, "Job manager container should have liveness probe") + assert.NotNil(container.StartupProbe, "Job manager container should have startup probe") + assert.Equal(int64(conf.JobManagerUserID), *pod.SecurityContext.RunAsUser, "Job manager pod should run as job manager user UID") + assert.Equal(int64(conf.JobManagerGroupID), *pod.SecurityContext.RunAsGroup, "Job manager pod should run as job manager group UID") + verifyEnableServiceLinksFalse(t, &pod) + + // Check volumes + verifyPodHasVolume(t, &pod, mjsDefVolumeName) + verifyPodHasVolume(t, &pod, checkpointVolumeName) + verifyPodHasVolume(t, &pod, matlabVolumeName) + if conf.UseSecureCommunication { + verifyPodHasVolume(t, &pod, secretVolumeName) + } else { + verifyPodDoesNotHaveVolume(t, &pod, secretVolumeName) + } +} + +// Verify common properties of all created deployment specs +func verifyDeployment(t *testing.T, deployment *appsv1.Deployment, ownerUID types.UID) { + assert := assert.New(t) + assert.Equal(int32(1), *deployment.Spec.Replicas, "Deployment should have 1 replica") + assert.Equal(deployment.Spec.Selector.MatchLabels, deployment.Spec.Template.ObjectMeta.Labels, "Deployment selector labels should match template labels") + ownerRefs := deployment.ObjectMeta.OwnerReferences + assert.Len(ownerRefs, 1, "Deployment should have 1 owner reference") + assert.Equal(ownerUID, ownerRefs[0].UID, "Deployment has incorrect owner UID") + + // Verify volumes common to all pods + pod := deployment.Spec.Template.Spec + verifyPodHasVolume(t, &pod, matlabVolumeName) + verifyPodHasVolume(t, &pod, logVolumeName) +} + +func verifyPodHasVolume(t *testing.T, pod *corev1.PodSpec, volumeName string) { + assert.Truef(t, podHasVolume(pod, volumeName), "Volume %s not found in pod spec", volumeName) + assert.Truef(t, podHasVolumeMount(pod, volumeName), "Volume mount %s not found in container spec", volumeName) +} + +func verifyPodDoesNotHaveVolume(t *testing.T, pod *corev1.PodSpec, volumeName string) { + assert.Falsef(t, podHasVolume(pod, volumeName), "Pod spec should not have volume %s", volumeName) + assert.Falsef(t, podHasVolumeMount(pod, volumeName), "Container spec should not have volume %s", volumeName) +} + +func verifyEnvUnset(t *testing.T, pod *corev1.PodSpec, varName string) { + _, isSet := getPodEnvVar(pod, varName) + assert.Falsef(t, isSet, "Pod should not have environment variable %s", varName) +} + +// Verify that an environment variable is set and has the expected value +func verifyEnvVar(t *testing.T, pod *corev1.PodSpec, key, value string) { + gotValue, isSet := getPodEnvVar(pod, key) + assert.Truef(t, isSet, "Pod should have environment variable %s", key) + assert.Equalf(t, gotValue, value, "Unexpected value for environment variable %s", key) +} + +// Create a Config object populated with test values +func createTestConfig() *config.Config { + return &config.Config{ + BasePort: 20000, + CheckpointPVC: "checkpoint-pvc", + 
EnableServiceLinks: false, + MatlabImagePullPolicy: "Never", + MatlabImage: "my-matlab-image", + LogBase: "/var/logs", + LogLevel: 3, + MatlabRoot: "/opt/matlab", + MatlabPVC: "matlab-pvc", + MJSDefConfigMap: "mjsdef-cm", + MJSDefDir: "/def-dir", + NetworkLicenseManager: "20000@mylicensemanager", + PortsPerWorker: 3, + PoolProxyImage: "my-proxy", + PoolProxyImagePullPolicy: "Always", + PoolProxyBasePort: 40000, + PoolProxyCPURequest: "500m", + PoolProxyCPULimit: "500m", + PoolProxyMemoryLimit: "2Gi", + PoolProxyMemoryRequest: "1Gi", + WorkerCPURequest: "3", + WorkerCPULimit: "4", + WorkerMemoryRequest: "3Gi", + WorkerMemoryLimit: "4Gi", + JobManagerCPURequest: "3", + JobManagerCPULimit: "4", + JobManagerMemoryRequest: "3Gi", + JobManagerMemoryLimit: "4Gi", + JobManagerGroupID: 1000, + JobManagerUserID: 2000, + WorkerLogPVC: "worker-pvc", + LogPVC: "log-pvc", + WorkerPassword: "workerpw", + WorkersPerPoolProxy: 10, + WorkerUsername: "myuser", + } +} + +// podHasVolume returns true if the pod spec defines a volume with the given name +func podHasVolume(pod *corev1.PodSpec, volumeName string) bool { + for _, v := range pod.Volumes { + if v.Name == volumeName { + return true + } + } + return false +} + +// podHasVolumeMount returns true if the pod's first container mounts a volume with the given name +func podHasVolumeMount(pod *corev1.PodSpec, volumeName string) bool { + for _, v := range pod.Containers[0].VolumeMounts { + if v.Name == volumeName { + return true + } + } + return false +} + +// Return a pod's environment variable value and whether or not it is set +func getPodEnvVar(pod *corev1.PodSpec, varName string) (string, bool) { + for _, e := range pod.Containers[0].Env { + if e.Name == varName { + return e.Value, true + } + } + return "", false +} + +func serviceHasPort(svc *corev1.Service, portNum int) bool { + for _, p := range svc.Spec.Ports { + if p.Port == int32(portNum) { + return true + } + } + return false +} + +func verifyEnableServiceLinksFalse(t *testing.T, pod *corev1.PodSpec) { + assert.Falsef(t, *pod.EnableServiceLinks, "Pod should have EnableServiceLinks set to false") +} diff --git a/controller/src/mocks/k8s/Client.go b/controller/src/mocks/k8s/Client.go new file mode 100644 index 0000000..f1e2154 --- /dev/null +++ b/controller/src/mocks/k8s/Client.go @@ -0,0 +1,1217 @@ +// Code generated by mockery v2.42.2. DO NOT EDIT.
+ +package mocks + +import ( + bytes "bytes" + + corev1 "k8s.io/api/core/v1" + + mock "github.com/stretchr/testify/mock" + + types "k8s.io/apimachinery/pkg/types" + + v1 "k8s.io/api/apps/v1" +) + +// Client is an autogenerated mock type for the Client type +type Client struct { + mock.Mock +} + +type Client_Expecter struct { + mock *mock.Mock +} + +func (_m *Client) EXPECT() *Client_Expecter { + return &Client_Expecter{mock: &_m.Mock} +} + +// CreateDeployment provides a mock function with given fields: _a0 +func (_m *Client) CreateDeployment(_a0 *v1.Deployment) (*v1.Deployment, error) { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for CreateDeployment") + } + + var r0 *v1.Deployment + var r1 error + if rf, ok := ret.Get(0).(func(*v1.Deployment) (*v1.Deployment, error)); ok { + return rf(_a0) + } + if rf, ok := ret.Get(0).(func(*v1.Deployment) *v1.Deployment); ok { + r0 = rf(_a0) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*v1.Deployment) + } + } + + if rf, ok := ret.Get(1).(func(*v1.Deployment) error); ok { + r1 = rf(_a0) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_CreateDeployment_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'CreateDeployment' +type Client_CreateDeployment_Call struct { + *mock.Call +} + +// CreateDeployment is a helper method to define mock.On call +// - _a0 *v1.Deployment +func (_e *Client_Expecter) CreateDeployment(_a0 interface{}) *Client_CreateDeployment_Call { + return &Client_CreateDeployment_Call{Call: _e.mock.On("CreateDeployment", _a0)} +} + +func (_c *Client_CreateDeployment_Call) Run(run func(_a0 *v1.Deployment)) *Client_CreateDeployment_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(*v1.Deployment)) + }) + return _c +} + +func (_c *Client_CreateDeployment_Call) Return(_a0 *v1.Deployment, _a1 error) *Client_CreateDeployment_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Client_CreateDeployment_Call) RunAndReturn(run func(*v1.Deployment) (*v1.Deployment, error)) *Client_CreateDeployment_Call { + _c.Call.Return(run) + return _c +} + +// CreateSecret provides a mock function with given fields: _a0 +func (_m *Client) CreateSecret(_a0 *corev1.Secret) (*corev1.Secret, error) { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for CreateSecret") + } + + var r0 *corev1.Secret + var r1 error + if rf, ok := ret.Get(0).(func(*corev1.Secret) (*corev1.Secret, error)); ok { + return rf(_a0) + } + if rf, ok := ret.Get(0).(func(*corev1.Secret) *corev1.Secret); ok { + r0 = rf(_a0) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*corev1.Secret) + } + } + + if rf, ok := ret.Get(1).(func(*corev1.Secret) error); ok { + r1 = rf(_a0) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_CreateSecret_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'CreateSecret' +type Client_CreateSecret_Call struct { + *mock.Call +} + +// CreateSecret is a helper method to define mock.On call +// - _a0 *corev1.Secret +func (_e *Client_Expecter) CreateSecret(_a0 interface{}) *Client_CreateSecret_Call { + return &Client_CreateSecret_Call{Call: _e.mock.On("CreateSecret", _a0)} +} + +func (_c *Client_CreateSecret_Call) Run(run func(_a0 *corev1.Secret)) *Client_CreateSecret_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(*corev1.Secret)) + }) + return _c +} + +func (_c *Client_CreateSecret_Call) Return(_a0 *corev1.Secret, _a1 error) 
*Client_CreateSecret_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Client_CreateSecret_Call) RunAndReturn(run func(*corev1.Secret) (*corev1.Secret, error)) *Client_CreateSecret_Call { + _c.Call.Return(run) + return _c +} + +// CreateService provides a mock function with given fields: _a0 +func (_m *Client) CreateService(_a0 *corev1.Service) (*corev1.Service, error) { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for CreateService") + } + + var r0 *corev1.Service + var r1 error + if rf, ok := ret.Get(0).(func(*corev1.Service) (*corev1.Service, error)); ok { + return rf(_a0) + } + if rf, ok := ret.Get(0).(func(*corev1.Service) *corev1.Service); ok { + r0 = rf(_a0) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*corev1.Service) + } + } + + if rf, ok := ret.Get(1).(func(*corev1.Service) error); ok { + r1 = rf(_a0) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_CreateService_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'CreateService' +type Client_CreateService_Call struct { + *mock.Call +} + +// CreateService is a helper method to define mock.On call +// - _a0 *corev1.Service +func (_e *Client_Expecter) CreateService(_a0 interface{}) *Client_CreateService_Call { + return &Client_CreateService_Call{Call: _e.mock.On("CreateService", _a0)} +} + +func (_c *Client_CreateService_Call) Run(run func(_a0 *corev1.Service)) *Client_CreateService_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(*corev1.Service)) + }) + return _c +} + +func (_c *Client_CreateService_Call) Return(_a0 *corev1.Service, _a1 error) *Client_CreateService_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Client_CreateService_Call) RunAndReturn(run func(*corev1.Service) (*corev1.Service, error)) *Client_CreateService_Call { + _c.Call.Return(run) + return _c +} + +// DeleteDeployment provides a mock function with given fields: _a0 +func (_m *Client) DeleteDeployment(_a0 string) error { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for DeleteDeployment") + } + + var r0 error + if rf, ok := ret.Get(0).(func(string) error); ok { + r0 = rf(_a0) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// Client_DeleteDeployment_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'DeleteDeployment' +type Client_DeleteDeployment_Call struct { + *mock.Call +} + +// DeleteDeployment is a helper method to define mock.On call +// - _a0 string +func (_e *Client_Expecter) DeleteDeployment(_a0 interface{}) *Client_DeleteDeployment_Call { + return &Client_DeleteDeployment_Call{Call: _e.mock.On("DeleteDeployment", _a0)} +} + +func (_c *Client_DeleteDeployment_Call) Run(run func(_a0 string)) *Client_DeleteDeployment_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(string)) + }) + return _c +} + +func (_c *Client_DeleteDeployment_Call) Return(_a0 error) *Client_DeleteDeployment_Call { + _c.Call.Return(_a0) + return _c +} + +func (_c *Client_DeleteDeployment_Call) RunAndReturn(run func(string) error) *Client_DeleteDeployment_Call { + _c.Call.Return(run) + return _c +} + +// DeleteSecret provides a mock function with given fields: _a0 +func (_m *Client) DeleteSecret(_a0 string) error { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for DeleteSecret") + } + + var r0 error + if rf, ok := ret.Get(0).(func(string) error); ok { + r0 = rf(_a0) + } else { + r0 = ret.Error(0) + } + + 
return r0 +} + +// Client_DeleteSecret_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'DeleteSecret' +type Client_DeleteSecret_Call struct { + *mock.Call +} + +// DeleteSecret is a helper method to define mock.On call +// - _a0 string +func (_e *Client_Expecter) DeleteSecret(_a0 interface{}) *Client_DeleteSecret_Call { + return &Client_DeleteSecret_Call{Call: _e.mock.On("DeleteSecret", _a0)} +} + +func (_c *Client_DeleteSecret_Call) Run(run func(_a0 string)) *Client_DeleteSecret_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(string)) + }) + return _c +} + +func (_c *Client_DeleteSecret_Call) Return(_a0 error) *Client_DeleteSecret_Call { + _c.Call.Return(_a0) + return _c +} + +func (_c *Client_DeleteSecret_Call) RunAndReturn(run func(string) error) *Client_DeleteSecret_Call { + _c.Call.Return(run) + return _c +} + +// DeleteService provides a mock function with given fields: _a0 +func (_m *Client) DeleteService(_a0 string) error { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for DeleteService") + } + + var r0 error + if rf, ok := ret.Get(0).(func(string) error); ok { + r0 = rf(_a0) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// Client_DeleteService_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'DeleteService' +type Client_DeleteService_Call struct { + *mock.Call +} + +// DeleteService is a helper method to define mock.On call +// - _a0 string +func (_e *Client_Expecter) DeleteService(_a0 interface{}) *Client_DeleteService_Call { + return &Client_DeleteService_Call{Call: _e.mock.On("DeleteService", _a0)} +} + +func (_c *Client_DeleteService_Call) Run(run func(_a0 string)) *Client_DeleteService_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(string)) + }) + return _c +} + +func (_c *Client_DeleteService_Call) Return(_a0 error) *Client_DeleteService_Call { + _c.Call.Return(_a0) + return _c +} + +func (_c *Client_DeleteService_Call) RunAndReturn(run func(string) error) *Client_DeleteService_Call { + _c.Call.Return(run) + return _c +} + +// DeploymentExists provides a mock function with given fields: _a0 +func (_m *Client) DeploymentExists(_a0 string) (bool, error) { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for DeploymentExists") + } + + var r0 bool + var r1 error + if rf, ok := ret.Get(0).(func(string) (bool, error)); ok { + return rf(_a0) + } + if rf, ok := ret.Get(0).(func(string) bool); ok { + r0 = rf(_a0) + } else { + r0 = ret.Get(0).(bool) + } + + if rf, ok := ret.Get(1).(func(string) error); ok { + r1 = rf(_a0) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_DeploymentExists_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'DeploymentExists' +type Client_DeploymentExists_Call struct { + *mock.Call +} + +// DeploymentExists is a helper method to define mock.On call +// - _a0 string +func (_e *Client_Expecter) DeploymentExists(_a0 interface{}) *Client_DeploymentExists_Call { + return &Client_DeploymentExists_Call{Call: _e.mock.On("DeploymentExists", _a0)} +} + +func (_c *Client_DeploymentExists_Call) Run(run func(_a0 string)) *Client_DeploymentExists_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(string)) + }) + return _c +} + +func (_c *Client_DeploymentExists_Call) Return(_a0 bool, _a1 error) *Client_DeploymentExists_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c 
*Client_DeploymentExists_Call) RunAndReturn(run func(string) (bool, error)) *Client_DeploymentExists_Call { + _c.Call.Return(run) + return _c +} + +// ExecOnPod provides a mock function with given fields: _a0, _a1 +func (_m *Client) ExecOnPod(_a0 string, _a1 []string) (*bytes.Buffer, error) { + ret := _m.Called(_a0, _a1) + + if len(ret) == 0 { + panic("no return value specified for ExecOnPod") + } + + var r0 *bytes.Buffer + var r1 error + if rf, ok := ret.Get(0).(func(string, []string) (*bytes.Buffer, error)); ok { + return rf(_a0, _a1) + } + if rf, ok := ret.Get(0).(func(string, []string) *bytes.Buffer); ok { + r0 = rf(_a0, _a1) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*bytes.Buffer) + } + } + + if rf, ok := ret.Get(1).(func(string, []string) error); ok { + r1 = rf(_a0, _a1) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_ExecOnPod_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'ExecOnPod' +type Client_ExecOnPod_Call struct { + *mock.Call +} + +// ExecOnPod is a helper method to define mock.On call +// - _a0 string +// - _a1 []string +func (_e *Client_Expecter) ExecOnPod(_a0 interface{}, _a1 interface{}) *Client_ExecOnPod_Call { + return &Client_ExecOnPod_Call{Call: _e.mock.On("ExecOnPod", _a0, _a1)} +} + +func (_c *Client_ExecOnPod_Call) Run(run func(_a0 string, _a1 []string)) *Client_ExecOnPod_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(string), args[1].([]string)) + }) + return _c +} + +func (_c *Client_ExecOnPod_Call) Return(_a0 *bytes.Buffer, _a1 error) *Client_ExecOnPod_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Client_ExecOnPod_Call) RunAndReturn(run func(string, []string) (*bytes.Buffer, error)) *Client_ExecOnPod_Call { + _c.Call.Return(run) + return _c +} + +// GetControllerDeploymentUID provides a mock function with given fields: +func (_m *Client) GetControllerDeploymentUID() (types.UID, error) { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for GetControllerDeploymentUID") + } + + var r0 types.UID + var r1 error + if rf, ok := ret.Get(0).(func() (types.UID, error)); ok { + return rf() + } + if rf, ok := ret.Get(0).(func() types.UID); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(types.UID) + } + + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_GetControllerDeploymentUID_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'GetControllerDeploymentUID' +type Client_GetControllerDeploymentUID_Call struct { + *mock.Call +} + +// GetControllerDeploymentUID is a helper method to define mock.On call +func (_e *Client_Expecter) GetControllerDeploymentUID() *Client_GetControllerDeploymentUID_Call { + return &Client_GetControllerDeploymentUID_Call{Call: _e.mock.On("GetControllerDeploymentUID")} +} + +func (_c *Client_GetControllerDeploymentUID_Call) Run(run func()) *Client_GetControllerDeploymentUID_Call { + _c.Call.Run(func(args mock.Arguments) { + run() + }) + return _c +} + +func (_c *Client_GetControllerDeploymentUID_Call) Return(_a0 types.UID, _a1 error) *Client_GetControllerDeploymentUID_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Client_GetControllerDeploymentUID_Call) RunAndReturn(run func() (types.UID, error)) *Client_GetControllerDeploymentUID_Call { + _c.Call.Return(run) + return _c +} + +// GetDeployment provides a mock function with given fields: _a0 +func (_m *Client) GetDeployment(_a0 
string) (*v1.Deployment, error) { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for GetDeployment") + } + + var r0 *v1.Deployment + var r1 error + if rf, ok := ret.Get(0).(func(string) (*v1.Deployment, error)); ok { + return rf(_a0) + } + if rf, ok := ret.Get(0).(func(string) *v1.Deployment); ok { + r0 = rf(_a0) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*v1.Deployment) + } + } + + if rf, ok := ret.Get(1).(func(string) error); ok { + r1 = rf(_a0) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_GetDeployment_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'GetDeployment' +type Client_GetDeployment_Call struct { + *mock.Call +} + +// GetDeployment is a helper method to define mock.On call +// - _a0 string +func (_e *Client_Expecter) GetDeployment(_a0 interface{}) *Client_GetDeployment_Call { + return &Client_GetDeployment_Call{Call: _e.mock.On("GetDeployment", _a0)} +} + +func (_c *Client_GetDeployment_Call) Run(run func(_a0 string)) *Client_GetDeployment_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(string)) + }) + return _c +} + +func (_c *Client_GetDeployment_Call) Return(_a0 *v1.Deployment, _a1 error) *Client_GetDeployment_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Client_GetDeployment_Call) RunAndReturn(run func(string) (*v1.Deployment, error)) *Client_GetDeployment_Call { + _c.Call.Return(run) + return _c +} + +// GetDeploymentsWithLabel provides a mock function with given fields: _a0 +func (_m *Client) GetDeploymentsWithLabel(_a0 string) (*v1.DeploymentList, error) { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for GetDeploymentsWithLabel") + } + + var r0 *v1.DeploymentList + var r1 error + if rf, ok := ret.Get(0).(func(string) (*v1.DeploymentList, error)); ok { + return rf(_a0) + } + if rf, ok := ret.Get(0).(func(string) *v1.DeploymentList); ok { + r0 = rf(_a0) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*v1.DeploymentList) + } + } + + if rf, ok := ret.Get(1).(func(string) error); ok { + r1 = rf(_a0) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_GetDeploymentsWithLabel_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'GetDeploymentsWithLabel' +type Client_GetDeploymentsWithLabel_Call struct { + *mock.Call +} + +// GetDeploymentsWithLabel is a helper method to define mock.On call +// - _a0 string +func (_e *Client_Expecter) GetDeploymentsWithLabel(_a0 interface{}) *Client_GetDeploymentsWithLabel_Call { + return &Client_GetDeploymentsWithLabel_Call{Call: _e.mock.On("GetDeploymentsWithLabel", _a0)} +} + +func (_c *Client_GetDeploymentsWithLabel_Call) Run(run func(_a0 string)) *Client_GetDeploymentsWithLabel_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(string)) + }) + return _c +} + +func (_c *Client_GetDeploymentsWithLabel_Call) Return(_a0 *v1.DeploymentList, _a1 error) *Client_GetDeploymentsWithLabel_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Client_GetDeploymentsWithLabel_Call) RunAndReturn(run func(string) (*v1.DeploymentList, error)) *Client_GetDeploymentsWithLabel_Call { + _c.Call.Return(run) + return _c +} + +// GetJobManagerPod provides a mock function with given fields: +func (_m *Client) GetJobManagerPod() (*corev1.Pod, error) { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for GetJobManagerPod") + } + + var r0 *corev1.Pod + var r1 error + if rf, ok 
:= ret.Get(0).(func() (*corev1.Pod, error)); ok { + return rf() + } + if rf, ok := ret.Get(0).(func() *corev1.Pod); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*corev1.Pod) + } + } + + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_GetJobManagerPod_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'GetJobManagerPod' +type Client_GetJobManagerPod_Call struct { + *mock.Call +} + +// GetJobManagerPod is a helper method to define mock.On call +func (_e *Client_Expecter) GetJobManagerPod() *Client_GetJobManagerPod_Call { + return &Client_GetJobManagerPod_Call{Call: _e.mock.On("GetJobManagerPod")} +} + +func (_c *Client_GetJobManagerPod_Call) Run(run func()) *Client_GetJobManagerPod_Call { + _c.Call.Run(func(args mock.Arguments) { + run() + }) + return _c +} + +func (_c *Client_GetJobManagerPod_Call) Return(_a0 *corev1.Pod, _a1 error) *Client_GetJobManagerPod_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Client_GetJobManagerPod_Call) RunAndReturn(run func() (*corev1.Pod, error)) *Client_GetJobManagerPod_Call { + _c.Call.Return(run) + return _c +} + +// GetLoadBalancer provides a mock function with given fields: +func (_m *Client) GetLoadBalancer() (*corev1.Service, error) { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for GetLoadBalancer") + } + + var r0 *corev1.Service + var r1 error + if rf, ok := ret.Get(0).(func() (*corev1.Service, error)); ok { + return rf() + } + if rf, ok := ret.Get(0).(func() *corev1.Service); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*corev1.Service) + } + } + + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_GetLoadBalancer_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'GetLoadBalancer' +type Client_GetLoadBalancer_Call struct { + *mock.Call +} + +// GetLoadBalancer is a helper method to define mock.On call +func (_e *Client_Expecter) GetLoadBalancer() *Client_GetLoadBalancer_Call { + return &Client_GetLoadBalancer_Call{Call: _e.mock.On("GetLoadBalancer")} +} + +func (_c *Client_GetLoadBalancer_Call) Run(run func()) *Client_GetLoadBalancer_Call { + _c.Call.Run(func(args mock.Arguments) { + run() + }) + return _c +} + +func (_c *Client_GetLoadBalancer_Call) Return(_a0 *corev1.Service, _a1 error) *Client_GetLoadBalancer_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Client_GetLoadBalancer_Call) RunAndReturn(run func() (*corev1.Service, error)) *Client_GetLoadBalancer_Call { + _c.Call.Return(run) + return _c +} + +// GetPodsWithLabel provides a mock function with given fields: _a0 +func (_m *Client) GetPodsWithLabel(_a0 string) (*corev1.PodList, error) { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for GetPodsWithLabel") + } + + var r0 *corev1.PodList + var r1 error + if rf, ok := ret.Get(0).(func(string) (*corev1.PodList, error)); ok { + return rf(_a0) + } + if rf, ok := ret.Get(0).(func(string) *corev1.PodList); ok { + r0 = rf(_a0) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*corev1.PodList) + } + } + + if rf, ok := ret.Get(1).(func(string) error); ok { + r1 = rf(_a0) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_GetPodsWithLabel_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'GetPodsWithLabel' 
+type Client_GetPodsWithLabel_Call struct { + *mock.Call +} + +// GetPodsWithLabel is a helper method to define mock.On call +// - _a0 string +func (_e *Client_Expecter) GetPodsWithLabel(_a0 interface{}) *Client_GetPodsWithLabel_Call { + return &Client_GetPodsWithLabel_Call{Call: _e.mock.On("GetPodsWithLabel", _a0)} +} + +func (_c *Client_GetPodsWithLabel_Call) Run(run func(_a0 string)) *Client_GetPodsWithLabel_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(string)) + }) + return _c +} + +func (_c *Client_GetPodsWithLabel_Call) Return(_a0 *corev1.PodList, _a1 error) *Client_GetPodsWithLabel_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Client_GetPodsWithLabel_Call) RunAndReturn(run func(string) (*corev1.PodList, error)) *Client_GetPodsWithLabel_Call { + _c.Call.Return(run) + return _c +} + +// GetSecret provides a mock function with given fields: _a0 +func (_m *Client) GetSecret(_a0 string) (*corev1.Secret, error) { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for GetSecret") + } + + var r0 *corev1.Secret + var r1 error + if rf, ok := ret.Get(0).(func(string) (*corev1.Secret, error)); ok { + return rf(_a0) + } + if rf, ok := ret.Get(0).(func(string) *corev1.Secret); ok { + r0 = rf(_a0) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*corev1.Secret) + } + } + + if rf, ok := ret.Get(1).(func(string) error); ok { + r1 = rf(_a0) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_GetSecret_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'GetSecret' +type Client_GetSecret_Call struct { + *mock.Call +} + +// GetSecret is a helper method to define mock.On call +// - _a0 string +func (_e *Client_Expecter) GetSecret(_a0 interface{}) *Client_GetSecret_Call { + return &Client_GetSecret_Call{Call: _e.mock.On("GetSecret", _a0)} +} + +func (_c *Client_GetSecret_Call) Run(run func(_a0 string)) *Client_GetSecret_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(string)) + }) + return _c +} + +func (_c *Client_GetSecret_Call) Return(_a0 *corev1.Secret, _a1 error) *Client_GetSecret_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Client_GetSecret_Call) RunAndReturn(run func(string) (*corev1.Secret, error)) *Client_GetSecret_Call { + _c.Call.Return(run) + return _c +} + +// GetService provides a mock function with given fields: _a0 +func (_m *Client) GetService(_a0 string) (*corev1.Service, error) { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for GetService") + } + + var r0 *corev1.Service + var r1 error + if rf, ok := ret.Get(0).(func(string) (*corev1.Service, error)); ok { + return rf(_a0) + } + if rf, ok := ret.Get(0).(func(string) *corev1.Service); ok { + r0 = rf(_a0) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*corev1.Service) + } + } + + if rf, ok := ret.Get(1).(func(string) error); ok { + r1 = rf(_a0) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_GetService_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'GetService' +type Client_GetService_Call struct { + *mock.Call +} + +// GetService is a helper method to define mock.On call +// - _a0 string +func (_e *Client_Expecter) GetService(_a0 interface{}) *Client_GetService_Call { + return &Client_GetService_Call{Call: _e.mock.On("GetService", _a0)} +} + +func (_c *Client_GetService_Call) Run(run func(_a0 string)) *Client_GetService_Call { + _c.Call.Run(func(args mock.Arguments) { + 
run(args[0].(string)) + }) + return _c +} + +func (_c *Client_GetService_Call) Return(_a0 *corev1.Service, _a1 error) *Client_GetService_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Client_GetService_Call) RunAndReturn(run func(string) (*corev1.Service, error)) *Client_GetService_Call { + _c.Call.Return(run) + return _c +} + +// GetServicesWithLabel provides a mock function with given fields: _a0 +func (_m *Client) GetServicesWithLabel(_a0 string) (*corev1.ServiceList, error) { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for GetServicesWithLabel") + } + + var r0 *corev1.ServiceList + var r1 error + if rf, ok := ret.Get(0).(func(string) (*corev1.ServiceList, error)); ok { + return rf(_a0) + } + if rf, ok := ret.Get(0).(func(string) *corev1.ServiceList); ok { + r0 = rf(_a0) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*corev1.ServiceList) + } + } + + if rf, ok := ret.Get(1).(func(string) error); ok { + r1 = rf(_a0) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_GetServicesWithLabel_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'GetServicesWithLabel' +type Client_GetServicesWithLabel_Call struct { + *mock.Call +} + +// GetServicesWithLabel is a helper method to define mock.On call +// - _a0 string +func (_e *Client_Expecter) GetServicesWithLabel(_a0 interface{}) *Client_GetServicesWithLabel_Call { + return &Client_GetServicesWithLabel_Call{Call: _e.mock.On("GetServicesWithLabel", _a0)} +} + +func (_c *Client_GetServicesWithLabel_Call) Run(run func(_a0 string)) *Client_GetServicesWithLabel_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(string)) + }) + return _c +} + +func (_c *Client_GetServicesWithLabel_Call) Return(_a0 *corev1.ServiceList, _a1 error) *Client_GetServicesWithLabel_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Client_GetServicesWithLabel_Call) RunAndReturn(run func(string) (*corev1.ServiceList, error)) *Client_GetServicesWithLabel_Call { + _c.Call.Return(run) + return _c +} + +// IsJobManagerReady provides a mock function with given fields: +func (_m *Client) IsJobManagerReady() (bool, error) { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for IsJobManagerReady") + } + + var r0 bool + var r1 error + if rf, ok := ret.Get(0).(func() (bool, error)); ok { + return rf() + } + if rf, ok := ret.Get(0).(func() bool); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(bool) + } + + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_IsJobManagerReady_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'IsJobManagerReady' +type Client_IsJobManagerReady_Call struct { + *mock.Call +} + +// IsJobManagerReady is a helper method to define mock.On call +func (_e *Client_Expecter) IsJobManagerReady() *Client_IsJobManagerReady_Call { + return &Client_IsJobManagerReady_Call{Call: _e.mock.On("IsJobManagerReady")} +} + +func (_c *Client_IsJobManagerReady_Call) Run(run func()) *Client_IsJobManagerReady_Call { + _c.Call.Run(func(args mock.Arguments) { + run() + }) + return _c +} + +func (_c *Client_IsJobManagerReady_Call) Return(_a0 bool, _a1 error) *Client_IsJobManagerReady_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Client_IsJobManagerReady_Call) RunAndReturn(run func() (bool, error)) *Client_IsJobManagerReady_Call { + _c.Call.Return(run) + return _c +} + +// SecretExists 
provides a mock function with given fields: _a0 +func (_m *Client) SecretExists(_a0 string) (*corev1.Secret, bool, error) { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for SecretExists") + } + + var r0 *corev1.Secret + var r1 bool + var r2 error + if rf, ok := ret.Get(0).(func(string) (*corev1.Secret, bool, error)); ok { + return rf(_a0) + } + if rf, ok := ret.Get(0).(func(string) *corev1.Secret); ok { + r0 = rf(_a0) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*corev1.Secret) + } + } + + if rf, ok := ret.Get(1).(func(string) bool); ok { + r1 = rf(_a0) + } else { + r1 = ret.Get(1).(bool) + } + + if rf, ok := ret.Get(2).(func(string) error); ok { + r2 = rf(_a0) + } else { + r2 = ret.Error(2) + } + + return r0, r1, r2 +} + +// Client_SecretExists_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'SecretExists' +type Client_SecretExists_Call struct { + *mock.Call +} + +// SecretExists is a helper method to define mock.On call +// - _a0 string +func (_e *Client_Expecter) SecretExists(_a0 interface{}) *Client_SecretExists_Call { + return &Client_SecretExists_Call{Call: _e.mock.On("SecretExists", _a0)} +} + +func (_c *Client_SecretExists_Call) Run(run func(_a0 string)) *Client_SecretExists_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(string)) + }) + return _c +} + +func (_c *Client_SecretExists_Call) Return(_a0 *corev1.Secret, _a1 bool, _a2 error) *Client_SecretExists_Call { + _c.Call.Return(_a0, _a1, _a2) + return _c +} + +func (_c *Client_SecretExists_Call) RunAndReturn(run func(string) (*corev1.Secret, bool, error)) *Client_SecretExists_Call { + _c.Call.Return(run) + return _c +} + +// ServiceExists provides a mock function with given fields: _a0 +func (_m *Client) ServiceExists(_a0 string) (*corev1.Service, bool, error) { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for ServiceExists") + } + + var r0 *corev1.Service + var r1 bool + var r2 error + if rf, ok := ret.Get(0).(func(string) (*corev1.Service, bool, error)); ok { + return rf(_a0) + } + if rf, ok := ret.Get(0).(func(string) *corev1.Service); ok { + r0 = rf(_a0) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*corev1.Service) + } + } + + if rf, ok := ret.Get(1).(func(string) bool); ok { + r1 = rf(_a0) + } else { + r1 = ret.Get(1).(bool) + } + + if rf, ok := ret.Get(2).(func(string) error); ok { + r2 = rf(_a0) + } else { + r2 = ret.Error(2) + } + + return r0, r1, r2 +} + +// Client_ServiceExists_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'ServiceExists' +type Client_ServiceExists_Call struct { + *mock.Call +} + +// ServiceExists is a helper method to define mock.On call +// - _a0 string +func (_e *Client_Expecter) ServiceExists(_a0 interface{}) *Client_ServiceExists_Call { + return &Client_ServiceExists_Call{Call: _e.mock.On("ServiceExists", _a0)} +} + +func (_c *Client_ServiceExists_Call) Run(run func(_a0 string)) *Client_ServiceExists_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(string)) + }) + return _c +} + +func (_c *Client_ServiceExists_Call) Return(_a0 *corev1.Service, _a1 bool, _a2 error) *Client_ServiceExists_Call { + _c.Call.Return(_a0, _a1, _a2) + return _c +} + +func (_c *Client_ServiceExists_Call) RunAndReturn(run func(string) (*corev1.Service, bool, error)) *Client_ServiceExists_Call { + _c.Call.Return(run) + return _c +} + +// UpdateService provides a mock function with given fields: _a0 +func (_m *Client) 
UpdateService(_a0 *corev1.Service) error { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for UpdateService") + } + + var r0 error + if rf, ok := ret.Get(0).(func(*corev1.Service) error); ok { + r0 = rf(_a0) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// Client_UpdateService_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'UpdateService' +type Client_UpdateService_Call struct { + *mock.Call +} + +// UpdateService is a helper method to define mock.On call +// - _a0 *corev1.Service +func (_e *Client_Expecter) UpdateService(_a0 interface{}) *Client_UpdateService_Call { + return &Client_UpdateService_Call{Call: _e.mock.On("UpdateService", _a0)} +} + +func (_c *Client_UpdateService_Call) Run(run func(_a0 *corev1.Service)) *Client_UpdateService_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(*corev1.Service)) + }) + return _c +} + +func (_c *Client_UpdateService_Call) Return(_a0 error) *Client_UpdateService_Call { + _c.Call.Return(_a0) + return _c +} + +func (_c *Client_UpdateService_Call) RunAndReturn(run func(*corev1.Service) error) *Client_UpdateService_Call { + _c.Call.Return(run) + return _c +} + +// NewClient creates a new instance of Client. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +// The first argument is typically a *testing.T value. +func NewClient(t interface { + mock.TestingT + Cleanup(func()) +}) *Client { + mock := &Client{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} diff --git a/controller/src/mocks/request/Getter.go b/controller/src/mocks/request/Getter.go new file mode 100644 index 0000000..b1b6e54 --- /dev/null +++ b/controller/src/mocks/request/Getter.go @@ -0,0 +1,93 @@ +// Code generated by mockery v2.42.2. DO NOT EDIT. 
+ +package mocks + +import ( + request "controller/internal/request" + + mock "github.com/stretchr/testify/mock" +) + +// Getter is an autogenerated mock type for the Getter type +type Getter struct { + mock.Mock +} + +type Getter_Expecter struct { + mock *mock.Mock +} + +func (_m *Getter) EXPECT() *Getter_Expecter { + return &Getter_Expecter{mock: &_m.Mock} +} + +// GetRequest provides a mock function with given fields: +func (_m *Getter) GetRequest() (*request.ResizeRequest, error) { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for GetRequest") + } + + var r0 *request.ResizeRequest + var r1 error + if rf, ok := ret.Get(0).(func() (*request.ResizeRequest, error)); ok { + return rf() + } + if rf, ok := ret.Get(0).(func() *request.ResizeRequest); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*request.ResizeRequest) + } + } + + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Getter_GetRequest_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'GetRequest' +type Getter_GetRequest_Call struct { + *mock.Call +} + +// GetRequest is a helper method to define mock.On call +func (_e *Getter_Expecter) GetRequest() *Getter_GetRequest_Call { + return &Getter_GetRequest_Call{Call: _e.mock.On("GetRequest")} +} + +func (_c *Getter_GetRequest_Call) Run(run func()) *Getter_GetRequest_Call { + _c.Call.Run(func(args mock.Arguments) { + run() + }) + return _c +} + +func (_c *Getter_GetRequest_Call) Return(_a0 *request.ResizeRequest, _a1 error) *Getter_GetRequest_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Getter_GetRequest_Call) RunAndReturn(run func() (*request.ResizeRequest, error)) *Getter_GetRequest_Call { + _c.Call.Return(run) + return _c +} + +// NewGetter creates a new instance of Getter. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +// The first argument is typically a *testing.T value. +func NewGetter(t interface { + mock.TestingT + Cleanup(func()) +}) *Getter { + mock := &Getter{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} diff --git a/controller/src/mocks/rescaler/Rescaler.go b/controller/src/mocks/rescaler/Rescaler.go new file mode 100644 index 0000000..e4a41e0 --- /dev/null +++ b/controller/src/mocks/rescaler/Rescaler.go @@ -0,0 +1,64 @@ +// Code generated by mockery v2.42.2. DO NOT EDIT. 
+ +package mocks + +import mock "github.com/stretchr/testify/mock" + +// Rescaler is an autogenerated mock type for the Rescaler type +type Rescaler struct { + mock.Mock +} + +type Rescaler_Expecter struct { + mock *mock.Mock +} + +func (_m *Rescaler) EXPECT() *Rescaler_Expecter { + return &Rescaler_Expecter{mock: &_m.Mock} +} + +// Rescale provides a mock function with given fields: +func (_m *Rescaler) Rescale() { + _m.Called() +} + +// Rescaler_Rescale_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Rescale' +type Rescaler_Rescale_Call struct { + *mock.Call +} + +// Rescale is a helper method to define mock.On call +func (_e *Rescaler_Expecter) Rescale() *Rescaler_Rescale_Call { + return &Rescaler_Rescale_Call{Call: _e.mock.On("Rescale")} +} + +func (_c *Rescaler_Rescale_Call) Run(run func()) *Rescaler_Rescale_Call { + _c.Call.Run(func(args mock.Arguments) { + run() + }) + return _c +} + +func (_c *Rescaler_Rescale_Call) Return() *Rescaler_Rescale_Call { + _c.Call.Return() + return _c +} + +func (_c *Rescaler_Rescale_Call) RunAndReturn(run func()) *Rescaler_Rescale_Call { + _c.Call.Return(run) + return _c +} + +// NewRescaler creates a new instance of Rescaler. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +// The first argument is typically a *testing.T value. +func NewRescaler(t interface { + mock.TestingT + Cleanup(func()) +}) *Rescaler { + mock := &Rescaler{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} diff --git a/controller/src/mocks/resize/Resizer.go b/controller/src/mocks/resize/Resizer.go new file mode 100644 index 0000000..9290299 --- /dev/null +++ b/controller/src/mocks/resize/Resizer.go @@ -0,0 +1,187 @@ +// Code generated by mockery v2.42.2. DO NOT EDIT. 
+ +package mocks + +import ( + resize "controller/internal/resize" + + mock "github.com/stretchr/testify/mock" + + specs "controller/internal/specs" +) + +// Resizer is an autogenerated mock type for the Resizer type +type Resizer struct { + mock.Mock +} + +type Resizer_Expecter struct { + mock *mock.Mock +} + +func (_m *Resizer) EXPECT() *Resizer_Expecter { + return &Resizer_Expecter{mock: &_m.Mock} +} + +// AddWorkers provides a mock function with given fields: _a0 +func (_m *Resizer) AddWorkers(_a0 []specs.WorkerInfo) error { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for AddWorkers") + } + + var r0 error + if rf, ok := ret.Get(0).(func([]specs.WorkerInfo) error); ok { + r0 = rf(_a0) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// Resizer_AddWorkers_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'AddWorkers' +type Resizer_AddWorkers_Call struct { + *mock.Call +} + +// AddWorkers is a helper method to define mock.On call +// - _a0 []specs.WorkerInfo +func (_e *Resizer_Expecter) AddWorkers(_a0 interface{}) *Resizer_AddWorkers_Call { + return &Resizer_AddWorkers_Call{Call: _e.mock.On("AddWorkers", _a0)} +} + +func (_c *Resizer_AddWorkers_Call) Run(run func(_a0 []specs.WorkerInfo)) *Resizer_AddWorkers_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].([]specs.WorkerInfo)) + }) + return _c +} + +func (_c *Resizer_AddWorkers_Call) Return(_a0 error) *Resizer_AddWorkers_Call { + _c.Call.Return(_a0) + return _c +} + +func (_c *Resizer_AddWorkers_Call) RunAndReturn(run func([]specs.WorkerInfo) error) *Resizer_AddWorkers_Call { + _c.Call.Return(run) + return _c +} + +// DeleteWorkers provides a mock function with given fields: _a0 +func (_m *Resizer) DeleteWorkers(_a0 []string) error { + ret := _m.Called(_a0) + + if len(ret) == 0 { + panic("no return value specified for DeleteWorkers") + } + + var r0 error + if rf, ok := ret.Get(0).(func([]string) error); ok { + r0 = rf(_a0) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// Resizer_DeleteWorkers_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'DeleteWorkers' +type Resizer_DeleteWorkers_Call struct { + *mock.Call +} + +// DeleteWorkers is a helper method to define mock.On call +// - _a0 []string +func (_e *Resizer_Expecter) DeleteWorkers(_a0 interface{}) *Resizer_DeleteWorkers_Call { + return &Resizer_DeleteWorkers_Call{Call: _e.mock.On("DeleteWorkers", _a0)} +} + +func (_c *Resizer_DeleteWorkers_Call) Run(run func(_a0 []string)) *Resizer_DeleteWorkers_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].([]string)) + }) + return _c +} + +func (_c *Resizer_DeleteWorkers_Call) Return(_a0 error) *Resizer_DeleteWorkers_Call { + _c.Call.Return(_a0) + return _c +} + +func (_c *Resizer_DeleteWorkers_Call) RunAndReturn(run func([]string) error) *Resizer_DeleteWorkers_Call { + _c.Call.Return(run) + return _c +} + +// GetWorkers provides a mock function with given fields: +func (_m *Resizer) GetWorkers() ([]resize.Worker, error) { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for GetWorkers") + } + + var r0 []resize.Worker + var r1 error + if rf, ok := ret.Get(0).(func() ([]resize.Worker, error)); ok { + return rf() + } + if rf, ok := ret.Get(0).(func() []resize.Worker); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]resize.Worker) + } + } + + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } 
+ + return r0, r1 +} + +// Resizer_GetWorkers_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'GetWorkers' +type Resizer_GetWorkers_Call struct { + *mock.Call +} + +// GetWorkers is a helper method to define mock.On call +func (_e *Resizer_Expecter) GetWorkers() *Resizer_GetWorkers_Call { + return &Resizer_GetWorkers_Call{Call: _e.mock.On("GetWorkers")} +} + +func (_c *Resizer_GetWorkers_Call) Run(run func()) *Resizer_GetWorkers_Call { + _c.Call.Run(func(args mock.Arguments) { + run() + }) + return _c +} + +func (_c *Resizer_GetWorkers_Call) Return(_a0 []resize.Worker, _a1 error) *Resizer_GetWorkers_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Resizer_GetWorkers_Call) RunAndReturn(run func() ([]resize.Worker, error)) *Resizer_GetWorkers_Call { + _c.Call.Return(run) + return _c +} + +// NewResizer creates a new instance of Resizer. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +// The first argument is typically a *testing.T value. +func NewResizer(t interface { + mock.TestingT + Cleanup(func()) +}) *Resizer { + mock := &Resizer{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} diff --git a/helm_values.md b/helm_values.md new file mode 100644 index 0000000..e6d571b --- /dev/null +++ b/helm_values.md @@ -0,0 +1,59 @@ +# Helm Values for MATLAB Parallel Server in Kubernetes + +The following table lists the configurable Helm values that you can set in the YAML file you use to configure MATLAB Parallel Server in Kubernetes. +If you do not include a parameter in your YAML file, your configuration uses the default value. +For an example values file, see the sample after the table. + + +**Parameter** | **Description** | **Default Value** +----------------------------|-----------------|------------------- +`autoCreateLoadBalancer` | Flag to automatically create a Kubernetes load balancer to expose MATLAB Job Scheduler to MATLAB clients outside the cluster. See the [Customize Load Balancer](README.md#customize-load-balancer) section for instructions to create your own load balancer. | `true` +`autoScalingPeriod` | Period in seconds with which the controller checks the cluster's size requirements and automatically scales the number of workers up and down if needed. | `15` +`basePort` | Base port of the MATLAB Job Scheduler service. | `27350` +`checkpointPVC` | Name of the PersistentVolumeClaim that is bound to the PersistentVolume used to retain job data. | — +`clusterHost` | Custom host to use in the cluster profile. If unset, the cluster profile uses the external address of the load balancer. | — +`controllerImage` | URI of the image to use for the MATLAB Job Scheduler controller, a pod that creates the job manager and automatically scales the number of workers up and down. Set this value if you want to use a privately hosted version of this image rather than the version hosted on the GitHub Container registry. | `ghcr.io/mathworks-ref-arch/matlab-parallel-server-k8s/mjs-controller-image` +`controllerImagePullPolicy` | Pull policy for the MATLAB Job Scheduler controller. | `IfNotPresent` +`controllerImageTag` | Tag of the image to use for the MATLAB Job Scheduler controller. If you do not set this value, the Helm chart uses the `appVersion` defined in `Chart.yaml` as the tag. | — +`haproxyImage` | URI of the [HAproxy Docker image](https://hub.docker.com/_/haproxy/), which is used to proxy incoming traffic. Set this value if you want to use a privately hosted version of this image.
| `haproxy` +`haproxyImagePullPolicy` | Pull policy for the HAproxy image. | `IfNotPresent` +`idleStop` | Time in seconds after which idle worker pods are removed. | `300` +`internalClientsOnly` | Flag to allow only MATLAB clients running inside the Kubernetes cluster to connect to the MATLAB Job Scheduler. | `false` +`jobManagerCPULimit` | CPU limit for the job manager pod. | — +`jobManagerCPURequest` | CPU request for the job manager pod. | `1` +`jobManagerGroupID` | Group ID of the user account that MATLAB Job Scheduler uses to run the job manager pod. The user must have write permission for the checkpoint and log PersistentVolumes. To find the group ID, on a Linux machine, run the command `id -g` in the terminal. | `0` +`jobManagerMemoryLimit` | Memory limit for the job manager pod. | — +`jobManagerMemoryRequest` | Memory request for the job manager pod. | `4Gi` +`jobManagerName` | Name of the MATLAB Job Scheduler job manager. | `MJS_Kubernetes` +`jobManagerUserID` | User ID of the user account that MATLAB Job Scheduler uses to run the job manager pod. The user must have write permission for the checkpoint and log PersistentVolumes. To find the user ID, on a Linux machine, run `id -u` in the terminal. | `0` +`logLevel` | Verbosity level of MATLAB Job Scheduler logging. | `0` +`logPVC` | Name of the PersistentVolumeClaim that is bound to the PersistentVolume used to retain job manager logs. | — +`matlabImage` | URI of the image to use for the MATLAB Job Scheduler pods. If unset, your configuration uses the [`mathworks/matlab-deps`](https://hub.docker.com/r/mathworks/matlab-deps) image. Set this value if you built a custom Docker image containing a MATLAB Parallel Server installation or if you want to use a privately hosted version of the `mathworks/matlab-deps` image. | — +`matlabImagePullPolicy` | Pull policy for the MATLAB image. | `IfNotPresent` +`matlabPVC` | Name of the PersistentVolumeClaim that is bound to the PersistentVolume with a MATLAB Parallel Server installation. Set this option only if you did not build a Docker image containing a MATLAB Parallel Server installation. | — +`matlabRelease` | Release number of the MATLAB version to use. | `r2024a` +`maxWorkers` | Maximum number of workers that the cluster can automatically resize to. | — +`minWorkers` | Minimum number of workers to run in the cluster. | `0` +`networkLicenseManager` | Address of a network license manager with format `port@host`. | — +`poolProxyBasePort` | Base port for the parallel pool proxy pods. | `30000` +`poolProxyCPULimit` | CPU limit for each parallel pool proxy pod. | — +`poolProxyCPURequest` | CPU request for each parallel pool proxy pod. | `0.5` +`poolProxyImage` | URI of the image to use for pods that proxy connections in interactive parallel pools. Set this value if you want to use a privately hosted version of this image rather than the version hosted by MathWorks. | `containers.mathworks.com/matlab-parallel-server-k8s/parallel-server-proxy-image` +`poolProxyImagePullPolicy` | Pull policy for the pool proxy image. | `IfNotPresent` +`poolProxyImageTag` | Tag of the image to use for the pool proxy. If you do not set this value, the Helm chart uses the `matlabRelease` parameter as the tag. | — +`poolProxyMemoryLimit` | Memory limit for each parallel pool proxy pod. | — +`poolProxyMemoryRequest` | Memory request for each parallel pool proxy pod. | `500Mi` +`requireClientCertificate` | Flag that requires MATLAB clients to have a certificate to connect to the job manager. 
| `true` +`requireScriptVerification` | Flag that requires verification to run privileged commands on the cluster. | `true` +`securityLevel` | MATLAB Job Scheduler security level. | `2` +`stopWorkerGracePeriod` | Grace period in seconds for stopping worker pods. | `60` +`useOnlineLicensing` | Flag to use online licensing. | `false` +`useSecureCommunication` | Flag to use secure communication between job manager and workers. | `true` +`workerCPULimit` | CPU limit for each worker pod. | — +`workerCPURequest` | CPU request for each worker pod. | `2` +`workerLogPVC` | Name of the PersistentVolumeClaim that is bound to the PersistentVolume used to retain worker logs. | — +`workerMemoryLimit` | Memory limit for each worker pod. | `8Gi` +`workerMemoryRequest` | Memory request for each worker pod. | `8Gi` +`workerPassword` | Password for the username that MATLAB Parallel Server uses to run jobs. | `matlab` +`workerUsername` | Username that MATLAB Parallel Server uses to run jobs. | `matlab` +`workersPerPoolProxy` | Maximum number of workers using each parallel pool proxy. | `32` +
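+For example, a values file for this chart might look like the following. This is a minimal sketch: the parameter names come from the table above, and the PersistentVolumeClaim names, worker count, and license manager address are placeholders that you replace with values for your own cluster. + +```yaml +# Example values.yaml with placeholder values; adjust for your cluster. +matlabRelease: r2024a +maxWorkers: 16 +matlabPVC: matlab-pvc +checkpointPVC: checkpoint-pvc +logPVC: log-pvc +workerLogPVC: worker-log-pvc +networkLicenseManager: 27000@licenseserver.example.com +workerCPURequest: 2 +workerMemoryRequest: 8Gi +``` + +Pass this file to Helm with the `-f` (or `--values`) flag when you install the chart. +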
diff --git a/mjs/Chart.yaml b/mjs/Chart.yaml new file mode 100644 index 0000000..cdccb10 --- /dev/null +++ b/mjs/Chart.yaml @@ -0,0 +1,7 @@ +# Copyright 2024 The MathWorks, Inc. +apiVersion: v2 +name: mjs +description: A Helm chart for MATLAB (R) Job Scheduler in Kubernetes +type: application +version: 1.0.0 +appVersion: 1.0.0 diff --git a/mjs/templates/haproxy.yaml b/mjs/templates/haproxy.yaml new file mode 100644 index 0000000..de9e35d --- /dev/null +++ b/mjs/templates/haproxy.yaml @@ -0,0 +1,165 @@ +# Template for a deployment running HAproxy. +# This proxies incoming connections to the job manager +# or workers via a single external load balancer. +# Copyright 2024 The MathWorks, Inc. +{{- if (not .Values.internalClientsOnly) }} # Only need HAproxy if we support clients outside of Kubernetes + +# Create the HAproxy config file; this configures +# proxying of connections based on TCP port. +{{- $poolProxyPrefix := "mjs-pool-proxy" }} +{{- $configMapName := "haproxy-config" }} +{{- $configFileName := "haproxy.cfg" }} +{{- $numProxies := divf .Values.maxWorkers .Values.workersPerPoolProxy | ceil | int }} +{{- $minPortAllWorkers := add $.Values.basePort 10 | int }} # Minimum worker port if not using a parallel pool proxy +{{- $name := "mjs-ingress-proxy" }} +{{- $lookupPort := add .Values.basePort 6 | int }} +{{- $jobManagerPort := add .Values.basePort 9 | int }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ $configMapName }} +data: + {{ $configFileName }}: | + global + log stdout local0 + defaults + default-server init-addr last,libc,none + log global + option tcplog + mode tcp + timeout connect 30s + timeout client 300s + timeout server 300s + # Rules for proxying traffic to the job manager pod. + frontend front-lookup + bind {{ printf "*:%d" $lookupPort }} + default_backend back-mjs-job-manager + frontend front-jobmanager + bind {{ printf "*:%d" $jobManagerPort }} + default_backend back-mjs-job-manager + backend back-mjs-job-manager + server mjs-job-manager mjs-job-manager +{{- if .Values.usePoolProxy | default true }} + # Rules for proxying traffic to the parallel pool proxies. + # Each parallel pool proxy has a unique port, which should be mapped to the + # corresponding Kubernetes service for that proxy. + {{- range untilStep 0 $numProxies 1 }} + {{- $portNum := add $.Values.poolProxyBasePort . | int }} + {{- $poolProxyName := printf "%s-%d" $poolProxyPrefix (add . 1) }} + frontend front-{{ $poolProxyName }} + bind {{ printf "*:%d" $portNum }} + default_backend back-{{ $poolProxyName }} + backend back-{{ $poolProxyName }} + server {{ $poolProxyName }} {{ $poolProxyName }} + {{- end }} +{{- else }} + # Rules for proxying parallel pool traffic to the workers. + # Each worker has a set of unique ports, which should be mapped to the + # corresponding Kubernetes service for that worker. + {{- range untilStep 0 (.Values.maxWorkers | int) 1 }} + {{- $workerName := printf "mjs-worker-%d" (add . 1) }} + {{- $minPort := add $minPortAllWorkers (mul $.Values.portsPerWorker .) | int }} + {{- range untilStep 0 ($.Values.portsPerWorker | int) 1 }} + frontend {{ printf "front-%s-%d" $workerName . }} + bind {{ printf "*:%d" (add $minPort .) }} + default_backend back-{{ $workerName }} + {{- end }} + backend back-{{ $workerName }} + server {{ $workerName }} {{ $workerName }} + {{- end }} +{{- end }} +--- + +# Create the HAproxy deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ $name }} +spec: + selector: + matchLabels: + app: {{ $name }} + replicas: 1 + template: + metadata: + labels: + app: {{ $name }} + spec: + # If set to false, disable creation of environment variables for services + enableServiceLinks: {{ .Values.enableServiceLinks | default false }} + + containers: + - name: haproxy + image: {{ $.Values.haproxyImage }} + imagePullPolicy: {{ $.Values.haproxyImagePullPolicy }} + + # Pass the config file path as an input argument + {{- $configDir := "/usr/local/etc/haproxy/" }} + args: + - "-f" + - {{ printf "%s/%s" $configDir $configFileName }} + + # Mount the config file from the ConfigMap + {{- $configVolName := "config-volume" }} + volumeMounts: + - name: {{ $configVolName }} + mountPath: {{ $configDir }} + + volumes: + - name: {{ $configVolName }} + configMap: + name: {{ $configMapName }} +--- + +# Create a Kubernetes Service for each HAproxy backend +# (HAproxy will error if a backend cannot be found) +{{- if .Values.usePoolProxy | default true }} + {{- range untilStep 0 $numProxies 1 }} + {{- $portNum := add $.Values.poolProxyBasePort . | int }} + {{- $poolProxyNum := add . 1 }} + {{- $poolProxyName := printf "%s-%d" $poolProxyPrefix $poolProxyNum }} +apiVersion: v1 +kind: Service +metadata: + name: {{ $poolProxyName }} + labels: + app: {{ $poolProxyPrefix }} + proxyName: {{ $poolProxyName }} + port: {{ $portNum | quote }} + proxyID: {{ $poolProxyNum | quote }} +spec: + type: ClusterIP + selector: + proxyName: {{ $poolProxyName }} + ports: + - protocol: TCP + port: {{ $portNum }} + targetPort: {{ $portNum }} +--- + {{- end }} +{{- else }} + {{- $minPortAllWorkers := add $.Values.basePort 10 | int }} + {{- range untilStep 0 (.Values.maxWorkers | int) 1 }} + {{- $workerName := printf "mjs-worker-%d" (add . 1) }} +apiVersion: v1 +kind: Service +metadata: + name: {{ $workerName }} +spec: + # Match to the MJS worker pod + selector: + workerName: {{ $workerName }} + + # Expose unique pool ports needed by this worker + ports: + {{- $minPort := add $minPortAllWorkers (mul $.Values.portsPerWorker .) | int }} + {{- range untilStep 0 ($.Values.portsPerWorker | int) 1 }} + - name: {{ printf "tcp-%d" . }} + protocol: TCP + port: {{ add $minPort . }} + targetPort: {{ add $minPort .
}} + {{- end }} + {{- end }} +--- +{{- end }} +{{- end }} diff --git a/mjs/templates/mjs.yaml b/mjs/templates/mjs.yaml new file mode 100644 index 0000000..8de754e --- /dev/null +++ b/mjs/templates/mjs.yaml @@ -0,0 +1,377 @@ +# Templates for the MATLAB Job Scheduler config files and controller deployment. +# Copyright 2024 The MathWorks, Inc. + +{{- $jobManagerHostname := "mjs-job-manager" }} +{{- $logBase := "/mjs/log" }} +{{- $isNonRoot := ne (int .Values.jobManagerUserID) 0 }} +{{- if (and $isNonRoot (empty .Values.logPVC)) }} # If running as non-root user and not mounting log directory, use a directory that a non-root user can create +{{- $logBase = "/tmp/log" }} +{{- end }} +{{- $matlabRoot := "/opt/matlab" }} +{{- $mjsDefDir := "/mjs/config" }} +{{- $secretDir := "/mjs/secret" }} +{{- $secretFileName := "secret.json" }} +{{- $certFileName := "certificate.json" }} +{{- $checkpointBase := "/mjs/checkpoint" }} +{{- if (and $isNonRoot (empty .Values.checkpointPVC)) }} # If running as non-root user and not mounting checkpointbase, use a directory that a non-root user can create +{{- $checkpointBase = "/tmp/checkpoint" }} +{{- end }} +{{- $localMJSDef := "/tmp/mjs_def.sh" }} +{{- $secretFile := printf "%s/%s" $secretDir $secretFileName }} +{{- $certFile := printf "%s/%s" $secretDir $certFileName }} +{{- $binDir := printf "%s/toolbox/parallel/bin/" $matlabRoot }} +{{- $jobManagerUID := uuidv4 }} +{{- $mjsVolName := "mjs-volume" }} +{{- $logVolName := "log-volume" }} +{{- $checkpointVolName := "checkpoint-volume" }} +{{- $matlabVolName := "matlab-volume" }} +{{- $secretVolName := "secret-volume" }} +{{- $configVolName := "config-volume" }} +{{- $workerStartedFile := "/tmp/worker-started" }} +{{- $matlabImage := .Values.matlabImage | default (printf "mathworks/matlab-deps:%s" .Values.matlabRelease) }} +{{- $basePort := .Values.basePort | int }} +{{- $enableServiceLinks := .Values.enableServiceLinks | default false }} +{{- $commandListenerRestartHours := .Values.commandListenerRestartPeriod | default 12 }} +{{- $commandListenerRestartSeconds := mulf $commandListenerRestartHours 3600 }} + +# Create a ConfigMap containing scripts for the job manager pod and worker pods +{{- $mjsConfigMap := "mjs-config" }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ $mjsConfigMap }} +data: + mjs_def.sh: | + # Common mjs_def.sh file for the job manager and workers + BASE_PORT={{ .Values.basePort }} + MAX_LINUX_WORKERS={{ .Values.maxWorkers }} + CHECKPOINTBASE={{ $checkpointBase }} + PIDBASE=/tmp/pid + USE_SECURE_COMMUNICATION={{ .Values.useSecureCommunication }} + REQUIRE_CLIENT_CERTIFICATE={{ .Values.requireClientCertificate }} + REQUIRE_SCRIPT_VERIFICATION={{ .Values.requireScriptVerification }} + SECURITY_LEVEL={{ .Values.securityLevel }} + SHARED_SECRET_FILE={{ $secretFile }} + USE_ONLINE_LICENSING={{ .Values.useOnlineLicensing }} + jobManager.sh: | + # Script to run on the MATLAB Job Scheduler job manager pod + # Set up the mjs_def file + cp {{ printf "%s/mjs_def.sh" $mjsDefDir }} {{ $localMJSDef }} + echo "HOSTNAME={{ $jobManagerHostname }}" >> {{ $localMJSDef }} + echo "LOGBASE={{ $logBase }}" >> {{ $localMJSDef }} + + # Start the MJS service + {{ printf "%s/mjs" $binDir }} start -mjsdef {{ $localMJSDef }} -loglevel {{ .Values.logLevel }} -disablereliableconnections + + # Start the job manager + {{- $jmCommand := printf "%s/startjobmanager -name \"%s\" -baseport %d" $binDir .Values.jobManagerName $basePort }} + {{- if .Values.requireClientCertificate }} + {{- $jmCommand = printf "%s -certificate 
\"%s\"" $jmCommand $certFile }} + {{- end }} + {{- if .Values.requireScriptVerification }} + {{- $jmCommand = printf "%s -secretfile \"%s\"" $jmCommand $secretFile }} + {{- end }} + {{ $jmCommand }} || echo "startjobmanager failed; there may already be a job manager running" + + # Keep the container running + sleep infinity + worker.sh: | + # Script to run on worker pods + # Set up the mjs_def file + cp {{ printf "%s/mjs_def.sh" $mjsDefDir }} {{ $localMJSDef }} + LOGBASE={{ $logBase }}/${WORKER_NAME} + echo "LOGBASE=${LOGBASE}" >> {{ $localMJSDef }} + echo "HOSTNAME=${HOSTNAME}" >> {{ $localMJSDef }} + + # Ensure log directory exists and is writeable by workers + if [ ! -d "${LOGBASE}" ]; then + mkdir "${LOGBASE}" + fi + chmod o+w "${LOGBASE}" + + # Create a user to run MATLAB as + useradd --create-home {{ .Values.workerUsername }} + echo {{ printf "%s:%s" .Values.workerUsername .Values.workerPassword }} | chpasswd + + # Start the MJS service + {{ printf "%s/mjs" $binDir }} start -mjsdef {{ $localMJSDef }} -loglevel {{ .Values.logLevel }} -disablereliableconnections + + # Start the worker + {{- $workerCmd := printf "%s/startworker -jobmanager \"%s\" -jobmanagerhost \"%s\" -name \"${WORKER_NAME}\" -baseport %d" $binDir .Values.jobManagerName $jobManagerHostname $basePort }} + {{- if .Values.requireScriptVerification }} + {{- $workerCmd = printf "%s -secretfile \"%s\"" $workerCmd $secretFile }} + {{- end }} + {{ $workerCmd }} + + # Add a file to indicate that startworker is complete + touch {{ $workerStartedFile }} + + # Keep the container running + sleep infinity + stopWorker.sh: | + # Script to gracefully shut down a worker + # First, wait for startworker to have finished successfully + while [ ! -f {{ $workerStartedFile | quote }} ]]; do + echo "Waiting for startworker to finish" + sleep 1 + done + + # Stop the worker + {{- $stopCmd := printf "%s/stopworker -clean -name ${WORKER_NAME}" $binDir }} + {{- if .Values.requireScriptVerification }} + {{- $stopCmd = printf "%s -secretfile \"%s\"" $stopCmd $secretFile }} + {{- end }} + {{ $stopCmd }} +--- + +{{- $loadBalancerPrefix := "mjs-ingress-proxy" }} +{{- $loadBalancerName := $loadBalancerPrefix }} +{{- if .Values.autoCreateLoadBalancer }} +{{- $loadBalancerName = printf "%s-%s" $loadBalancerPrefix $jobManagerUID | trunc 63 }} # Use a unique name for the auto-generated service +# Create a LoadBalancer service to route external traffic to HAproxy +{{- $lookupPort := add .Values.basePort 6 | int }} +{{- $jobManagerPort := add .Values.basePort 9 | int }} +{{- $numProxies := divf .Values.maxWorkers .Values.workersPerPoolProxy | ceil | int }} +{{- if (not .Values.internalClientsOnly) }} # Load balancer only needed for external clients +apiVersion: v1 +kind: Service +metadata: + name: {{ $loadBalancerName }} + labels: + app: {{ $loadBalancerPrefix }} +spec: + type: LoadBalancer + selector: + app: {{ $loadBalancerPrefix }} + ports: + # Job manager ports + - name: "tcp-lookup" + protocol: TCP + appProtocol: TCP + port: {{ $lookupPort }} + targetPort: {{ $lookupPort }} + - name: "tcp-jobmanager" + protocol: TCP + appProtocol: TCP + port: {{ $jobManagerPort }} + targetPort: {{ $jobManagerPort }} + + # Pool proxy ports +{{- range untilStep 0 $numProxies 1 }} + {{- $poolProxyPort := add $.Values.poolProxyBasePort . }} + - name: {{ printf "tcp-pool-proxy-%d" (add . 
1) }} + protocol: TCP + appProtocol: TCP + port: {{ $poolProxyPort }} + targetPort: {{ $poolProxyPort }} +{{- end }} +--- +{{- end }} +{{- end }} + +# Create an internal Service for workers to use to communicate with the job manager +apiVersion: v1 +kind: Service +metadata: + name: {{ $jobManagerHostname }} + labels: + app: {{ $jobManagerHostname }} +spec: + type: ClusterIP + + # Match the job manager pod + selector: + app: {{ $jobManagerHostname }} + job-manager-uid: {{ $jobManagerUID }} + + ports: + {{- $minPort := $basePort }} + {{- $maxPort := add $minPort 10 | int }} + {{- range untilStep $minPort $maxPort 1 }} + - name: {{ printf "tcp-%d" . }} + protocol: TCP + appProtocol: TCP + port: {{ . }} + targetPort: {{ . }} + {{- end }} +--- + +# Create the controller config file +{{- $controllerName := "mjs-controller" }} +{{- $controllerConfigMap := printf "%s-config" $controllerName }} +{{- $configFileName := "config.json" }} +{{- $controllerLogFile := printf "%s/controller.log" $logBase }} +{{- if empty .Values.logPVC }} # If not mounting logs, do not write controller logs to a file +{{- $controllerLogFile = "" }} +{{- end }} +{{- $poolProxyImageTag := .Values.poolProxyImageTag | default .Values.matlabRelease }} +{{- $controllerImageTag := .Values.controllerImageTag | default .Chart.AppVersion }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ $controllerConfigMap }} +data: + {{ $configFileName }}: | + { + "BasePort": {{ $basePort }}, + "CertFileName": {{ $certFileName | quote }}, + "CheckpointBase": {{ $checkpointBase | quote }}, + "CheckpointPVC": {{ .Values.checkpointPVC | quote }}, + "ClusterHost": {{ .Values.clusterHost | quote }}, + "ControllerLogfile": {{ $controllerLogFile | quote }}, + "Debug": {{ .Values.debug | default false }}, + "DeploymentName": {{ $controllerName | quote}}, + "EnableServiceLinks": {{ $enableServiceLinks }}, + "JobManagerUID": {{ $jobManagerUID | quote }}, + "IdleStop": {{ .Values.idleStop }}, + "InternalClientsOnly": {{ .Values.internalClientsOnly }}, + "MatlabImage": {{ $matlabImage | quote }}, + "MatlabImagePullPolicy": {{ .Values.matlabImagePullPolicy | quote }}, + "JobManagerName": {{ .Values.jobManagerName | quote }}, + "JobManagerCPULimit": {{ .Values.jobManagerCPULimit | quote }}, + "JobManagerCPURequest": {{ .Values.jobManagerCPURequest | quote }}, + "JobManagerMemoryLimit": {{ .Values.jobManagerMemoryLimit | quote }}, + "JobManagerMemoryRequest": {{ .Values.jobManagerMemoryRequest | quote }}, + "JobManagerGroupID": {{ .Values.jobManagerGroupID }}, + "JobManagerUserID": {{ .Values.jobManagerUserID }}, + "LivenessProbeFailureThreshold": {{ .Values.livenessProbeFailureThreshold | default 3 }}, + "LivenessProbePeriod": {{ .Values.livenessProbePeriod | default 300 }}, + "LivenessProbeTimeout": {{ .Values.livenessProbeTimeout | default 30 }}, + "LoadBalancerName": {{ $loadBalancerName | quote }}, + "LogBase": {{ $logBase | quote }}, + "LogLevel": {{ .Values.logLevel }}, + "LogPVC": {{ .Values.logPVC | quote }}, + "MatlabPVC": {{ .Values.matlabPVC | default "" | quote }}, + "MatlabRoot": {{ $matlabRoot | quote }}, + "MaxWorkers": {{ .Values.maxWorkers }}, + "MinWorkers": {{ .Values.minWorkers }}, + "MJSDefConfigMap": {{ $mjsConfigMap | quote }}, + "MJSDefDir" : {{ $mjsDefDir | quote }}, + "Namespace": {{ .Release.Namespace | quote }}, + "NetworkLicenseManager": {{ .Values.networkLicenseManager | quote }}, + "OverrideWrapperPhoenix": {{ not .Values.useDefaultWrapperPhoenix }}, + "Period": {{ .Values.autoScalingPeriod }}, + "PortsPerWorker": {{ 
.Values.portsPerWorker | default 2 }}, + "PoolProxyBasePort": {{ .Values.poolProxyBasePort }}, + "PoolProxyCPULimit": {{ .Values.poolProxyCPULimit | quote }}, + "PoolProxyCPURequest": {{ .Values.poolProxyCPURequest | quote }}, + "PoolProxyImage": {{ printf "%s:%s" .Values.poolProxyImage $poolProxyImageTag | quote }}, + "PoolProxyImagePullPolicy": {{ .Values.poolProxyImagePullPolicy | quote }}, + "PoolProxyMemoryLimit": {{ .Values.poolProxyMemoryLimit | quote }}, + "PoolProxyMemoryRequest": {{ .Values.poolProxyMemoryRequest | quote }}, + "ResizePath": {{ printf "%s/toolbox/parallel/bin/resize" $matlabRoot | quote }}, + "RequireClientCertificate": {{ .Values.requireClientCertificate }}, + "RequireScriptVerification": {{ .Values.requireScriptVerification }}, + "SecretDir": {{ $secretDir | quote }}, + "SecretFileName": {{ $secretFileName | quote }}, + "SecurityLevel": {{ .Values.securityLevel }}, + "StartupProbeFailureThreshold": {{ .Values.startupProbeFailureThreshold | default 60 }}, + "StartupProbeInitialDelay": {{ .Values.startupProbeInitialDelay | default 5 }}, + "StartupProbePeriod": {{ .Values.startupProbePeriod | default 1 }}, + "StopWorkerGracePeriod": {{ .Values.stopWorkerGracePeriod }}, + "WorkerCPULimit": {{ .Values.workerCPULimit | quote }}, + "WorkerCPURequest": {{ .Values.workerCPURequest | quote }}, + "WorkerMemoryLimit": {{ .Values.workerMemoryLimit | quote }}, + "WorkerMemoryRequest": {{ .Values.workerMemoryRequest | quote }}, + "WorkerLogPVC": {{ .Values.workerLogPVC | quote }}, + "WorkerPassword": {{ .Values.workerPassword | quote }}, + "WorkersPerPoolProxy": {{ .Values.workersPerPoolProxy }}, + "WorkerUsername": {{ .Values.workerUsername | quote }}, + "UsePoolProxy": {{ .Values.usePoolProxy | default true }}, + "UseSecureCommunication": {{ .Values.useSecureCommunication }} + } +--- + +# Create the controller deployment +{{- $controllerAccount := printf "%s-account" $controllerName }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ $controllerName }} +spec: + selector: + matchLabels: + app: {{ $controllerName }} + replicas: 1 + template: + metadata: + labels: + app: {{ $controllerName }} + spec: + # Controller requires elevated Kubernetes permissions + serviceAccountName: {{ $controllerAccount }} + + # If set to false, disable creation of environment variables for services + enableServiceLinks: {{ $enableServiceLinks }} + + # Define the controller container + containers: + - name: {{ $controllerName }} + image: {{ printf "%s:%s" .Values.controllerImage $controllerImageTag }} + imagePullPolicy: {{ .Values.controllerImagePullPolicy }} + + # The controller process requires the path to a config file as an input argument + # This file is mounted from a ConfigMap (defined in mjs.yaml) + {{- $configMapDir := "/config/" }} + args: + - {{ printf "-config=%s/%s" $configMapDir $configFileName }} + + # Mount the config file from the ConfigMap + volumeMounts: + - name: {{ $configVolName }} + mountPath: {{ $configMapDir }} + + # Store controller logs in the same directory as the job manager logs + {{- if .Values.logPVC }} + - name: {{ $logVolName }} + mountPath: {{ $logBase }} + {{- end }} + + volumes: + - name: {{ $configVolName }} + configMap: + name: {{ $controllerConfigMap }} + {{- if .Values.logPVC }} + - name: {{ $logVolName }} + persistentVolumeClaim: + claimName: {{ .Values.logPVC }} + {{- end }} +--- + +# Create a service account for the controller +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ $controllerAccount }} +--- + +# Create a role with 
permissions to interact with the Kubernetes cluster +{{- $controllerRole := printf "%s-role" $controllerName }} +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: {{ $controllerRole }} +rules: +- apiGroups: [""] + resources: ["pods", "services", "secrets"] + verbs: ["create", "get", "list", "delete", "update"] +- apiGroups: [""] + resources: ["pods/exec"] + verbs: ["create"] +- apiGroups: ["apps"] + resources: ["deployments"] + verbs: ["create", "get", "list", "delete", "update"] +--- + +# Bind the role to the service account +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: {{ printf "%s-rolebinding" $controllerName }} +subjects: +- kind: ServiceAccount + name: {{ $controllerAccount }} +roleRef: + kind: Role + name: {{ $controllerRole }} + apiGroup: rbac.authorization.k8s.io diff --git a/mjs/templates/no_upgrade.yaml b/mjs/templates/no_upgrade.yaml new file mode 100644 index 0000000..8f9f8b8 --- /dev/null +++ b/mjs/templates/no_upgrade.yaml @@ -0,0 +1,3 @@ +{{- if .Release.IsUpgrade }} + {{- fail (printf "Helm upgrade is not supported for this chart. To change the configuration or chart version of MATLAB Job Scheduler in Kubernetes, run 'helm uninstall %s --namespace %s' to remove the existing Helm release, then install a new Helm release with the values and chart version of your choice." .Release.Name .Release.Namespace) }} +{{- end }} diff --git a/mjs/templates/phoenix.yaml b/mjs/templates/phoenix.yaml new file mode 100644 index 0000000..c0c157b --- /dev/null +++ b/mjs/templates/phoenix.yaml @@ -0,0 +1,387 @@ +{{- if (not .Values.useDefaultWrapperPhoenix) }} +# Define a custom wrapper-phoenix file with configurable heap memory +# Copyright 2024 The MathWorks, Inc. +apiVersion: v1 +kind: ConfigMap +metadata: + name: mjs-phoenix-config +data: + wrapper-phoenix.config: | + #******************************************************************** + # + # Wrapper config file for starting the phoenix RMI daemon + # + # Service Name : phoenixd + # + # Prerequisites for this wrapper config to work are that the following + # environment variables have been set or will be set in the environment + # specific include files listed below : + # + # JRECMD The location of the java executable to call + # + # JREBASE The location of the jre that will be used + # MATBASE The MATLABROOT directory + # MDCEBASE The DISTCOMP toolbox directory + # LOGBASE The directory to log the service stdout + # CHECKPOINTBASE The directory to CHECKPOINT the service + # + # ARCH The architecture we are running on - defines the location of wrapper library + # HOSTNAME The name of the host this service is running on + # + # WORKER_START_TIMEOUT + # + # JOB_MANAGER_HOST + # + # BASE_PORT + # + # The following are used when phoenix creates the service descriptor for + # a job manager. + # DEFAULT_JOB_MANAGER_NAME + # MDCEQE_JOBMANAGER_DEBUG_PORT + # + # The following are used when phoenix creates the service descriptor for + # a worker. + # MATLAB_EXECUTABLE + # DEFAULT_WORKER_NAME + # + # The following are used for security purposes. 
+ # WORKER_DOMAIN (Windows only) + # SECURITY_LEVEL + # SECURITY_DIR + # USE_SECURE_COMMUNICATION + # SHARED_SECRET_FILE + # DEFAULT_KEYSTORE_PATH + # KEYSTORE_PASSWORD + # ALLOW_CLIENT_PASSWORD_CACHE + # ADMIN_USER + # ALLOWED_USERS + # ALLOW_RESIZING + # MAX_CAPACITY + # USE_LDAP_SERVER_AUTHENTICATION + # LDAP_URL + # LDAP_SECURITY_PRINCIPAL_FORMAT + # LDAP_SYNCHRONIZATION_INTERVAL_SECS + # SCHEDULING_ALGORITHM + # SAVE_JOB_HISTORY + # + # The following are used for on-demand operation + # RELEASE_LICENSE_WHEN_IDLE + # + # Copyright 2004-2023 The MathWorks, Inc. + #******************************************************************** + + # Including the following file as defined by an environment variable + # provides a way for us to set environment variables in NT. The + # problem is that we wish a call to service start to pick up any + # changes to mjs_def.bat. To do this we actually write the + # requested environment variables to %MDCE_CONFIG_FILE% + # and simply set the variable MDCE_CONFIG_FILE in the call to wrapper. + # This allows all the required variables to be set by the scripts + # and picked up in this config. To write this include we use + # MATLABROOT/toolbox/parallel/bin/win32/writeconfig.bat + # Currently this facility is NOT used on unix but could be. Also + # note that on windows it is expected that this file will set + # the variable %MDCE_PLATFORM_WRAPPER_CONF% which will be used + # in the next line to source platform specific behaviour + #include %MDCE_CONFIG_FILE% + + # Include the platform specific wrapper configuration file from the + # configuration directory. This environment variable should be set to + # something like %MDCEBASE%/config/wrapper-phoenix-$ARCH.config by the + # setup scripts or the config file + #include %MDCE_PLATFORM_WRAPPER_CONF% + + # Java Application + wrapper.java.command=%JRECMD_FOR_MDCS% + + # All parameters that might have spaces in them must be in double quotes, + # and wrapper.java.additional.X.stripquotes must then also be set to TRUE. 
+ wrapper.java.additional.1=-Dcom.mathworks.toolbox.distcomp.base_port=%BASE_PORT% + + wrapper.java.additional.2="-Dcom.mathworks.toolbox.distcomp.matlabroot=%MATBASE%" + wrapper.java.additional.2.stripquotes=TRUE + + wrapper.java.additional.3="-Dcom.mathworks.toolbox.distcomp.toolboxroot=%MDCEBASE%" + wrapper.java.additional.3.stripquotes=TRUE + + wrapper.java.additional.4="-Dcom.mathworks.toolbox.distcomp.checkpointdir=%CHECKPOINTBASE%" + wrapper.java.additional.4.stripquotes=TRUE + + wrapper.java.additional.5="-Dcom.mathworks.toolbox.distcomp.configbase=%CONFIGBASE%" + wrapper.java.additional.5.stripquotes=TRUE + + wrapper.java.additional.6="-Dcom.mathworks.toolbox.distcomp.mdceDefFile=%MDCE_DEFFILE%" + wrapper.java.additional.6.stripquotes=TRUE + + # Logging + wrapper.java.additional.7="-Dcom.mathworks.toolbox.distcomp.logdir=%LOGBASE%" + wrapper.java.additional.7.stripquotes=TRUE + wrapper.java.additional.8=-Dcom.mathworks.toolbox.distcomp.loglevel=%LOG_LEVEL% + + # Security policy for phoenix + wrapper.java.additional.9="-Djava.security.policy=%MDCEBASE%/config/jsk-all.policy" + wrapper.java.additional.9.stripquotes=TRUE + + # Use urandom as source of entropy + wrapper.java.additional.10=-Djava.security.egd=file:/dev/urandom + + # Hostname + wrapper.java.additional.11="-Dcom.mathworks.toolbox.distcomp.hostname=%HOSTNAME%" + wrapper.java.additional.11.stripquotes=TRUE + + wrapper.java.additional.12="-Djava.rmi.server.hostname=%HOSTNAME%" + wrapper.java.additional.12.stripquotes=TRUE + + # Job manager + wrapper.java.additional.13="-Dcom.mathworks.toolbox.distcomp.default_jobmanager_name=%DEFAULT_JOB_MANAGER_NAME%" + wrapper.java.additional.13.stripquotes=TRUE + wrapper.java.additional.14=-Dcom.mathworks.toolbox.distcomp.job_manager_maximum_memory=%JOB_MANAGER_MAXIMUM_MEMORY% + wrapper.java.additional.15="-Dcom.mathworks.toolbox.distcomp.lookup_hosts=%JOB_MANAGER_HOST%" + wrapper.java.additional.15.stripquotes=TRUE + wrapper.java.additional.16=-Dcom.mathworks.toolbox.distcomp.debug_jobmanager_port=%MDCEQE_JOBMANAGER_DEBUG_PORT% + wrapper.java.additional.17=-Dcom.mathworks.toolbox.distcomp.jobmanager_gcInterval=10000 + + # Workers + wrapper.java.additional.18="-Dcom.mathworks.toolbox.distcomp.matlabexecutable=%MATLAB_EXECUTABLE%" + wrapper.java.additional.18.stripquotes=TRUE + wrapper.java.additional.19=-Dcom.mathworks.toolbox.distcomp.workerstarttimeout=%WORKER_START_TIMEOUT% + wrapper.java.additional.20="-Dcom.mathworks.toolbox.distcomp.default_worker_name=%DEFAULT_WORKER_NAME%" + wrapper.java.additional.20.stripquotes=TRUE + wrapper.java.additional.21=-Dcom.mathworks.toolbox.distcomp.worker_maximum_memory=%WORKER_MAXIMUM_MEMORY% + + # Jini/RMI settings used by the services + wrapper.java.additional.22=-Dcom.mathworks.toolbox.distcomp.membergroups=default_group + wrapper.java.additional.23=-Dcom.mathworks.toolbox.distcomp.RMI_readTimeout=300000 + wrapper.java.additional.24=-Dcom.mathworks.toolbox.distcomp.DNS_lookupInterval=300 + wrapper.java.additional.25=-Dcom.mathworks.toolbox.distcomp.RMI_connectionTimeout=10000 + + # This is the java.library.path used by the services + wrapper.java.additional.26="-Dcom.mathworks.toolbox.distcomp.library_path=%MATBASE%/bin/%ARCH%" + wrapper.java.additional.26.stripquotes=TRUE + + # The JRE flags passed to services + wrapper.java.additional.27="-Dcom.mathworks.toolbox.distcomp.jreflags=%JREFLAGS%" + wrapper.java.additional.27.stripquotes=TRUE + + # Security Level settings + 
wrapper.java.additional.28=-Dcom.mathworks.toolbox.distcomp.securityLevel=%SECURITY_LEVEL% + wrapper.java.additional.29="-Dcom.mathworks.toolbox.distcomp.securityDir=%SECURITY_DIR%" + wrapper.java.additional.29.stripquotes=TRUE + wrapper.java.additional.30=-Dcom.mathworks.toolbox.distcomp.rmi.useSecureCommunication=%USE_SECURE_COMMUNICATION% + wrapper.java.additional.31=-Dcom.mathworks.toolbox.distcomp.rmi.requireClientCertificate=%REQUIRE_CLIENT_CERTIFICATE% + wrapper.java.additional.32="-Dcom.mathworks.toolbox.distcomp.mjs.security.keystorePath=%SHARED_SECRET_FILE%" + wrapper.java.additional.32.stripquotes=TRUE + wrapper.java.additional.33="-Dcom.mathworks.toolbox.distcomp.mjs.security.defaultKeystorePath=%DEFAULT_KEYSTORE_PATH%" + wrapper.java.additional.33.stripquotes=TRUE + wrapper.java.additional.34=-Dcom.mathworks.toolbox.distcomp.mjs.auth.allowClientPasswordCache=%ALLOW_CLIENT_PASSWORD_CACHE% + wrapper.java.additional.35=-Dcom.mathworks.toolbox.distcomp.mjs.auth.adminUser=%ADMIN_USER% + wrapper.java.additional.36="-Dcom.mathworks.toolbox.distcomp.mjs.auth.allowedUsers=%ALLOWED_USERS%" + wrapper.java.additional.36.stripquotes=TRUE + wrapper.java.additional.37=-Dcom.mathworks.toolbox.distcomp.worker.windowsDomain=%WORKER_DOMAIN% + + # Configure the jobmanager ports + wrapper.java.additional.38=-Dcom.mathworks.toolbox.distcomp.allServerSocketsInCluster=%MDCS_ALL_SERVER_SOCKETS_IN_CLUSTER% + + # Configure the lifecycle reporter and heartbeat intervals + wrapper.java.additional.39=-Dcom.mathworks.toolbox.distcomp.worker.lifecycleReporter=%MDCS_LIFECYCLE_REPORTER% + wrapper.java.additional.40=-Dcom.mathworks.toolbox.distcomp.worker.workerLifecycleHeartBeat=%MDCS_LIFECYCLE_WORKER_HEARTBEAT% + wrapper.java.additional.41=-Dcom.mathworks.toolbox.distcomp.worker.taskLifecycleHeartBeat=%MDCS_LIFECYCLE_TASK_HEARTBEAT% + + # Additional jar files to add to classpath + wrapper.java.additional.42="-Dcom.mathworks.toolbox.distcomp.additionalClasspath=%MDCS_ADDITIONAL_CLASSPATH%" + wrapper.java.additional.42.stripquotes=TRUE + + # Peer Lookup Service configuration + wrapper.java.additional.43=-Dcom.mathworks.toolbox.distcomp.mjs.peerlookupservice.enabled=%MDCS_PEER_LOOKUP_SERVICE_ENABLED% + + # On demand flags + wrapper.java.additional.44=-Dcom.mathworks.toolbox.distcomp.worker.onDemand=%RELEASE_LICENSE_WHEN_IDLE% + + # Web licensing + wrapper.java.additional.45=-Dcom.mathworks.toolbox.distcomp.requireWebLicensing=%MDCS_REQUIRE_WEB_LICENSING% + + wrapper.java.additional.46="-Dcom.mathworks.toolbox.distcomp.jrecmd=%JRECMD_FOR_MDCS%" + wrapper.java.additional.46.stripquotes=TRUE + + # Limit the GC threads + wrapper.java.additional.47=-XX:ParallelGCThreads=6 + + # Send notifications of the state of the job manager queue + wrapper.java.additional.48=-Dcom.mathworks.toolbox.distcomp.sendActivityNotifications=%MDCS_SEND_ACTIVITY_NOTIFICATIONS% + + # Used to define the root directory containing the scripts to call to notify listeners of the job manager's state + wrapper.java.additional.49="-Dcom.mathworks.toolbox.distcomp.scriptRoot=%MDCS_SCRIPT_ROOT%" + wrapper.java.additional.49.stripquotes=TRUE + + # Defines whether workers are used to proxy interactive parallel pool connections + wrapper.java.additional.50=-Dcom.mathworks.toolbox.distcomp.workerProxiesPoolConnections=%MDCS_OPTIMIZED_POOL_BROADCAST% + + # Enables Peer RMI for all job manager communications when duplex peer rmi is enabled + wrapper.java.additional.51=-Dcom.mathworks.toolbox.distcomp.duplexPeerRmiEnabled=%MDCS_DUPLEX_PEER_RMI% + + # Sets 
the frequency of keep alive messages sent over peer sessions. + wrapper.java.additional.52=-Dcom.mathworks.toolbox.distcomp.pmode.keepAlivePeriod=%MDCS_PEERSESSION_KEEP_ALIVE_PERIOD% + wrapper.java.additional.53=-Dcom.mathworks.toolbox.distcomp.pmode.keepAliveTimeUnit=%MDCS_PEERSESSION_KEEP_ALIVE_TIME_UNIT% + + # Enables MATLAB Drive Path Translation on workers + wrapper.java.additional.54=-Dcom.mathworks.toolbox.distcomp.matlabDriveEnabledOnWorker=%MDCS_MATLAB_DRIVE_ENABLED_ON_WORKER% + wrapper.java.additional.55=-Dcom.mathworks.toolbox.distcomp.matlabDriveFolderLocationCfg=%MW_MATLAB_DRIVE_FOLDER_LOCATION_CFG% + + # Increase the size of the young generation to prevent triggering GC during VM initialization + wrapper.java.additional.56=-XX:NewSize=2304k + + # Use reliable connections + wrapper.java.additional.57=-Dcom.mathworks.toolbox.distcomp.useReliableConnections=%MDCS_USE_RELIABLE_CONNECTIONS% + + # SPF specifically exports services with IPv4. To prevent exporting two services on the same port (one IPv4 the other IPv6) we + # need to tell Java to prefer IPv4 so that our RMI services also export on IPv4. + + + # Use multicast for discovery + wrapper.java.additional.58=-Dcom.mathworks.toolbox.distcomp.mjs.enableMulticast=%MDCS_USE_MULTICAST% + + # Override for the default MATLAB release string used by the MJS services + wrapper.java.additional.59=-Dcom.mathworks.toolbox.distcomp.releaseOverride=%RELEASE_OVERRIDE% + + # Verify mjs commands before executing them + wrapper.java.additional.60=-Dcom.mathworks.toolbox.distcomp.requireScriptVerification=%REQUIRE_SCRIPT_VERIFICATION% + + # Resizing + wrapper.java.additional.61=-Dcom.mathworks.toolbox.distcomp.allowResizingDefault=%ALLOW_RESIZING% + wrapper.java.additional.62=-Dcom.mathworks.toolbox.distcomp.mjs.max_capacity_default=%MAX_CAPACITY% + + # LDAP server authentication + wrapper.java.additional.63=-Dcom.mathworks.toolbox.distcomp.mjs.useLDAPServerAuthenticationDefault=%USE_LDAP_SERVER_AUTHENTICATION% + wrapper.java.additional.64="-Dcom.mathworks.toolbox.distcomp.mjs.ldapURLDefault=%LDAP_URL%" + wrapper.java.additional.64.stripquotes=TRUE + wrapper.java.additional.65="-Dcom.mathworks.toolbox.distcomp.mjs.ldapSecurityPrincipalFormatDefault=%LDAP_SECURITY_PRINCIPAL_FORMAT%" + wrapper.java.additional.65.stripquotes=TRUE + wrapper.java.additional.66=-Dcom.mathworks.toolbox.distcomp.mjs.ldapSynchronizationIntervalSecsDefault=%LDAP_SYNCHRONIZATION_INTERVAL_SECS% + + # Service type to help identify this process + wrapper.java.additional.67=-Dcom.mathworks.toolbox.distcomp.mjs.service.serviceType=phoenix + + # Scheduling algorithm + wrapper.java.additional.68=-Dcom.mathworks.toolbox.distcomp.mjs.schedulingAlgorithmDefault=%SCHEDULING_ALGORITHM% + + wrapper.java.additional.69=-Dcom.mathworks.toolbox.distcomp.mjs.saveJobHistoryDefault=%SAVE_JOB_HISTORY% + + # Uncomment the following to enable debugging for the phoenix CommandListener service + #wrapper.java.additional.70=-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=4455 + + # If you add more properties to the list above, you must also update + # the numbers on the properties used in wrapper-phoenix-ARCH.config. + + + # Java Main class. This class must implement the WrapperListener interface + # or guarantee that the WrapperManager class is initialized. Helper + # classes are provided to do this for you. See the Integration section + # of the documentation for details. + wrapper.java.mainclass=org.tanukisoftware.wrapper.WrapperSimpleApp + + # Application parameters. 
Add parameters as needed starting from 1 + wrapper.app.parameter.1=com.mathworks.toolbox.distcomp.control.PhoenixStarter + wrapper.app.parameter.2="%MDCEBASE%/config/start-phoenix.config" + wrapper.app.parameter.2.stripquotes=TRUE + + # Java Classpath (include wrapper.jar) Add class path elements as + # needed starting from 1. + # Remember that phoenix creates the service descriptors for the worker and + # the job manager, which is the reason for a few of these libraries. + wrapper.java.classpath.1=%JAREXTBASE%/wrapper.jar + wrapper.java.classpath.2=%JINILIB%/start.jar + wrapper.java.classpath.3=%JINILIB%/destroy.jar + wrapper.java.classpath.4=%JINILIB%/phoenix.jar + wrapper.java.classpath.5=%JARBASE%/parallel/util.jar + wrapper.java.classpath.6=%JARBASE%/parallel/pctutil.jar + wrapper.java.classpath.7=%JARBASE%/distcomp.jar + wrapper.java.classpath.8=%JINILIB%/reggie.jar + wrapper.java.classpath.9=%JINILIB%/jini-ext.jar + wrapper.java.classpath.10=%JINILIB%/group.jar + wrapper.java.classpath.11=%JINILIB%/phoenix-init.jar + wrapper.java.classpath.12=%MATBASE%/java/jar/util.jar + wrapper.java.classpath.13=%JARBASE%/parallel/admincenter.jar + wrapper.java.classpath.14=%JAREXTBASEUTIL%/commons-lang.jar + wrapper.java.classpath.15=%JAREXTBASEUTIL%/commons-io.jar + wrapper.java.classpath.16=%JAREXTBASEUTIL%/commons-cli.jar + wrapper.java.classpath.17=%MATBASE%/java/jar/resource_core.jar + wrapper.java.classpath.18=%MATBASE%/java/jar/foundation_libraries.jar + wrapper.java.classpath.19=%MATBASE%/java/jar/resources/parallel_res.jar + wrapper.java.classpath.20=%JAREXTBASEUTIL%/webservices/ws_client_core/mw-service-client-core.jar + wrapper.java.classpath.21=%JAREXTBASEUTIL%/webservices/gds_jobs_client/gds-jobs-client.jar + wrapper.java.classpath.22=%MATBASE%/java/jar/instutil.jar + wrapper.java.classpath.23=%MATBASE%/java/jar/mlwebservices.jar + wrapper.java.classpath.24=%MATBASE%/java/jar/webproxy.jar + wrapper.java.classpath.25=%MATBASE%/java/jar/net.jar + wrapper.java.classpath.26=%MATBASE%/java/jarext/gson.jar + wrapper.java.classpath.27=%MATBASE%/java/jar/jmi.jar + wrapper.java.classpath.28=%MATBASE%/java/jar/mvm.jar + wrapper.java.classpath.29=%MATBASE%/java/jar/services.jar + wrapper.java.classpath.30=%JAREXTBASEUTIL%/jdom2.jar + wrapper.java.classpath.31=%JAREXTBASEUTIL%/jackson/jackson-annotations.jar + wrapper.java.classpath.32=%JAREXTBASEUTIL%/jackson/jackson-core.jar + wrapper.java.classpath.33=%JAREXTBASEUTIL%/jackson/jackson-databind.jar + wrapper.java.classpath.34=%JAREXTBASEUTIL%/jackson/jackson-jaxrs-base.jar + wrapper.java.classpath.35=%JAREXTBASEUTIL%/jackson/jackson-jaxrs-json-provider.jar + wrapper.java.classpath.36=%JAREXTBASEUTIL%/jackson/jackson-module-jaxb-annotations.jar + wrapper.java.classpath.37=%JAREXTBASE%/h2.jar + wrapper.java.classpath.38=%JARBASE%/parallel/keytool.jar + wrapper.java.classpath.39=%JAREXTBASEUTIL%/commons-codec.jar + + # Java Library Path (location of Wrapper.DLL or libwrapper.so) + wrapper.java.library.path.1=%MDCEBASE%/bin/%ARCH% + wrapper.java.library.path.2=%MATBASE%/bin/%ARCH% + + # Initial Java Heap Size (in MB) + wrapper.java.initmemory=3 + + # Maximum Java Heap Size (in MB) + wrapper.java.maxmemory={{ .Values.commandListenerHeapMemory | default 1000 }} + + #******************************************************************** + # Wrapper Logging Properties + #******************************************************************** + # Format of output for the console. 
(See docs for formats) + wrapper.console.format=PM + + # Log Level for console output. (See docs for log levels) + wrapper.console.loglevel=INFO + + # Log file to use for wrapper output logging. + wrapper.logfile=%LOGBASE%/mjs-service.log + + # File to hold the pid of the wrapper process. This (in reality) + # simply proves that the wrapper process can write a file in the + # checkpoint directory, getting round filesystem issues where we + # don't have write access to this directory. + wrapper.pidfile=%CHECKPOINTBASE%/mjs_writetest.pid + + # Format of output for the log file. (See docs for formats) + wrapper.logfile.format=LPTM + + # Log Level for log file output. (See docs for log levels) + wrapper.logfile.loglevel=INFO + + # Maximum size that the log file will be allowed to grow to before + # the log is rolled. Size is specified in bytes. The default value + # of 0, disables log rolling. May abbreviate with the 'k' (kb) or + # 'm' (mb) suffix. For example: 10m = 10 megabytes. + wrapper.logfile.maxsize=2500k + + # Maximum number of rolled log files which will be allowed before old + # files are deleted. The default value of 0 implies no limit. + wrapper.logfile.maxfiles=4 + + # Log Level for sys/event log output. (See docs for log levels) + wrapper.syslog.loglevel=NONE + + # If false, the wrapper creates a background thread which enters a light weight + # loop and increments an internal "tick" counter. This is expected to make + # spurious timeouts due to high CPU loads very unlikely. + wrapper.use_system_time=false + + # Disable the ping between the wrapper and the JVM so that the wrapper will + # never try to kill and restart the JVM. It also has the effect of disabling + # the JVM monitoring of the wrapper. + wrapper.ping.timeout=0 +{{- end }} diff --git a/mjs/templates/requirements.yaml b/mjs/templates/requirements.yaml new file mode 100644 index 0000000..774bbde --- /dev/null +++ b/mjs/templates/requirements.yaml @@ -0,0 +1,19 @@ +# Copyright 2024 The MathWorks, Inc. + +{{- required "Specify a value for the maximum number of MATLAB Job Scheduler workers to start using the maxWorkers parameter." .Values.maxWorkers }} + +{{- if not .Values.matlabImage }} +{{- required "Specify either the URI of a Docker image containing a MATLAB Parallel Server installation using the matlabImage parameter or the name of a Persistent Volume Claim that contains a MATLAB Parallel Server installation using the matlabPVC parameter." .Values.matlabPVC }} +{{- end }} + +{{- if not (hasKey .Values "checkpointPVC") }} + {{- required "Specify the name of a Persistent Volume Claim to store persistent job data using the checkpointPVC parameter. Set this parameter to an empty string if you do not want to persist job data, but be aware that you may lose job and task data unexpectedly between job manager restarts." .Values.checkpointPVC }} +{{- end }} + +{{- if not (hasKey .Values "logPVC") }} + {{- required "Specify the name of a Persistent Volume Claim to store job manager logs using the logPVC parameter. Set this parameter to an empty string if you do not want to persist job manager logs between job manager restarts." .Values.logPVC }} +{{- end }} + +{{- if not (hasKey .Values "workerLogPVC") }} + {{- required "Specify the name of a Persistent Volume Claim to store worker logs using the workerLogPVC parameter. Set this parameter to an empty string if you do not want to persist worker logs between worker restarts." 
.Values.workerLogPVC }} +{{- end }} diff --git a/mjs/values.yaml b/mjs/values.yaml new file mode 100644 index 0000000..ff1323d --- /dev/null +++ b/mjs/values.yaml @@ -0,0 +1,78 @@ +# Default values for MATLAB Job Scheduler (MJS) in Kubernetes. +# Copyright 2024 The MathWorks, Inc. + +# Image containing MATLAB Parallel Server +matlabImage: "" # If this field is unset, the mathworks/matlab-deps image is used by default +matlabImagePullPolicy: "IfNotPresent" +matlabRelease: "r2024a" # Release number of the MATLAB version to use + +# Image containing the MJS in Kubernetes controller +controllerImage: "ghcr.io/mathworks-ref-arch/matlab-parallel-server-k8s/mjs-controller-image" +controllerImageTag: "" +controllerImagePullPolicy: "IfNotPresent" + +# Image containing HAproxy +haproxyImage: "haproxy" +haproxyImagePullPolicy: "IfNotPresent" + +# Image containing the pool proxy +poolProxyImage: "containers.mathworks.com/matlab-parallel-server-k8s/parallel-server-proxy-image" +poolProxyImageTag: "" +poolProxyImagePullPolicy: "IfNotPresent" + +# MJS settings +minWorkers: 0 # Minimum number of workers to resize the cluster to +logLevel: 0 # Logging verbosity level +basePort: 27350 # The base port of the MJS service +jobManagerName: "MJS_Kubernetes" # Name of the MJS job manager +securityLevel: 2 # Level of security for the cluster +useSecureCommunication: true # If true, use secure communication between services +requireClientCertificate: true # If true, require clients to have a certificate to connect to the job manager +requireScriptVerification: true # If true, require verification for privileged commands sent to the cluster +clusterHost: "" # Custom host to use in the cluster profile. If unset, the cluster profile uses the external address of the load balancer + +# Resource requests and limits +workerCPULimit: "" # CPU limit for each worker process +workerCPURequest: 2 # CPU request for each worker process +workerMemoryLimit: "8Gi" # Memory limit for each worker process +workerMemoryRequest: "8Gi" # Memory request for each worker process +jobManagerCPULimit: "" # CPU limit for the job manager +jobManagerCPURequest: 1 # CPU request for the job manager +jobManagerMemoryLimit: "" # Memory limit for the job manager +jobManagerMemoryRequest: "4Gi" # Memory request for the job manager +poolProxyCPULimit: "" # CPU limit for each parallel pool proxy process +poolProxyCPURequest: "0.5" # CPU request for each parallel pool proxy process +poolProxyMemoryLimit: "" # Memory limit for each parallel pool proxy process +poolProxyMemoryRequest: "500Mi" # Memory request for each parallel pool proxy process + +# Auto-scaling settings +idleStop: 300 # Time after which idle worker pods will be removed +autoScalingPeriod: 15 # Period with which the controller checks the cluster's size requirements +stopWorkerGracePeriod: 60 # Grace period in seconds for running stopworker + +# Network settings +autoCreateLoadBalancer: true # Flag to automatically create a Kubernetes load balancer to expose MATLAB Job Scheduler to MATLAB clients outside the cluster + +# Parallel pool proxy settings +poolProxyBasePort: 30000 # Base port for parallel pool proxies +workersPerPoolProxy: 32 # Maximum number of workers per parallel pool proxy process + +# Security settings +workerUsername: matlab # Username that MATLAB Parallel Server uses to run jobs +workerPassword: matlab # Password of the username that MATLAB Parallel Server uses to run jobs +jobManagerUserID: 0 # ID of the user to run the job manager pod as +jobManagerGroupID: 0 # Group ID 
of the user to run the job manager pod as +internalClientsOnly: false # Flag to allow only MATLAB clients running inside the Kubernetes cluster to connect to the MATLAB Job Scheduler + +# Licensing settings +networkLicenseManager: "" # Address of a network license manager with format port@host +useOnlineLicensing: false # Set to true to use Online Licensing + +# Specify the maximum number of workers that the cluster can automatically resize to in your custom values.yaml file. +# maxWorkers: 32 + +# Specify the names of the Persistent Volume Claims (PVCs) for these parameters in your custom values.yaml file. +# matlabPVC: "matlab-pvc" # Name of a PVC that contains a MATLAB Parallel Server installation +# checkpointPVC: "checkpoint-pvc" # Name of a PVC where MATLAB Parallel Server stores persistent data related to the job manager +# logPVC: "log-pvc" # Name of a PVC where MATLAB Parallel Server stores job manager logs +# workerLogPVC: "worker-log-pvc" # Name of a PVC where MATLAB Parallel Server stores worker logs
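
The chart's `mjs/templates/requirements.yaml` makes rendering fail (for example during `helm lint` or `helm install`) unless the parameters that `mjs/values.yaml` leaves commented out at the end are supplied. As a minimal sketch, assuming the chart is installed from the `mjs` directory of this repository, an override file along these lines would satisfy those checks; the file name `values-override.yaml` and the PVC names are illustrative only and must match Persistent Volume Claims that actually exist in your cluster:

```yaml
# values-override.yaml -- example only; adjust names and sizes for your cluster
maxWorkers: 32                  # Maximum number of workers the cluster can resize to
matlabPVC: "matlab-pvc"         # PVC containing a MATLAB Parallel Server installation (alternative to matlabImage)
checkpointPVC: "checkpoint-pvc" # PVC for persistent job manager data
logPVC: "log-pvc"               # PVC for job manager logs
workerLogPVC: "worker-log-pvc"  # PVC for worker logs
```

An install along the lines of `helm install mjs ./mjs -f values-override.yaml` (the release name and namespace are your choice) would then pick up these values on top of the defaults in `mjs/values.yaml`; see the repository README for the full deployment steps.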