Skip to content

Commit

Permalink
Version 1.4.0
Browse files Browse the repository at this point in the history
  • Loading branch information
hannahpullen committed Nov 14, 2024
1 parent 4a2cab6 commit 8e4192e
Show file tree
Hide file tree
Showing 16 changed files with 296 additions and 28 deletions.
4 changes: 2 additions & 2 deletions chart/mjs/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ apiVersion: v2
name: mjs
description: A Helm chart for MATLAB (R) Job Scheduler in Kubernetes
type: application
version: 1.3.0
appVersion: 1.3.0
version: 1.4.0
appVersion: 1.4.0
133 changes: 133 additions & 0 deletions chart/mjs/files/workergroup.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import com.mathworks.toolbox.distcomp.mjs.service.ConfigUtil;
import com.mathworks.toolbox.distcomp.control.PortConfig;
import com.mathworks.util.PlatformInfo;
import com.mathworks.toolbox.parallel.pctutil.logging.DistcompFileHandler;
import com.mathworks.toolbox.parallel.pctutil.logging.DistcompSimpleFormatter;
import com.mathworks.toolbox.parallel.pctutil.logging.DistcompLevel;

/* Copyright 2018-2024 The MathWorks, Inc. */

// IMPORTANT!!!: Please note that the system properties referenced here are actually set
// in the start-workergroup.config file, are read by the service constructor on STDIN, and are
// then set using the java.lang.System.setProperty method before the service makes use of the
// config file. This is because MATLAB does not accept -D arguments. In order to see a
// property here, you need to add it to the String[] that is passed as the 6th argument
// to the SharedActivatableServiceDescriptor in start-workergroup.config

// Configuration component for the worker group service.
// String literals of the form "${name}" are expanded from Java system properties,
// and "${/}" expands to the platform file separator.
com.mathworks.toolbox.distcomp.workergroup {
// Config directory under the toolbox root, with a trailing separator so that a
// file name can be appended directly (see the policy entry below).
private static configDir = "${com.mathworks.toolbox.distcomp.toolboxroot}${/}config${/}";

// Locations and identity of this service, all taken from system properties.
persistenceDirectory = "${com.mathworks.toolbox.distcomp.persistenceDir}";
logDirectory = "${com.mathworks.toolbox.distcomp.logdir}";
serviceName = "${com.mathworks.toolbox.distcomp.servicename}";

// Timing values; the units (seconds) are given by the entry names.
lookupServiceQueryIntervalSecs = 30;
maxWaitBeforeShutdownSecs = 10;

codebase = "";
// Full path of the jsk-all.policy file inside configDir.
policy = ConfigUtil.concat(new String[]{configDir,"jsk-all.policy"});
// Lowest service export port, derived from the configured base port.
minServiceExportPort = PortConfig.getMinDistcompServiceExportPort("${com.mathworks.toolbox.distcomp.base_port}");

// Each worker needs 2 ports, so need a large port range to accommodate
// machines running many workers. This range will allow approximately 2000
// workers.
maxNumPorts = 4000;

defaultRmiClientConnectTimeoutSecs = 5;

// Kept as a string here; it is parsed as an integer when building serviceHandler
// below, so the property must hold a numeric value.
logLevel = "${com.mathworks.toolbox.distcomp.loglevel}";

onDemand = "${com.mathworks.toolbox.distcomp.worker.onDemand}";
idleKillTimeoutInSeconds = 5L;
idleLicenseTimeoutInSeconds = 0L; // Immediately return licenses every time

// Messages logged via LOGGER.log() in the Worker JVM will end up in this file.
// The resulting pattern is <logdir>/workergroup_<serviceName>.%u.%g.log
serviceLogFilePattern = ConfigUtil.concat(new String[]{
"${com.mathworks.toolbox.distcomp.logdir}${/}",
"workergroup_",
serviceName,
".%u.%g", //unique number and sequence number
".log"});
// NOTE(review): the positional arguments true, 10 and 100000000 presumably mean
// append mode, number of rotated files and per-file size limit in bytes --
// confirm against the DistcompFileHandler constructor.
serviceHandler = new DistcompFileHandler(serviceLogFilePattern,
true,
DistcompLevel.getLevelFromValue(Integer.parseInt(logLevel)),
10,
100000000,
new DistcompSimpleFormatter());

securityLevel = "${com.mathworks.toolbox.distcomp.securityLevel}";
securityDir = "${com.mathworks.toolbox.distcomp.securityDir}";

// Boolean.parseBoolean yields true only for the string "true" (ignoring case);
// any other value, including an empty string, yields false.
useSecureCommunication = Boolean.parseBoolean("${com.mathworks.toolbox.distcomp.rmi.useSecureCommunication}");

requireWebLicensing = Boolean.parseBoolean("${com.mathworks.toolbox.distcomp.requireWebLicensing}");

requireClientCertificate = Boolean.parseBoolean("${com.mathworks.toolbox.distcomp.rmi.requireClientCertificate}");

// Fully qualified class name of the task evaluator, given as a string.
taskEvaluator = "com.mathworks.toolbox.distcomp.mjs.worker.matlab.VersionSwitchingTaskEvaluator";

matlabroot = "${com.mathworks.toolbox.distcomp.matlabroot}";

// These properties control how the Worker starts MATLAB.
matlabExecutable = "${com.mathworks.toolbox.distcomp.matlabexecutable}";

// Use -noFigureWindows on Windows and -nodisplay everywhere else
private static nodisplay = ConfigUtil.ifThenElse(
/*if*/ PlatformInfo.isWindows(),
/*then*/ "-noFigureWindows",
/*else*/ "-nodisplay"
);


// Can add other command line arguments: e.g -timing or -jdb
// In addition to these arguments, the PID of the Java worker service is automatically
// added as an argument to the initworker.m script
matlabArguments = new String[]{"-parallelserver", nodisplay, "-r", "initworker"};
// Command line arguments for worker MATLABs from 18b and earlier
matlabArgumentsDmlWorker = new String[]{"-dmlworker", nodisplay, "-r", "initworker"};
// Command line arguments for worker MATLABs launched with MVM from 19b
matlabArgumentsMvm = new String[]{"-parallelserver", nodisplay};

// Time to wait between polls on the MVM for whether it is still running
mvmPollIntervalMillis = 1000L;

// Can be used to instrument the workers for debugging, with e.g. strace or gdb.
// Will be placed in front of the MATLAB executable on the command line
workerInstrumentation = new String[]{};

// Whether to launch each worker with "mpiexec -n 1 -launcher fork"
// This is required for mpich3, but should be removed for other mpi
// implementations
shouldLaunchWithMpiexec = false;

// MATLAB stdout and stderr streams are written to this file.
// NOTE(review): no output file pattern is defined in this component; the two
// limits below presumably apply to the MATLAB output files -- confirm.
matlabOutputMaxTotalSize = 1000000000; // 1GB
matlabOutputMaxNumFiles = 10;

// Environment for the MATLAB process, laid out as alternating name, value
// elements. Two values are computed from entries defined above (maxNumPorts and
// idleLicenseTimeoutInSeconds); the rest come from system properties.
matlabEnvironment = new String[]{
"HOSTNAME", "${com.mathworks.toolbox.distcomp.hostname}",
"BASE_PORT", "${com.mathworks.toolbox.distcomp.base_port}",
"USE_SERVER_SPECIFIED_HOSTNAME", "${com.mathworks.toolbox.distcomp.rmi.useServerSpecifiedHostname}",
"MDCS_PEERSESSION_KEEP_ALIVE_PERIOD", "${com.mathworks.toolbox.distcomp.pmode.keepAlivePeriod}",
"MDCS_PEERSESSION_KEEP_ALIVE_TIME_UNIT", "${com.mathworks.toolbox.distcomp.pmode.keepAliveTimeUnit}",
"MDCS_MAX_NUM_PORTS", Integer.toString(maxNumPorts),
"MDCS_MATLAB_DRIVE_ENABLED_ON_WORKER", "${com.mathworks.toolbox.distcomp.matlabDriveEnabledOnWorker}",
"MJS_IDLE_LICENSE_TIMEOUT_SECS", Long.toString(idleLicenseTimeoutInSeconds),
"MW_MATLAB_DRIVE_FOLDER_LOCATION_CFG", "${com.mathworks.toolbox.distcomp.matlabDriveFolderLocationCfg}"};

// Other useful variables to set for debugging purposes are:
//"PCTIPC_VERBOSE", "DEBUG4",
//"PCTIPC_LOGFILE", ConfigUtil.concat("/tmp/pctipc_", serviceName, ".log")

// Set by mjs_def "WORKER_START_TIMEOUT" property
// Long.parseLong throws NumberFormatException if the property is not a valid integer.
matlabStartupTimeoutSecs = Long.parseLong("${com.mathworks.toolbox.distcomp.workerstarttimeout}");

// Maximum time to wait for a clean MATLAB shutdown before the process is hard-killed
matlabShutdownTimeoutSecs = 60;

windowsDomain = "${com.mathworks.toolbox.distcomp.worker.windowsDomain}";

// By default do not use an activatable exporter.
useActivatableExporter = false;
}

5 changes: 5 additions & 0 deletions chart/mjs/templates/_derived.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,9 @@
# If we are using a secure LDAP server and not using a persistent volume claim for the job manager pod, we need to add the LDAP certificate to the job manager's secret store
{{- define "derived.addLDAPCert" -}}
{{ and (hasPrefix "ldaps://" .Values.ldapURL) (or (empty .Values.matlabPVC) (not .Values.jobManagerUsesPVC)) }}
{{- end -}}

# Whether to override the workergroup config file
{{- define "derived.overrideWorkergroupConfig" -}}
{{ and (eq .Values.matlabRelease "r2024b") (not (empty .Values.networkLicenseManager)) }}
{{- end -}}
15 changes: 7 additions & 8 deletions chart/mjs/templates/controller-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,7 @@ data:
"DeploymentName": {{ include "resources.controller" . | quote}},
"EnableServiceLinks": {{ include "derived.enableServiceLinks" . }},
"OpenMetricsPortOutsideKubernetes": {{ .Values.openMetricsPortOutsideKubernetes }},
"ExtraWorkerEnvironment": {
{{- $comma := "" }}
{{- range $key, $value := .Values.extraWorkerEnv }}
{{ $comma }}{{ $key | quote }}: {{ $value | quote }}
{{- $comma = "," }}
{{- end }}
},
"ExtraWorkerEnvironment": {{ toJson (.Values.extraWorkerEnv | default dict) }},
"IdleStop": {{ .Values.idleStop }},
"InternalClientsOnly": {{ .Values.internalClientsOnly }},
"JobManagerImage": {{ printf "%s:%s" $jobManagerImage $jobManagerImageTag | quote }},
Expand All @@ -51,9 +45,11 @@ data:
"JobManagerMemoryLimit": {{ .Values.jobManagerMemoryLimit | quote }},
"JobManagerMemoryRequest": {{ .Values.jobManagerMemoryRequest | quote }},
"JobManagerGroupID": {{ .Values.jobManagerGroupID }},
"JobManagerNodeSelector": {{ toJson .Values.jobManagerNodeSelector }},
"JobManagerUserID": {{ .Values.jobManagerUserID }},
"JobManagerUsesPVC": {{ .Values.jobManagerUsesPVC }},
"JobManagerUID": {{ uuidv4 | quote }},
{{ if eq (include "derived.addLDAPCert" .) "true" -}}
{{- if eq (include "derived.addLDAPCert" .) "true" -}}
"LDAPCertPath": {{ include "paths.ldapCert" . | quote }},
{{- end }}
"LivenessProbeFailureThreshold": {{ .Values.livenessProbeFailureThreshold | default 3 }},
Expand All @@ -72,7 +68,9 @@ data:
"MJSDefDir" : {{ include "paths.configDir" . | quote }},
"Namespace": {{ .Release.Namespace | quote }},
"NetworkLicenseManager": {{ .Values.networkLicenseManager | quote }},
"OverrideWorkergroupConfig": {{ include "derived.overrideWorkergroupConfig" . }},
"Period": {{ .Values.autoScalingPeriod }},
"PreserveSecrets": {{ .Values.preserveSecrets | default false }},
"PortsPerWorker": {{ .Values.portsPerWorker | default 2 }},
"PoolProxyBasePort": {{ .Values.poolProxyBasePort }},
"PoolProxyCPULimit": {{ .Values.poolProxyCPULimit | quote }},
Expand All @@ -98,6 +96,7 @@ data:
"WorkerMemoryLimit": {{ .Values.workerMemoryLimit | quote }},
"WorkerMemoryRequest": {{ .Values.workerMemoryRequest | quote }},
"WorkerLogPVC": {{ .Values.workerLogPVC | quote }},
"WorkerNodeSelector": {{ toJson .Values.workerNodeSelector }},
"WorkerPassword": {{ .Values.workerPassword | quote }},
"WorkersPerPoolProxy": {{ .Values.workersPerPoolProxy }},
"WorkerUsername": {{ .Values.workerUsername | quote }},
Expand Down
3 changes: 3 additions & 0 deletions chart/mjs/templates/controller-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ spec:
# If set to false, disable creation of environment variables for services
enableServiceLinks: {{ include "derived.enableServiceLinks" . }}

# Schedule on same nodes as the job manager
nodeSelector: {{ toJson .Values.jobManagerNodeSelector }}

containers:
- name: {{ $name }}
image: {{ printf "%s:%s" .Values.controllerImage $controllerImageTag }}
Expand Down
3 changes: 3 additions & 0 deletions chart/mjs/templates/ingress-proxy-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ spec:
# If set to false, disable creation of environment variables for services
enableServiceLinks: {{ include "derived.enableServiceLinks" . }}

# Schedule on same nodes as the job manager
nodeSelector: {{ toJson .Values.jobManagerNodeSelector }}

containers:
- name: haproxy
image: {{ $.Values.haproxyImage }}
Expand Down
11 changes: 11 additions & 0 deletions chart/mjs/templates/worker-configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{{- if eq (include "derived.overrideWorkergroupConfig" .) "true" -}}
# Config files for the MJS workers.
# Copyright 2024 The MathWorks, Inc.
apiVersion: v1
kind: ConfigMap
metadata:
name: mjs-worker-config
data:
workergroup.config: |
{{- .Files.Get "files/workergroup.config" | nindent 4 }}
{{- end -}}
15 changes: 15 additions & 0 deletions chart/mjs/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,12 @@
"jobManagerName": {
"type": "string"
},
"jobManagerNodeSelector": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"jobManagerUserID": {
"type": "integer",
"minimum": 0
Expand Down Expand Up @@ -181,6 +187,9 @@
"openMetricsPortOutsideKubernetes": {
"type": "boolean"
},
"preserveSecrets": {
"type": "boolean"
},
"poolProxyBasePort": {
"type": "integer",
"minimum": 1024,
Expand Down Expand Up @@ -286,6 +295,12 @@
{ "type": "number" }
]
},
"workerNodeSelector": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"workerPassword": {
"type": "string"
},
Expand Down
4 changes: 4 additions & 0 deletions chart/mjs/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ poolProxyCPURequest: "0.5" # CPU request for each parallel pool proxy process
poolProxyMemoryLimit: "" # Memory limit for each parallel pool proxy process
poolProxyMemoryRequest: "500Mi" # Memory request for each parallel pool proxy process

# Node settings
jobManagerNodeSelector: {} # Node selector for the job manager, specified as key-value pairs
workerNodeSelector: {} # Node selector for the workers, specified as key-value pairs

# Auto-scaling settings
idleStop: 300 # Time after which idle worker pods will be removed
autoScalingPeriod: 15 # Period with which the controller checks the cluster's size requirements
Expand Down
2 changes: 2 additions & 0 deletions helm_values.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ If you do not include a parameter in your YAML file, your configuration uses the
`jobManagerMemoryLimit` | Memory limit for the job manager pod. | —
`jobManagerMemoryRequest` | Memory request for the job manager pod. | `4Gi`
`jobManagerName` | Name of the MATLAB Job Scheduler job manager. | `MJS_Kubernetes`
`jobManagerNodeSelector` | Node selector for the job manager pod, specified as key-value pairs that match the labels of the Kubernetes nodes you want to run the job manager on. For example, to run the job manager on nodes with label `node-type=jobmanager`, set this parameter to `{"node-type":"jobmanager"}`. You must assign the appropriate labels to your nodes before you can use the `nodeSelector` feature. For more information, see [Assigning Pods to Nodes](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node) on the Kubernetes website. | `{}`
`jobManagerUserID` | User ID of the user account that MATLAB Job Scheduler uses to run the job manager pod. The user must have write permission for the checkpoint and log PersistentVolumes. To find the user ID, on a Linux machine, run `id -u` in the terminal. | `0`
`jobManagerUsesPVC` | Flag to mount a MATLAB Parallel Server installation from a PersistentVolume onto the job manager pod if the `matlabPVC` parameter is set. If this flag is set to true, the job manager pod uses the image specified in the `matlabDepsImage` parameter. | `false`
`ldapSecurityPrincipalFormat` | Format of a security principal (user) for your LDAP server. | —
Expand Down Expand Up @@ -68,6 +69,7 @@ If you do not include a parameter in your YAML file, your configuration uses the
`workerLogPVC` | Name of the PersistentVolumeClaim that is bound to the PersistentVolume used to retain worker logs. | —
`workerMemoryLimit` | Memory limit for each worker pod. | `8Gi`
`workerMemoryRequest` | Memory request for each worker pod. | `8Gi`
`workerNodeSelector` | Node selector for the worker pods, specified as key-value pairs that match the labels of the Kubernetes nodes you want to run the workers on. For example, to run the workers on nodes with label `node-type=worker`, set this parameter to `{"node-type":"worker"}`. You must assign the appropriate labels to your nodes before you can use the `nodeSelector` feature. For more information, see [Assigning Pods to Nodes](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node) on the Kubernetes website. | `{}`
`workerPassword` | Password of the username that MATLAB Parallel Server uses to run jobs. | `matlab`
`workerUsername` | Username that MATLAB Parallel Server uses to run jobs. | `matlab`
`workersPerPoolProxy` | Maximum number of workers using each parallel pool proxy. | `32`
Expand Down
5 changes: 5 additions & 0 deletions images/controller/src/internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@ type Config struct {
JobManagerImagePullPolicy string
JobManagerMemoryLimit string
JobManagerMemoryRequest string
JobManagerNodeSelector map[string]string
JobManagerGroupID int64
JobManagerUserID int64
JobManagerUsesPVC bool
KubeConfig string
LDAPCertPath string
LivenessProbeFailureThreshold int32
Expand All @@ -54,6 +56,7 @@ type Config struct {
Namespace string
NetworkLicenseManager string
OpenMetricsPortOutsideKubernetes bool
OverrideWorkergroupConfig bool
Period int
PortsPerWorker int
PoolProxyBasePort int
Expand All @@ -63,6 +66,7 @@ type Config struct {
PoolProxyImagePullPolicy string
PoolProxyMemoryLimit string
PoolProxyMemoryRequest string
PreserveSecrets bool
ReadyFile string
ResizePath string
RequireClientCertificate bool
Expand All @@ -81,6 +85,7 @@ type Config struct {
WorkerLogPVC string
WorkerMemoryRequest string
WorkerMemoryLimit string
WorkerNodeSelector map[string]string
WorkerPassword string
WorkersPerPoolProxy int
WorkerUsername string
Expand Down
8 changes: 4 additions & 4 deletions images/controller/src/internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ func (c *Controller) createSharedSecret() (*certificate.SharedSecret, error) {
}

// Get spec for Kubernetes secret
secretSpec := c.specFactory.GetSecretSpec(specs.SharedSecretName)
secretSpec := c.specFactory.GetSecretSpec(specs.SharedSecretName, c.config.PreserveSecrets)
secretSpec.Data[c.config.SecretFileName] = secretBytes

// Generate a certificate if needed
Expand Down Expand Up @@ -332,7 +332,7 @@ func (c *Controller) createCertsForMetrics() error {
if err != nil {
return err
}
secretSpec := c.specFactory.GetSecretSpec(specs.MetricsSecretName)
secretSpec := c.specFactory.GetSecretSpec(specs.MetricsSecretName, c.config.PreserveSecrets)
secretSpec.Data[specs.MetricsCAFileName] = []byte(serverCert.ServerCert)
secretSpec.Data[specs.MetricsCertFileName] = []byte(serverCert.ClientCert)
secretSpec.Data[specs.MetricsKeyFileName] = []byte(serverCert.ClientKey)
Expand All @@ -347,7 +347,7 @@ func (c *Controller) createCertsForMetrics() error {
if err != nil {
return err
}
clientSecretSpec := c.specFactory.GetSecretSpec(clientMetricsCertSecret)
clientSecretSpec := c.specFactory.GetSecretSpec(clientMetricsCertSecret, c.config.PreserveSecrets)
clientSecretSpec.Data[specs.MetricsCAFileName] = []byte(clientCert.ServerCert)
clientSecretSpec.Data[clientCertFilename] = []byte(clientCert.ClientCert)
clientSecretSpec.Data[clientKeyFilename] = []byte(clientCert.ClientKey)
Expand Down Expand Up @@ -430,7 +430,7 @@ func (c *Controller) createProfile(sharedSecret *certificate.SharedSecret) error
}

// Create Kubernetes secret for profile
secret := c.specFactory.GetSecretSpec(profileSecretName)
secret := c.specFactory.GetSecretSpec(profileSecretName, c.config.PreserveSecrets)
secret.Data[profileKey] = profBytes
_, err = c.client.CreateSecret(secret)
if err != nil {
Expand Down
Loading

0 comments on commit 8e4192e

Please sign in to comment.