Merge pull request #22 from jamesbeedy/remove_slurm_ops_manager

Remove slurm ops manager
charmed-hpc · Feb 17, 2024 · 81d201d · 81d201d
2 parents 6e6b55f + 49cedc3
commit 81d201d
Show file tree

Hide file tree

Showing 16 changed files with 2,698 additions and 211 deletions.
diff --git a/actions.yaml b/actions.yaml
diff --git a/charmcraft.yaml b/charmcraft.yaml
@@ -1,5 +1,52 @@
 # Copyright 2020 Omnivector Solutions, LLC
 # See LICENSE file for licensing details.
+name: slurmctld
+summary: |
+  Slurmctld, the central management daemon of Slurm.
+description: |
+  This charm provides slurmctld, munged, and the bindings to other utilities
+  that make lifecycle operations a breeze.
+
+  slurmctld is the central management daemon of SLURM. It monitors all other
+  SLURM daemons and resources, accepts work (jobs), and allocates resources
+  to those jobs.  Given the critical functionality of slurmctld, there may be
+  a backup server to assume these functions in the event that the primary
+  server fails.
+
+links:
+  contact: https://matrix.to/#/#hpc:ubuntu.com
+
+  issues: 
+  - https://github.com/charmed-hpc/slurmctld-operator/issues
+
+  source:
+  - https://github.com/charmed-hpc/slurmctld-operator
+
+peers:
+  slurmctld-peer:
+    interface: slurmctld-peer
+requires:
+  slurmd:
+    interface: slurmd
+  slurmdbd:
+    interface: slurmdbd
+  slurmrestd:
+    interface: slurmrestd
+  influxdb-api:
+    interface: influxdb-api
+  elasticsearch:
+    interface: elasticsearch
+  fluentbit:
+    interface: fluentbit
+provides:
+  prolog-epilog:
+    interface: prolog-epilog
+  grafana-source:
+    interface: grafana-source
+    scope: global
+
+assumes:
+  - juju
 
 type: charm
 bases:
@@ -29,3 +76,145 @@ parts:
       echo $VERSION > $CRAFT_PART_INSTALL/version
     stage:
       - version
+
+config:
+  options:
+    custom-slurm-repo:
+      type: string
+      default: ""
+      description: >
+        Use a custom repository for Slurm installation.
+  
+        This can be set to the Organization's local mirror/cache of packages and
+        supersedes the Omnivector repositories. Alternatively, it can be used to
+        track a `testing` Slurm version, e.g. by setting to
+        `ppa:omnivector/osd-testing`.
+
+        Note: The configuration `custom-slurm-repo` must be set *before*
+        deploying the units. Changing this value after deploying the units will
+        not reinstall Slurm.
+    cluster-name:
+      type: string
+      default: osd-cluster
+      description: >
+        Name to be recorded in database for jobs from this cluster.
+  
+        This is important if a single database is used to record information from
+        multiple Slurm-managed clusters.
+    default-partition:
+      type: string
+      default: ""
+      description: >
+        Default Slurm partition. This is only used if defined, and must match an
+        existing partition.
+    custom-config:
+      type: string
+      default: ""
+      description: >
+        User supplied Slurm configuration.
+  
+        This value supplements the charm supplied `slurm.conf` that is used for
+        Slurm Controller and Compute nodes.
+
+        Example usage:
+        $ juju config slurmcltd custom-config="FirstJobId=1234"
+    proctrack-type:
+      type: string
+      default: proctrack/cgroup
+      description: >
+        Identifies the plugin to be used for process tracking on a job step
+        basis.
+    cgroup-config:
+      type: string
+      default: |
+        CgroupAutomount=yes
+        ConstrainCores=yes
+      description: >
+        Configuration content for `cgroup.conf`.
+  
+    health-check-params:
+      default: ""
+      type: string
+      description: >
+        Extra parameters for NHC command.
+  
+        This option can be used to customize how NHC is called, e.g. to send an
+        e-mail to an admin when NHC detects an error set this value to
+        `-M [email protected]`.
+    health-check-interval:
+      default: 600
+      type: int
+      description: Interval in seconds between executions of the Health Check.
+    health-check-state:
+      default: "ANY,CYCLE"
+      type: string
+      description: Only run the Health Check on nodes in this state.
+
+    acct-gather-frequency:
+      type: string
+      default: "task=30"
+      description: >
+        Accounting and profiling sampling intervals for the acct_gather plugins.
+  
+        Note: A value of `0` disables the periodic sampling. In this case, the
+        accounting information is collected when the job terminates.
+
+        Example usage:
+        $ juju config slurmcltd acct-gather-frequency="task=30,network=30"
+    acct-gather-custom:
+      type: string
+      default: ""
+      description: >
+        User supplied `acct_gather.conf` configuration.
+  
+        This value supplements the charm supplied `acct_gather.conf` file that is
+        used for configuring the acct_gather plugins.
+
+actions:
+  show-current-config:
+    description: >
+      Display the currently used `slurm.conf`.
+  
+      Note: This file only exists in `slurmctld` charm and is automatically
+      distributed to all compute nodes by Slurm.
+
+      Example usage:
+      $ juju run-action slurmctld/leader --format=json --wait | jq .[].results.slurm.conf | xargs -I % -0 python3 -c 'print(%)'
+  drain:
+    description: >
+      Drain specified nodes.
+  
+      Example usage:
+      $ juju run-action slurmctld/leader drain nodename=node-[1,2] reason="Updating kernel"
+    params:
+      nodename:
+        type: string
+        description: The nodes to drain, using the Slurm format, e.g. `node-[1,2]`.
+      reason:
+        type: string
+        description: Reason to drain the nodes.
+    required:
+      - nodename
+      - reason
+  resume:
+    description: >
+      Resume specified nodes.
+  
+      Note: Newly added nodes will remain in the `down` state until configured,
+      with the `node-configured` action.
+
+      Example usage: $ juju run-action slurmctld/leader resume nodename=node-[1,2]
+    params:
+      nodename:
+        type: string
+        description: >
+          The nodes to resume, using the Slurm format, e.g. `node-[1,2]`.
+    required:
+      - nodename
+
+  influxdb-info:
+    description: >
+      Get InfluxDB info.
+  
+      This action returns the host, port, username, password, database, and
+      retention policy regarding to InfluxDB.
diff --git a/config.yaml b/config.yaml