diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 212fba6..a18826b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,7 +27,7 @@ jobs: strategy: matrix: - distro: [ubuntu1604, ubuntu1804] + distro: [ubuntu1604, ubuntu1804, ubuntu2004] fail-fast: false runs-on: ubuntu-latest diff --git a/README.md b/README.md index f826ebf..5c7fa37 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,49 @@ # Ansible Role: marvel-nccr.slurm -An Ansible role that installs the [slurm](https://slurm.schedmd.com/) workload manager on Ubuntu. +An Ansible role that installs the [slurm](https://slurm.schedmd.com/) workload manager on Ubuntu (tested on 16.04, 18.04 and 20.04). + +The role: + +- Installs the slurm packages +- Sets the hostname to be that defined in `slurm_hostname` +- If `slurm_hostname_service: true` adds a service to set the hostname on VM start-up (required for cloud platforms) +- Sets up the slurm configuration (`/etc/slurm-llnl/slurm.conf`) to dynamically use the correct platform resources (#CPUs, etc), configuring one node and one partition. +- Adds a `slurm-resources` script and start-up service to automate the initiation of correct platform resources (required if creating a VM instance with different resources to the build VM image) +- Starts the slurm services. + +To check the services are running (assuming systemd in use): + +```console +$ systemctl --type=service +... +slurmctld.service loaded active running Slurm controller daemon +slurmd.service loaded active running Slurm node daemon +... +``` + +To check the slurm node/partition: + +```console +$ scontrol show node +$ scontrol show partition +``` + +This should match the resources given in `lscpu`. 
+ +To enable the `slurm-resources` start-up service (use `disable` instead of `enable` to turn it off): + +```console +$ systemctl enable slurm-resources +``` + +To alter the resources configuration of slurm directly, you can use e.g.: + +```console +$ slurm-resources -e restart_on_change=true -e slurm_max_cpus=2 +``` + +This will update the resources defined for the node, set the maximum CPUs for the partition to 2 (independent of the CPUs available on the node), and restart the slurm services with the updated configuration (if the configuration has changed). **NOTE!** It is important that the hostname is properly set in the machine diff --git a/molecule/default/verify.yml b/molecule/default/verify.yml index 5dc51f4..71b79e1 100644 --- a/molecule/default/verify.yml +++ b/molecule/default/verify.yml @@ -1,6 +1,14 @@ - hosts: all tasks: + - name: Get slurm version + command: sinfo -V + changed_when: false + register: slurm_version + + - debug: + var: slurm_version.stdout + # normally only started on init - name: check starting slurm-resources become: true diff --git a/tasks/main.yml b/tasks/main.yml index ca541cc..a80df03 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -6,6 +6,8 @@ update_cache: true - name: add ansible repository + # ansible directly available in ubuntu 20.04: https://github.com/ansible/ansible/pull/69161 + when: ansible_facts['distribution_major_version'] | int < 20 become: true apt_repository: repo: 'ppa:ansible/ansible' @@ -56,6 +58,10 @@ src: slurm.conf owner: "{{ slurm_user }}" dest: /etc/slurm-llnl/slurm.conf + vars: + # must match those defined in the .service files + # changed from slurm version 17 -> 19 + slurm_pid_dir: "{{ '/var/run/slurm-llnl' if ansible_facts['distribution_major_version'] | int < 20 else '/run' }}" register: conf_template - include_tasks: resources_service.yml diff --git a/templates/slurm.conf b/templates/slurm.conf index 0910cc4..f87e0cf 100644 --- a/templates/slurm.conf +++ b/templates/slurm.conf @@ -10,9 +10,9 @@ MpiDefault=none #MpiParams=ports=#-#
ProctrackType=proctrack/pgid ReturnToService=1 -SlurmctldPidFile=/var/run/slurm-llnl/slurmctld.pid +SlurmctldPidFile={{ slurm_pid_dir }}/slurmctld.pid #SlurmctldPort=6817 -SlurmdPidFile=/var/run/slurm-llnl/slurmd.pid +SlurmdPidFile={{ slurm_pid_dir }}/slurmd.pid #SlurmdPort=6818 SlurmdSpoolDir=/var/lib/slurm-llnl/slurmd SlurmUser={{ slurm_user }} diff --git a/tox.ini b/tox.ini index 870a690..b67a098 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,6 @@ # configuration to run via tox # to use a specific docker image: `MOLECULE_DISTRO=ubuntu1804 tox` -# one of: ubuntu1604, ubuntu1804, ubuntu2004, centos8, fedora31 +# one of: ubuntu1604, ubuntu1804, ubuntu2004 [tox] envlist = molecule