From d1b9a2f2547018103d32ddb7243b11f6f4ef9922 Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Wed, 9 Dec 2020 01:32:01 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=91=8C=20IMPROVE:=20Move=20resource=20ser?= =?UTF-8?q?vice=20to=20systemd?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 5 +-- defaults/main.yml | 8 ++--- files/slurm-resources.service | 18 +++++++++++ files/slurm-resources.sh | 57 +++++++-------------------------- files/update-slurm-resources.sh | 14 -------- molecule/default/converge.yml | 10 ------ molecule/default/molecule.yml | 19 +++++++++-- molecule/default/verify.yml | 13 ++++++++ tasks/hostname.yml | 5 +-- tasks/main.yml | 18 ++--------- tasks/resources_service.yml | 35 +++++++++++++------- tasks/tests.yml | 2 +- 12 files changed, 93 insertions(+), 111 deletions(-) create mode 100644 files/slurm-resources.service mode change 100644 => 100755 files/slurm-resources.sh delete mode 100755 files/update-slurm-resources.sh create mode 100644 molecule/default/verify.yml diff --git a/README.md b/README.md index d275dc2..f826ebf 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,9 @@ An Ansible role that installs the [slurm](https://slurm.schedmd.com/) workload manager on Ubuntu. -**NOTE!** It is important that the hostname is properly set in the machine -(both with `hostname <HOSTNAME>` and in the `/etc/hosts/` file, in the line with the IP address (e.g. in docker this line should read `172.17.0.2 <HOSTNAME>`, where `<HOSTNAME>` should be replaced with the hostname, and should match the variable `slurm_hostname` (default value: `qmobile`). +**NOTE!** +It is important that the hostname is properly set in the machine +(both with `hostname <HOSTNAME>` (which sets `/etc/hostname`) and in the `/etc/hosts` file, in the line with the IP address (e.g. in docker this line should read `172.x.x.x <HOSTNAME>`, where `<HOSTNAME>` should be replaced with the hostname, and should match the variable `slurm_hostname` (default value: `qmobile`).
## Installation diff --git a/defaults/main.yml b/defaults/main.yml index 04cf088..1fa487f 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -1,6 +1,4 @@ slurm_hostname: qmobile -# Size of real memory on the node in megabytes -slurm_memory: 1024 slurm_user: slurm slurm_cluster_name: "{{ slurm_hostname }}" slurm_partition_name: jobs @@ -12,6 +10,6 @@ slurm_test_folder: "/tmp/slurm-tests" slurm_hostname_service: false slurm_set_hostname: false -# Adds a system service that re-configures the slurm compute resources on startup -# Note: This is necessary when preparing image that can start on a different hardware than it was built on -slurm_resources_service: false +# Enables the slurm-resources system service that re-configures the slurm compute resources on startup +# This is necessary when preparing an image that can start on different hardware than it was built on +slurm_resources_service_enabled: true diff --git a/files/slurm-resources.service b/files/slurm-resources.service new file mode 100644 index 0000000..cf44b15 --- /dev/null +++ b/files/slurm-resources.service @@ -0,0 +1,18 @@ +[Unit] +Description=Set SLURM compute resources +Before=slurmctld.target slurmd.target +ConditionPathExists=/etc/slurm-llnl/update-playbook.yml + +[Service] +Type=oneshot +Environment=ANSIBLE_LOCAL_TEMP=%h/.ansible/tmp +Environment=ANSIBLE_REMOTE_TEMP=%h/.ansible/tmp +ExecStart=/usr/bin/ansible-playbook /etc/slurm-llnl/update-playbook.yml -v +TimeoutStartSec=60 + +StandardOutput=syslog +StandardError=syslog +SyslogIdentifier=slurm-resources + +[Install] +WantedBy=multi-user.target diff --git a/files/slurm-resources.sh b/files/slurm-resources.sh old mode 100644 new mode 100755 index 23ab51a..db22077 --- a/files/slurm-resources.sh +++ b/files/slurm-resources.sh @@ -1,49 +1,14 @@ #!/bin/sh -### BEGIN INIT INFO -# Provides: set-resources -# Required-Start: $remote_fs $syslog -# Required-Stop: $remote_fs $syslog -# X-Start-Before: slurmctld slurmd -# Default-Start: 2 3 4 5 -# 
Default-Stop: 0 1 6 -# Short-Description: Set SLURM compute resources -# Description: Set SLURM compute resources -### END INIT INFO -run_update () -{ -# TODO ideally we would also output stderr to the log file -# but for some reason 2>&1 makes $* in slurm-update-resources also read the log path as a variable -export ANSIBLE_LOCAL_TEMP=$HOME/.ansible/tmp -export ANSIBLE_REMOTE_TEMP=$HOME/.ansible/tmp -/usr/bin/slurm-update-resources -v -} +if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then + echo "This script calls ansible-playbook to update the slurm configuration" + echo "You can add options such as:" + echo "" + echo "- Restart SLURM (for changes to take affect): -e restart_slurm=true" + echo "- Restart SLURM only on changes: -e restart_on_change=true" + echo "- Change the maximum cpus allowed: -e slurm_max_cpus=2" + echo "- Change the verbosity with: -vv" + exit 0 +fi -case "$1" in - start) - run_update - ;; - stop) - echo "stopped" - ;; - restart) - run_update - ;; - try-restart|condrestart) - run_update - ;; - reload) - run_update - ;; - force-reload) - run_update - ;; - status) - echo "no status to report" - ;; - *) - COMMANDS="start|stop|restart|try-restart|reload|force-reload|status" - echo "Usage: `basename \"$0\"` ($COMMANDS)" >&2 - exit 2 # invalid or excess argument(s) - ;; -esac +/usr/bin/ansible-playbook /etc/slurm-llnl/update-playbook.yml $@ diff --git a/files/update-slurm-resources.sh b/files/update-slurm-resources.sh deleted file mode 100755 index db22077..0000000 --- a/files/update-slurm-resources.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh - -if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then - echo "This script calls ansible-playbook to update the slurm configuration" - echo "You can add options such as:" - echo "" - echo "- Restart SLURM (for changes to take affect): -e restart_slurm=true" - echo "- Restart SLURM only on changes: -e restart_on_change=true" - echo "- Change the maximum cpus allowed: -e slurm_max_cpus=2" - echo "- Change the 
verbosity with: -vv" - exit 0 -fi - -/usr/bin/ansible-playbook /etc/slurm-llnl/update-playbook.yml $@ diff --git a/molecule/default/converge.yml b/molecule/default/converge.yml index a7a04ff..4e09aa8 100644 --- a/molecule/default/converge.yml +++ b/molecule/default/converge.yml @@ -1,19 +1,9 @@ - hosts: all - pre_tasks: - - - name: Update apt cache. - become: true - apt: - update_cache: yes - cache_valid_time: 600 - when: ansible_os_family == 'Debian' - vars: - run_tests: true - cloud_platform: docker - slurm_hostname_service: true - - slurm_resources_service: true roles: - role: marvel-nccr.slurm diff --git a/molecule/default/molecule.yml b/molecule/default/molecule.yml index b187c63..723818d 100644 --- a/molecule/default/molecule.yml +++ b/molecule/default/molecule.yml @@ -6,7 +6,8 @@ driver: platforms: - name: instance image: "marvelnccr/docker-${MOLECULE_DISTRO:-ubuntu1804}-ansible:latest" - command: "${MOLECULE_DOCKER_COMMAND:-sleep infinity}" + # by default the container will initialise with systemd as PID1 + command: ${MOLECULE_DOCKER_COMMAND:-""} volumes: - /sys/fs/cgroup:/sys/fs/cgroup:ro privileged: true @@ -16,7 +17,19 @@ platforms: LANG: "en_US.UTF-8" provisioner: name: ansible + # log: true # for debugging playbooks: converge: converge.yml -# Remove Ansible no_log settings for debugging -# log: true + verify: verify.yml + config_options: + defaults: + # nicer stdout printing + stdout_callback: yaml + bin_ansible_callbacks: true + # add timing to tasks + callback_whitelist: profile_tasks + inventory: + hosts: + all: + vars: + ansible_python_interpreter: /usr/bin/python3 diff --git a/molecule/default/verify.yml b/molecule/default/verify.yml new file mode 100644 index 0000000..5dc51f4 --- /dev/null +++ b/molecule/default/verify.yml @@ -0,0 +1,13 @@ +- hosts: all + + tasks: + # normally only started on init + - name: check starting slurm-resources + become: true + systemd: + name: slurm-resources.service + state: started + + - name: check running 
slurm-resources script + command: slurm-resources -e restart_slurm=true -e slurm_max_cpus=2 + changed_when: false diff --git a/tasks/hostname.yml b/tasks/hostname.yml index 4a345ba..7efe1e1 100644 --- a/tasks/hostname.yml +++ b/tasks/hostname.yml @@ -7,11 +7,12 @@ when: not (cloud_platform is defined and cloud_platform == 'docker') # Yaml requires escaping backslashes in double quotes but not in single quotes -# This works for docker, both locally and on travis +# This usually works for docker, but it is not ideal +# It's better to set the hostname when starting the docker container (available as an option) - name: Set hostname in /etc/hosts lineinfile: path: /etc/hosts - regexp: "^(172\\.17\\.[0-9]+\\.[0-9]+)\\s+.*$" + regexp: "^(172\\.[0-9]+\\.[0-9]+\\.[0-9]+)\\s+.*$" line: "\\1 {{ slurm_hostname }}" backrefs: true unsafe_writes: true diff --git a/tasks/main.yml b/tasks/main.yml index a78ef28..ca541cc 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -14,6 +14,8 @@ - name: Install apt packages become: true apt: + update_cache: true + cache_valid_time: 86400 name: - slurm-wlm - slurm-wlm-basic-plugins @@ -21,8 +23,6 @@ - munge - sendmail - ansible - state: present - update_cache: true - name: hide slurm user (created by slurm package) on login screens become: true @@ -58,21 +58,7 @@ dest: /etc/slurm-llnl/slurm.conf register: conf_template -- name: create confinguration update playbook - become: true - template: - src: config-playbook.yml.j2 - dest: /etc/slurm-llnl/update-playbook.yml - -- name: copy confinguration update executable - become: true - copy: - src: update-slurm-resources.sh - dest: /usr/bin/slurm-update-resources - mode: u=rwx,g=rx,o=rx - - include_tasks: resources_service.yml - when: slurm_resources_service # munge key already created by apt-get install # will not overwrite key due to 'creates' flag diff --git a/tasks/resources_service.yml b/tasks/resources_service.yml index 8dba50d..b82647d 100644 --- a/tasks/resources_service.yml +++ 
b/tasks/resources_service.yml @@ -1,21 +1,32 @@ # Adds a system service that re-configures the slurm compute resources on startup # Note: This is necessary when preparing image that can start on a different hardware than it was built on -- name: copy init.d template +- name: create configuration update playbook + become: true + template: + src: config-playbook.yml.j2 + dest: /etc/slurm-llnl/update-playbook.yml + +- name: copy configuration update executable become: true copy: - src: "slurm-resources.sh" - dest: "/etc/init.d/slurm-resources" - mode: 0755 + src: slurm-resources.sh + dest: /usr/bin/slurm-resources + mode: u=rwx,g=rx,o=rx -- name: enable service to set the slurm compute resources +- name: Add slurm-resources as system service become: true - command: update-rc.d slurm-resources defaults - args: - creates: "/etc/rc5.d/S??slurm-resources" + copy: + src: slurm-resources.service + dest: /etc/systemd/system + owner: "{{ root_user | default('root') }}" + group: "{{ root_user | default('root') }}" + mode: 0644 + when: ansible_service_mgr == "systemd" -- name: start service to set the slurm compute resources +- name: Setup slurm-resources system service (systemd) become: true - service: - name: slurm-resources - state: started + systemd: + name: slurm-resources.service + enabled: "{{ slurm_resources_service_enabled }}" + when: ansible_service_mgr == "systemd" diff --git a/tasks/tests.yml b/tasks/tests.yml index 12e0fdc..3d184ae 100644 --- a/tasks/tests.yml +++ b/tasks/tests.yml @@ -1,6 +1,6 @@ - name: run configuration update script become: true - command: /usr/bin/slurm-update-resources -e restart_on_change=true + command: /usr/bin/slurm-resources -e restart_on_change=true changed_when: false - name: create test directory