Skip to content

Commit

Permalink
👌 IMPROVE: Move resource service to systemd
Browse files Browse the repository at this point in the history
  • Loading branch information
chrisjsewell committed Dec 9, 2020
1 parent 1cb6dd5 commit d1b9a2f
Show file tree
Hide file tree
Showing 12 changed files with 93 additions and 111 deletions.
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@

An Ansible role that installs the [slurm](https://slurm.schedmd.com/) workload manager on Ubuntu.

**NOTE!** It is important that the hostname is properly set in the machine
(both with `hostname <HOSTNAME>` and in the `/etc/hosts/` file, in the line with the IP address (e.g. in docker this line should read `172.17.0.2 <HOSTNAME>`, where `<HOSTNAME>` should be replaced with the hostname, and should match the variable `slurm_hostname` (default value: `qmobile`).
**NOTE!**
It is important that the hostname is properly set on the machine,
both with `hostname <HOSTNAME>` (which sets `/etc/hostname`) and in the `/etc/hosts` file, in the line with the IP address (e.g. in docker this line should read `172.x.x.x <HOSTNAME>`). Here `<HOSTNAME>` should be replaced with the hostname, and should match the variable `slurm_hostname` (default value: `qmobile`).

## Installation

Expand Down
8 changes: 3 additions & 5 deletions defaults/main.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
slurm_hostname: qmobile
# Size of real memory on the node in megabytes
slurm_memory: 1024
slurm_user: slurm
slurm_cluster_name: "{{ slurm_hostname }}"
slurm_partition_name: jobs
Expand All @@ -12,6 +10,6 @@ slurm_test_folder: "/tmp/slurm-tests"
slurm_hostname_service: false
slurm_set_hostname: false

# Adds a system service that re-configures the slurm compute resources on startup
# Note: This is necessary when preparing image that can start on a different hardware than it was built on
slurm_resources_service: false
# Enables the slurm-resources system service, which re-configures the slurm compute resources on startup
# This is necessary when preparing an image that can start on different hardware than it was built on
slurm_resources_service_enabled: true
18 changes: 18 additions & 0 deletions files/slurm-resources.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# systemd unit that runs the SLURM update playbook once each time it is
# started, so the SLURM configuration matches the current machine.
[Unit]
Description=Set SLURM compute resources
# Order this unit before the SLURM daemons so they start with the refreshed
# configuration. The daemons are service units (slurmctld.service and
# slurmd.service); ordering against same-named *.target units has no effect
# because no such units are part of the boot transaction.
Before=slurmctld.service slurmd.service
# Skip the unit (without marking it failed) if the playbook is not installed.
ConditionPathExists=/etc/slurm-llnl/update-playbook.yml

[Service]
# oneshot: the unit counts as started only after ExecStart has exited, which
# is what makes the Before= ordering above meaningful.
Type=oneshot
# %h expands to the home directory of the user running the service manager;
# Ansible needs writable temp directories, which are placed below it.
Environment=ANSIBLE_LOCAL_TEMP=%h/.ansible/tmp
Environment=ANSIBLE_REMOTE_TEMP=%h/.ansible/tmp
ExecStart=/usr/bin/ansible-playbook /etc/slurm-llnl/update-playbook.yml -v
# Fail the unit if the playbook has not finished within 60 seconds.
TimeoutStartSec=60

# Tag the playbook output with "slurm-resources" in the system log.
# NOTE(review): newer systemd releases deprecate "syslog" in favour of
# "journal" for these two settings - confirm against the target distros.
StandardOutput=syslog
StandardError=syslog
SyslogIdentifier=slurm-resources

[Install]
WantedBy=multi-user.target
57 changes: 11 additions & 46 deletions files/slurm-resources.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,49 +1,14 @@
#!/bin/sh
### BEGIN INIT INFO
# Provides: set-resources
# Required-Start: $remote_fs $syslog
# Required-Stop: $remote_fs $syslog
# X-Start-Before: slurmctld slurmd
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: Set SLURM compute resources
# Description: Set SLURM compute resources
### END INIT INFO

run_update ()
{
# TODO ideally we would also output stderr to the log file
# but for some reason 2>&1 makes $* in slurm-update-resources also read the log path as a variable
export ANSIBLE_LOCAL_TEMP=$HOME/.ansible/tmp
export ANSIBLE_REMOTE_TEMP=$HOME/.ansible/tmp
/usr/bin/slurm-update-resources -v
}
# Print a short usage summary and exit successfully when the first argument
# is -h or --help.
if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
cat <<'EOF'
This script calls ansible-playbook to update the slurm configuration
You can add options such as:

- Restart SLURM (for changes to take effect): -e restart_slurm=true
- Restart SLURM only on changes: -e restart_on_change=true
- Change the maximum cpus allowed: -e slurm_max_cpus=2
- Change the verbosity with: -vv
EOF
exit 0
fi

case "$1" in
start)
run_update
;;
stop)
echo "stopped"
;;
restart)
run_update
;;
try-restart|condrestart)
run_update
;;
reload)
run_update
;;
force-reload)
run_update
;;
status)
echo "no status to report"
;;
*)
COMMANDS="start|stop|restart|try-restart|reload|force-reload|status"
echo "Usage: `basename \"$0\"` ($COMMANDS)" >&2
exit 2 # invalid or excess argument(s)
;;
esac
# Forward every command-line argument unchanged to ansible-playbook.
# "$@" must be quoted: an unquoted $@ is re-split on whitespace (and glob
# expanded), which breaks values such as -e 'key=value with spaces'.
/usr/bin/ansible-playbook /etc/slurm-llnl/update-playbook.yml "$@"
14 changes: 0 additions & 14 deletions files/update-slurm-resources.sh

This file was deleted.

10 changes: 0 additions & 10 deletions molecule/default/converge.yml
Original file line number Diff line number Diff line change
@@ -1,19 +1,9 @@
- hosts: all

pre_tasks:

- name: Update apt cache.
become: true
apt:
update_cache: yes
cache_valid_time: 600
when: ansible_os_family == 'Debian'

vars:
- run_tests: true
- cloud_platform: docker
- slurm_hostname_service: true
- slurm_resources_service: true

roles:
- role: marvel-nccr.slurm
19 changes: 16 additions & 3 deletions molecule/default/molecule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ driver:
platforms:
- name: instance
image: "marvelnccr/docker-${MOLECULE_DISTRO:-ubuntu1804}-ansible:latest"
command: "${MOLECULE_DOCKER_COMMAND:-sleep infinity}"
# by default the container will initialise with systemd as PID1
command: ${MOLECULE_DOCKER_COMMAND:-""}
volumes:
- /sys/fs/cgroup:/sys/fs/cgroup:ro
privileged: true
Expand All @@ -16,7 +17,19 @@ platforms:
LANG: "en_US.UTF-8"
provisioner:
name: ansible
# log: true # for debugging
playbooks:
converge: converge.yml
# Remove Ansible no_log settings for debugging
# log: true
verify: verify.yml
config_options:
defaults:
# nicer stdout printing
stdout_callback: yaml
bin_ansible_callbacks: true
# add timing to tasks
callback_whitelist: profile_tasks
inventory:
hosts:
all:
vars:
ansible_python_interpreter: /usr/bin/python3
13 changes: 13 additions & 0 deletions molecule/default/verify.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
# Molecule verify play: checks that the slurm-resources unit can be started
# and that the slurm-resources command runs with override options.
- hosts: all

  tasks:
    # normally only started on init
    - name: check starting slurm-resources
      become: true
      systemd:
        name: slurm-resources.service
        state: started

    - name: check running slurm-resources script
      command: slurm-resources -e restart_slurm=true -e slurm_max_cpus=2
      # this is a check only, so never report the task as "changed"
      changed_when: false
5 changes: 3 additions & 2 deletions tasks/hostname.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
when: not (cloud_platform is defined and cloud_platform == 'docker')

# Yaml requires escaping backslashes in double quotes but not in single quotes
# This works for docker, both locally and on travis
# This usually works for docker, but it is not ideal
# It's better to set the hostname when starting the docker container (available as an option)
- name: Set hostname in /etc/hosts
lineinfile:
path: /etc/hosts
regexp: "^(172\\.17\\.[0-9]+\\.[0-9]+)\\s+.*$"
regexp: "^(172\\.[0-9]+\\.[0-9]+\\.[0-9]+)\\s+.*$"
line: "\\1 {{ slurm_hostname }}"
backrefs: true
unsafe_writes: true
Expand Down
18 changes: 2 additions & 16 deletions tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,15 @@
- name: Install apt packages
become: true
apt:
update_cache: true
cache_valid_time: 86400
name:
- slurm-wlm
- slurm-wlm-basic-plugins
- slurm-wlm-basic-plugins-dev
- munge
- sendmail
- ansible
state: present
update_cache: true

- name: hide slurm user (created by slurm package) on login screens
become: true
Expand Down Expand Up @@ -58,21 +58,7 @@
dest: /etc/slurm-llnl/slurm.conf
register: conf_template

- name: create confinguration update playbook
become: true
template:
src: config-playbook.yml.j2
dest: /etc/slurm-llnl/update-playbook.yml

- name: copy confinguration update executable
become: true
copy:
src: update-slurm-resources.sh
dest: /usr/bin/slurm-update-resources
mode: u=rwx,g=rx,o=rx

- include_tasks: resources_service.yml
when: slurm_resources_service

# munge key already created by apt-get install
# will not overwrite key due to 'creates' flag
Expand Down
35 changes: 23 additions & 12 deletions tasks/resources_service.yml
Original file line number Diff line number Diff line change
@@ -1,21 +1,32 @@
# Adds a system service that re-configures the slurm compute resources on startup
# Note: This is necessary when preparing an image that can start on different hardware than it was built on

- name: copy init.d template
- name: create configuration update playbook
become: true
template:
src: config-playbook.yml.j2
dest: /etc/slurm-llnl/update-playbook.yml

- name: copy confinguration update executable
become: true
copy:
src: "slurm-resources.sh"
dest: "/etc/init.d/slurm-resources"
mode: 0755
src: slurm-resources.sh
dest: /usr/bin/slurm-resources
mode: u=rwx,g=rx,o=rx

- name: enable service to set the slurm compute resources
- name: Add slurm-resources as system service
become: true
command: update-rc.d slurm-resources defaults
args:
creates: "/etc/rc5.d/S??slurm-resources"
copy:
src: slurm-resources.service
dest: /etc/systemd/system
owner: "{{ root_user | default('root') }}"
group: "{{ root_user | default('root') }}"
mode: 0644
when: ansible_service_mgr == "systemd"

- name: start service to set the slurm compute resources
- name: Setup slurm-resources system service (systemd)
become: true
service:
name: slurm-resources
state: started
systemd:
name: slurm-resources.service
enabled: "{{ slurm_resources_service_enabled }}"
when: ansible_service_mgr == "systemd"
2 changes: 1 addition & 1 deletion tasks/tests.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
- name: run configuration update script
become: true
command: /usr/bin/slurm-update-resources -e restart_on_change=true
command: /usr/bin/slurm-resources -e restart_on_change=true
changed_when: false

- name: create test directory
Expand Down

0 comments on commit d1b9a2f

Please sign in to comment.