From eadccd8806c1f2e31b87f971b031bba4beb268b0 Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Wed, 1 Jul 2020 15:55:15 +0100 Subject: [PATCH 1/7] Add script to update computer resources --- .gitignore | 2 ++ ansible.cfg | 0 defaults/main.yml | 1 - tasks/main.yml | 13 +++++++ templates/slurm.conf | 11 +++--- templates/update_conf.j2.py | 72 +++++++++++++++++++++++++++++++++++++ 6 files changed, 94 insertions(+), 5 deletions(-) create mode 100644 ansible.cfg create mode 100755 templates/update_conf.j2.py diff --git a/.gitignore b/.gitignore index ae98342..82973fb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ *~ *.swp +.DS_Store +.vscode/ .galaxy_install_info diff --git a/ansible.cfg b/ansible.cfg new file mode 100644 index 0000000..e69de29 diff --git a/defaults/main.yml b/defaults/main.yml index 04657f3..a48458b 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -1,6 +1,5 @@ --- slurm_hostname: qmobile -slurm_cpus: 2 # Size of real memory on the node in megabytes slurm_memory: 1024 slurm_user: slurm diff --git a/tasks/main.yml b/tasks/main.yml index e83e433..b172273 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -43,6 +43,19 @@ src: slurm.conf owner: "{{ slurm_user }}" dest: /etc/slurm-llnl/slurm.conf + register: conf_template + +- name: create confinguration update script + template: + src: update_conf.j2.py + dest: /usr/bin/update-slurm-conf + mode: "755" + register: conf_script + +- name: run configuration update script # noqa 503 + when: conf_template.changed or conf_script.changed + command: /usr/bin/update-slurm-conf + changed_when: false # munge key already created by apt-get install # will not overwrite key due to 'creates' flag diff --git a/templates/slurm.conf b/templates/slurm.conf index 7e06680..8257f90 100644 --- a/templates/slurm.conf +++ b/templates/slurm.conf @@ -37,7 +37,9 @@ SchedulerType=sched/backfill # cons_res: schedule individual cores SelectType=select/cons_res SelectTypeParameters=CR_Core -# +# this ensures submissions fail if they ask for more resources than available on the partition +EnforcePartLimits=ALL +# # # LOGGING AND ACCOUNTING AccountingStorageType=accounting_storage/none @@ -50,6 +52,7 @@ SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log SlurmdLogFile=/var/log/slurm-llnl/slurmd.log # # -# COMPUTE NODES -NodeName={{ slurm_hostname }} CPUs={{ slurm_cpus }} CoresPerSocket={{ slurm_cpus }} State=UNKNOWN -PartitionName={{ slurm_partition_name }} Nodes={{ slurm_hostname }} Default=YES MaxTime=INFINITE State=UP +# COMPUTER NODES START +NodeName={{ slurm_hostname }} Sockets={{ ansible_processor_count }} CoresPerSocket={{ ansible_processor_cores }} ThreadsPerCore={{ ansible_processor_threads_per_core }} State=UNKNOWN +PartitionName={{ slurm_partition_name }} Nodes={{ slurm_hostname }} Default=YES MaxTime=INFINITE State=UP MaxNodes=1 MaxCPUsPerNode={{ ansible_processor_vcpus }} +# COMPUTER NODES END diff --git a/templates/update_conf.j2.py b/templates/update_conf.j2.py new file mode 100755 index 0000000..45350ee --- /dev/null +++ b/templates/update_conf.j2.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python +from difflib import context_diff +import json +import re +import subprocess + + +def update_cpus(hostname, partition_name, conf_file="/etc/slurm-llnl/slurm.conf"): + # read cpu information (linux only) + data = subprocess.check_output(["lscpu", "--json"]) + # convert from string + data = json.loads(data) + # convert from list + data = {entry["field"]: entry["data"] for entry in data["lscpu"]} + # get required data + thread_per_core = data["Thread(s) per core:"] + cores_per_socket = data["Core(s) per socket:"] + sockets = data["Socket(s):"] + # cpus = sockets * cores_per_socket * thread_per_core + cpus = data["CPU(s):"] + # create replacement lines + lines = open(conf_file).read().splitlines() + new_lines = [] + found_nodes = False + in_nodes = False + for line in lines: + if line.startswith("# COMPUTER NODES START"): + if found_nodes: + raise IOError("'# COMPUTER NODES START' found multiple times") + found_nodes = True + in_nodes = True + elif line.startswith("# COMPUTER NODES END"): + if not in_nodes: + raise IOError("'# COMPUTER NODES END' found before start") + in_nodes = False + new_lines.extend( + [ + "# COMPUTER NODES START", + "NodeName={0} Sockets={1} CoresPerSocket={2} ThreadsPerCore={3} State=UNKNOWN".format( + hostname, sockets, cores_per_socket, thread_per_core + ), + "PartitionName={0} Nodes={1} Default=YES MaxTime=INFINITE State=UP MaxNodes=1 MaxCPUsPerNode={2}".format( + partition_name, hostname, cpus + ), + "# COMPUTER NODES END", + ] + ) + elif not in_nodes: + new_lines.append(line) + + # check for errors + if not found_nodes: + raise IOError("'# COMPUTER NODES START' not found") + if in_nodes: + raise IOError("'# COMPUTER NODES END' not found") + + # overwrite file if changes found + if lines != new_lines: + print( + "computer resources changed:\n" + + "\n".join( + context_diff(lines, new_lines, fromfile="before", tofile="after") + ), + ) + with open(conf_file, "w") as handle: + handle.write("\n".join(new_lines)) + else: + print("computer resources unchanged") + + +if __name__ == "__main__": + update_cpus("{{ slurm_hostname }}", "{{ slurm_partition_name }}") From b62ef9f6ace8b8e7a3415a4f2b5773c1461f142d Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Wed, 1 Jul 2020 21:07:11 +0100 Subject: [PATCH 2/7] use ansible for updating compute resources --- defaults/main.yml | 5 +++ files/set-resources.sh | 13 ++++++++ files/update-slurm-conf.sh | 11 +++++++ .../default/{playbook.yml => converge.yml} | 2 ++ tasks/hostname.yml | 2 +- tasks/main.yml | 20 ++++++----- tasks/resources_service.yml | 22 +++++++++++++ tasks/tests.yml | 6 +++- templates/config-playbook.yml.j2 | 33 +++++++++++++++++++ templates/slurm.conf | 4 +-- templates/update_conf.j2.py | 4 ++- 11 files changed, 108 insertions(+), 14 deletions(-) create mode 100644 files/set-resources.sh create mode 100755 files/update-slurm-conf.sh rename molecule/default/{playbook.yml => converge.yml} (79%) create mode 100644 tasks/resources_service.yml create mode 100644 templates/config-playbook.yml.j2 diff --git a/defaults/main.yml b/defaults/main.yml index a48458b..a0b7ac5 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -11,3 +11,8 @@ slurm_test_folder: "/tmp/slurm-tests" # Adds a system service that forces the hostname on startup # Note: This is e.g. necessary on cloud platforms like AWS slurm_hostname_service: false +slurm_set_hostname: false + +# Adds a system service that re-configures the slurm compute resources on startup +# Note: This is necessary when preparing image that can start on a different hardware than it was built on +slurm_resources_service: false diff --git a/files/set-resources.sh b/files/set-resources.sh new file mode 100644 index 0000000..42edaac --- /dev/null +++ b/files/set-resources.sh @@ -0,0 +1,13 @@ +#!/bin/sh +### BEGIN INIT INFO +# Provides: set-resources +# Required-Start: $remote_fs $syslog +# Required-Stop: $remote_fs $syslog +# X-Start-Before: slurmctld slurmd +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Set SLURM compute resources +# Description: Set SLURM compute resources +### END INIT INFO + +echo /usr/bin/slurm-update-resources diff --git a/files/update-slurm-conf.sh b/files/update-slurm-conf.sh new file mode 100755 index 0000000..7d15a95 --- /dev/null +++ b/files/update-slurm-conf.sh @@ -0,0 +1,11 @@ +#!/bin/sh + +if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then + echo "This script calls ansible-playbook to update the slurm configuration" + echo "You can change the maximum cpus allowed by adding '-e slurm_max_cpus=2'" + echo "" + /usr/bin/ansible-playbook --help + exit 0 +fi + +/usr/bin/ansible-playbook /etc/slurm-llnl/update-playbook.yml $1 diff --git a/molecule/default/playbook.yml b/molecule/default/converge.yml similarity index 79% rename from molecule/default/playbook.yml rename to molecule/default/converge.yml index 461666a..7fd61ed 100644 --- a/molecule/default/playbook.yml +++ b/molecule/default/converge.yml @@ -10,6 +10,8 @@ vars: - run_tests: true - cloud_platform: docker + - slurm_hostname_service: true + - slurm_resources_service: true roles: - role: marvel-nccr.slurm diff --git a/tasks/hostname.yml b/tasks/hostname.yml index 9a677ce..ad88d39 100644 --- a/tasks/hostname.yml +++ b/tasks/hostname.yml @@ -13,4 +13,4 @@ line: "\\1 {{ slurm_hostname }}" backrefs: true unsafe_writes: true - when: cloud_platform is defined and cloud_platform == 'docker' + when: slurm_set_hostname or (cloud_platform is defined and cloud_platform == 'docker') diff --git a/tasks/main.yml b/tasks/main.yml index b172273..7986250 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -45,17 +45,19 @@ dest: /etc/slurm-llnl/slurm.conf register: conf_template -- name: create confinguration update script +- name: create confinguration update playbook template: - src: update_conf.j2.py - dest: /usr/bin/update-slurm-conf - mode: "755" - register: conf_script + src: config-playbook.yml.j2 + dest: /etc/slurm-llnl/update-playbook.yml -- name: run configuration update script # noqa 503 - when: conf_template.changed or conf_script.changed - command: /usr/bin/update-slurm-conf - changed_when: false +- name: copy confinguration update executable + copy: + src: update-slurm-conf.sh + dest: /usr/bin/slurm-update-resources + mode: u=rwx,g=rx + +- include_tasks: resources_service.yml + when: slurm_resources_service # munge key already created by apt-get install # will not overwrite key due to 'creates' flag diff --git a/tasks/resources_service.yml b/tasks/resources_service.yml new file mode 100644 index 0000000..71bd166 --- /dev/null +++ b/tasks/resources_service.yml @@ -0,0 +1,22 @@ +--- +# Adds a system service that re-configures the slurm compute resources on startup +# Note: This is necessary when preparing image that can start on a different hardware than it was built on + +- name: copy init.d template + become: true + copy: + src: "set-resources.sh" + dest: "/etc/init.d/set-resources" + mode: 0755 + +- name: enable service to set the slurm compute resources + become: true + command: update-rc.d set-resources defaults + args: + creates: "/etc/rc5.d/S??set-resources" + +- name: start service to set the slurm compute resources + become: true + service: + name: set-resources + state: started diff --git a/tasks/tests.yml b/tasks/tests.yml index 65959e1..b5507ec 100644 --- a/tasks/tests.yml +++ b/tasks/tests.yml @@ -1,5 +1,9 @@ --- -- name: Creates directory +- name: run configuration update script + command: /usr/bin/slurm-update-resources + changed_when: false + +- name: create test directory file: path: "{{ slurm_test_folder }}" state: directory diff --git a/templates/config-playbook.yml.j2 b/templates/config-playbook.yml.j2 new file mode 100644 index 0000000..cf8a5d5 --- /dev/null +++ b/templates/config-playbook.yml.j2 @@ -0,0 +1,33 @@ +- name: Update SLURM configuration + hosts: localhost + + vars: + slurm_hostname: "{{ slurm_hostname }}" + slurm_partition_name: "{{ slurm_partition_name }}" +{% raw %} + slurm_conf_file: /etc/slurm-llnl/slurm.conf + slurm_max_cpus: "{{ ansible_processor_vcpus }}" + update_on_change: true + + tasks: + + - name: Update SLURM configuration + blockinfile: + path: "{{ slurm_conf_file }}" + marker: "# {mark} ANSIBLE MANAGED NODES" + block: | + NodeName={{ slurm_hostname }} Sockets={{ ansible_processor_count }} CoresPerSocket={{ ansible_processor_cores }} ThreadsPerCore={{ ansible_processor_threads_per_core }} State=UNKNOWN + PartitionName={{ slurm_partition_name }} Nodes={{ slurm_hostname }} Default=YES MaxTime=INFINITE State=UP MaxNodes=1 MaxCPUsPerNode={{ slurm_max_cpus }} + backup: yes + register: update + + - name: Restart Slurm + when: update.changed and update_on_change + become: true + service: + name: "{{ item }}" + state: restarted + with_items: + - slurmctld + - slurmd +{% endraw %} diff --git a/templates/slurm.conf b/templates/slurm.conf index 8257f90..0910cc4 100644 --- a/templates/slurm.conf +++ b/templates/slurm.conf @@ -52,7 +52,7 @@ SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log SlurmdLogFile=/var/log/slurm-llnl/slurmd.log # # -# COMPUTER NODES START +# BEGIN ANSIBLE MANAGED NODES NodeName={{ slurm_hostname }} Sockets={{ ansible_processor_count }} CoresPerSocket={{ ansible_processor_cores }} ThreadsPerCore={{ ansible_processor_threads_per_core }} State=UNKNOWN PartitionName={{ slurm_partition_name }} Nodes={{ slurm_hostname }} Default=YES MaxTime=INFINITE State=UP MaxNodes=1 MaxCPUsPerNode={{ ansible_processor_vcpus }} -# COMPUTER NODES END +# END ANSIBLE MANAGED NODES diff --git a/templates/update_conf.j2.py b/templates/update_conf.j2.py index 45350ee..a23f636 100755 --- a/templates/update_conf.j2.py +++ b/templates/update_conf.j2.py @@ -1,4 +1,6 @@ -#!/usr/bin/env python +#!{{ ansible_python_interpreter | default('/usr/bin/python') }} +from __future__ import print_function + from difflib import context_diff import json import re From 4e1a7879c33706652bc7d84860d85f42fbc41723 Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Wed, 1 Jul 2020 21:32:03 +0100 Subject: [PATCH 3/7] change name --- files/{set-resources.sh => slurm-resources.sh} | 2 +- tasks/resources_service.yml | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) rename files/{set-resources.sh => slurm-resources.sh} (79%) diff --git a/files/set-resources.sh b/files/slurm-resources.sh similarity index 79% rename from files/set-resources.sh rename to files/slurm-resources.sh index 42edaac..5d5b633 100644 --- a/files/set-resources.sh +++ b/files/slurm-resources.sh @@ -10,4 +10,4 @@ # Description: Set SLURM compute resources ### END INIT INFO -echo /usr/bin/slurm-update-resources +echo /usr/bin/slurm-update-resources -e update_on_change=false &> /var/log/slurm-update.log diff --git a/tasks/resources_service.yml b/tasks/resources_service.yml index 71bd166..8e2be83 100644 --- a/tasks/resources_service.yml +++ b/tasks/resources_service.yml @@ -5,18 +5,18 @@ - name: copy init.d template become: true copy: - src: "set-resources.sh" - dest: "/etc/init.d/set-resources" + src: "slurm-resources.sh" + dest: "/etc/init.d/slurm-resources" mode: 0755 - name: enable service to set the slurm compute resources become: true - command: update-rc.d set-resources defaults + command: update-rc.d slurm-resources defaults args: - creates: "/etc/rc5.d/S??set-resources" + creates: "/etc/rc5.d/S??slurm-resources" - name: start service to set the slurm compute resources become: true service: - name: set-resources + name: slurm-resources state: started From 52bedfec72a41df96360a97663d04056b58dc0ec Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Wed, 1 Jul 2020 22:19:22 +0100 Subject: [PATCH 4/7] remove old file --- ...lurm-conf.sh => update-slurm-resources.sh} | 0 tasks/main.yml | 2 +- templates/update_conf.j2.py | 74 ------------------- 3 files changed, 1 insertion(+), 75 deletions(-) rename files/{update-slurm-conf.sh => update-slurm-resources.sh} (100%) delete mode 100755 templates/update_conf.j2.py diff --git a/files/update-slurm-conf.sh b/files/update-slurm-resources.sh similarity index 100% rename from files/update-slurm-conf.sh rename to files/update-slurm-resources.sh diff --git a/tasks/main.yml b/tasks/main.yml index 7986250..6ca7361 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -52,7 +52,7 @@ - name: copy confinguration update executable copy: - src: update-slurm-conf.sh + src: update-slurm-update-slurm-resources.sh dest: /usr/bin/slurm-update-resources mode: u=rwx,g=rx diff --git a/templates/update_conf.j2.py b/templates/update_conf.j2.py deleted file mode 100755 index a23f636..0000000 --- a/templates/update_conf.j2.py +++ /dev/null @@ -1,74 +0,0 @@ -#!{{ ansible_python_interpreter | default('/usr/bin/python') }} -from __future__ import print_function - -from difflib import context_diff -import json -import re -import subprocess - - -def update_cpus(hostname, partition_name, conf_file="/etc/slurm-llnl/slurm.conf"): - # read cpu information (linux only) - data = subprocess.check_output(["lscpu", "--json"]) - # convert from string - data = json.loads(data) - # convert from list - data = {entry["field"]: entry["data"] for entry in data["lscpu"]} - # get required data - thread_per_core = data["Thread(s) per core:"] - cores_per_socket = data["Core(s) per socket:"] - sockets = data["Socket(s):"] - # cpus = sockets * cores_per_socket * thread_per_core - cpus = data["CPU(s):"] - # create replacement lines - lines = open(conf_file).read().splitlines() - new_lines = [] - found_nodes = False - in_nodes = False - for line in lines: - if line.startswith("# COMPUTER NODES START"): - if found_nodes: - raise IOError("'# COMPUTER NODES START' found multiple times") - found_nodes = True - in_nodes = True - elif line.startswith("# COMPUTER NODES END"): - if not in_nodes: - raise IOError("'# COMPUTER NODES END' found before start") - in_nodes = False - new_lines.extend( - [ - "# COMPUTER NODES START", - "NodeName={0} Sockets={1} CoresPerSocket={2} ThreadsPerCore={3} State=UNKNOWN".format( - hostname, sockets, cores_per_socket, thread_per_core - ), - "PartitionName={0} Nodes={1} Default=YES MaxTime=INFINITE State=UP MaxNodes=1 MaxCPUsPerNode={2}".format( - partition_name, hostname, cpus - ), - "# COMPUTER NODES END", - ] - ) - elif not in_nodes: - new_lines.append(line) - - # check for errors - if not found_nodes: - raise IOError("'# COMPUTER NODES START' not found") - if in_nodes: - raise IOError("'# COMPUTER NODES END' not found") - - # overwrite file if changes found - if lines != new_lines: - print( - "computer resources changed:\n" - + "\n".join( - context_diff(lines, new_lines, fromfile="before", tofile="after") - ), - ) - with open(conf_file, "w") as handle: - handle.write("\n".join(new_lines)) - else: - print("computer resources unchanged") - - -if __name__ == "__main__": - update_cpus("{{ slurm_hostname }}", "{{ slurm_partition_name }}") From 3214405bd06cd478e6139c7258b97ab1fe86e437 Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Wed, 1 Jul 2020 22:26:23 +0100 Subject: [PATCH 5/7] fix typo --- tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/main.yml b/tasks/main.yml index 6ca7361..2144f4e 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -52,7 +52,7 @@ - name: copy confinguration update executable copy: - src: update-slurm-update-slurm-resources.sh + src: update-slurm-resources.sh dest: /usr/bin/slurm-update-resources mode: u=rwx,g=rx From d2bc6053d60c2f2664dc7ea8a5cae24251ddec2c Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Thu, 2 Jul 2020 04:15:34 +0100 Subject: [PATCH 6/7] improve slurm resources service --- files/slurm-resources.sh | 37 +++++++++++++++++++++++++++++++- files/update-slurm-resources.sh | 2 +- templates/config-playbook.yml.j2 | 3 +++ 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/files/slurm-resources.sh b/files/slurm-resources.sh index 5d5b633..5ff4ba4 100644 --- a/files/slurm-resources.sh +++ b/files/slurm-resources.sh @@ -10,4 +10,39 @@ # Description: Set SLURM compute resources ### END INIT INFO -echo /usr/bin/slurm-update-resources -e update_on_change=false &> /var/log/slurm-update.log +run_update () +{ +# TODO ideally we would also output stderr to the log file +# but for some reason 2>&1 makes $* in slurm-update-resources also read the log path as a variable +/usr/bin/slurm-update-resources -e update_on_change=false > /var/log/slurm-resources.log +cat /var/log/slurm-resources.log +} + +case "$1" in + start) + run_update + ;; + stop) + echo "stopped" + ;; + restart) + run_update + ;; + try-restart|condrestart) + run_update + ;; + reload) + run_update + ;; + force-reload) + run_update + ;; + status) + echo "no status to report" + ;; + *) + COMMANDS="start|stop|restart|try-restart|reload|force-reload|status" + echo "Usage: `basename \"$0\"` ($COMMANDS)" >&2 + exit 2 # invalid or excess argument(s) + ;; +esac diff --git a/files/update-slurm-resources.sh b/files/update-slurm-resources.sh index 7d15a95..536f698 100755 --- a/files/update-slurm-resources.sh +++ b/files/update-slurm-resources.sh @@ -8,4 +8,4 @@ if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then exit 0 fi -/usr/bin/ansible-playbook /etc/slurm-llnl/update-playbook.yml $1 +/usr/bin/ansible-playbook /etc/slurm-llnl/update-playbook.yml $* diff --git a/templates/config-playbook.yml.j2 b/templates/config-playbook.yml.j2 index cf8a5d5..9809151 100644 --- a/templates/config-playbook.yml.j2 +++ b/templates/config-playbook.yml.j2 @@ -11,6 +11,9 @@ tasks: + - debug: + msg: "Run: {{ lookup('pipe', 'date +%Y-%m-%d-%H:%M:%S') }}" + - name: Update SLURM configuration blockinfile: path: "{{ slurm_conf_file }}" From 2517853b8d7bc26a90b0a97af16e4d56763f79eb Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Thu, 2 Jul 2020 09:03:42 +0100 Subject: [PATCH 7/7] updates after AWS+Vagrant testing --- ansible.cfg | 0 files/slurm-resources.sh | 5 +++-- files/update-slurm-resources.sh | 9 ++++++--- tasks/main.yml | 11 ++++++++++- tasks/tests.yml | 3 ++- templates/config-playbook.yml.j2 | 6 ++++-- 6 files changed, 25 insertions(+), 9 deletions(-) delete mode 100644 ansible.cfg diff --git a/ansible.cfg b/ansible.cfg deleted file mode 100644 index e69de29..0000000 diff --git a/files/slurm-resources.sh b/files/slurm-resources.sh index 5ff4ba4..23ab51a 100644 --- a/files/slurm-resources.sh +++ b/files/slurm-resources.sh @@ -14,8 +14,9 @@ run_update () { # TODO ideally we would also output stderr to the log file # but for some reason 2>&1 makes $* in slurm-update-resources also read the log path as a variable -/usr/bin/slurm-update-resources -e update_on_change=false > /var/log/slurm-resources.log -cat /var/log/slurm-resources.log +export ANSIBLE_LOCAL_TEMP=$HOME/.ansible/tmp +export ANSIBLE_REMOTE_TEMP=$HOME/.ansible/tmp +/usr/bin/slurm-update-resources -v } case "$1" in diff --git a/files/update-slurm-resources.sh b/files/update-slurm-resources.sh index 536f698..db22077 100755 --- a/files/update-slurm-resources.sh +++ b/files/update-slurm-resources.sh @@ -2,10 +2,13 @@ if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then echo "This script calls ansible-playbook to update the slurm configuration" - echo "You can change the maximum cpus allowed by adding '-e slurm_max_cpus=2'" + echo "You can add options such as:" echo "" - /usr/bin/ansible-playbook --help + echo "- Restart SLURM (for changes to take affect): -e restart_slurm=true" + echo "- Restart SLURM only on changes: -e restart_on_change=true" + echo "- Change the maximum cpus allowed: -e slurm_max_cpus=2" + echo "- Change the verbosity with: -vv" exit 0 fi -/usr/bin/ansible-playbook /etc/slurm-llnl/update-playbook.yml $* +/usr/bin/ansible-playbook /etc/slurm-llnl/update-playbook.yml $@ diff --git a/tasks/main.yml b/tasks/main.yml index 2144f4e..1c3a368 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -1,4 +1,10 @@ --- +- name: add ansible repository + become: true + apt_repository: + repo: 'ppa:ansible/ansible' + state: present + - name: Install apt packages become: true apt: @@ -8,6 +14,7 @@ - slurm-wlm-basic-plugins-dev - munge - sendmail + - ansible state: present update_cache: true @@ -46,15 +53,17 @@ register: conf_template - name: create confinguration update playbook + become: true template: src: config-playbook.yml.j2 dest: /etc/slurm-llnl/update-playbook.yml - name: copy confinguration update executable + become: true copy: src: update-slurm-resources.sh dest: /usr/bin/slurm-update-resources - mode: u=rwx,g=rx + mode: u=rwx,g=rx,o=rx - include_tasks: resources_service.yml when: slurm_resources_service diff --git a/tasks/tests.yml b/tasks/tests.yml index b5507ec..16136df 100644 --- a/tasks/tests.yml +++ b/tasks/tests.yml @@ -1,6 +1,7 @@ --- - name: run configuration update script - command: /usr/bin/slurm-update-resources + become: true + command: /usr/bin/slurm-update-resources -e restart_on_change=true changed_when: false - name: create test directory diff --git a/templates/config-playbook.yml.j2 b/templates/config-playbook.yml.j2 index 9809151..3b8e17c 100644 --- a/templates/config-playbook.yml.j2 +++ b/templates/config-playbook.yml.j2 @@ -7,7 +7,8 @@ {% raw %} slurm_conf_file: /etc/slurm-llnl/slurm.conf slurm_max_cpus: "{{ ansible_processor_vcpus }}" - update_on_change: true + restart_on_change: false + restart_slurm: false tasks: @@ -15,6 +16,7 @@ msg: "Run: {{ lookup('pipe', 'date +%Y-%m-%d-%H:%M:%S') }}" - name: Update SLURM configuration + become: true blockinfile: path: "{{ slurm_conf_file }}" marker: "# {mark} ANSIBLE MANAGED NODES" @@ -25,7 +27,7 @@ register: update - name: Restart Slurm - when: update.changed and update_on_change + when: (update.changed and restart_on_change | bool) or (restart_slurm | bool) become: true service: name: "{{ item }}"