Add script and service to automatically update compute resources (#6)
chrisjsewell authored Jul 2, 2020
2 parents e0451c7 + 2517853 commit db875b8
Showing 11 changed files with 170 additions and 7 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
*~
*.swp
.DS_Store
.vscode/
.galaxy_install_info
6 changes: 5 additions & 1 deletion defaults/main.yml
@@ -1,6 +1,5 @@
---
slurm_hostname: qmobile
slurm_cpus: 2
# Size of real memory on the node in megabytes
slurm_memory: 1024
slurm_user: slurm
@@ -12,3 +11,8 @@ slurm_test_folder: "/tmp/slurm-tests"
# Adds a system service that forces the hostname on startup
# Note: This is e.g. necessary on cloud platforms like AWS
slurm_hostname_service: false
slurm_set_hostname: false

# Adds a system service that re-configures the slurm compute resources on startup
# Note: This is necessary when preparing an image that may start on different hardware than it was built on
slurm_resources_service: false
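
For reference, a consuming playbook would switch these on via role vars — a minimal sketch along the lines of the test playbook further down (role name as published; all other vars assumed at their defaults):

    - hosts: all
      roles:
        - role: marvel-nccr.slurm
          vars:
            slurm_hostname_service: true
            slurm_resources_service: true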
49 changes: 49 additions & 0 deletions files/slurm-resources.sh
@@ -0,0 +1,49 @@
#!/bin/sh
### BEGIN INIT INFO
# Provides: set-resources
# Required-Start: $remote_fs $syslog
# Required-Stop: $remote_fs $syslog
# X-Start-Before: slurmctld slurmd
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: Set SLURM compute resources
# Description: Set SLURM compute resources
### END INIT INFO

run_update ()
{
# TODO: ideally we would also redirect stderr to the log file,
# but for some reason 2>&1 makes $* in slurm-update-resources also pick up the log path as a variable
export ANSIBLE_LOCAL_TEMP=$HOME/.ansible/tmp
export ANSIBLE_REMOTE_TEMP=$HOME/.ansible/tmp
/usr/bin/slurm-update-resources -v
}

case "$1" in
start)
run_update
;;
stop)
echo "stopped"
;;
restart)
run_update
;;
try-restart|condrestart)
run_update
;;
reload)
run_update
;;
force-reload)
run_update
;;
status)
echo "no status to report"
;;
*)
COMMANDS="start|stop|restart|try-restart|reload|force-reload|status"
echo "Usage: `basename \"$0\"` ($COMMANDS)" >&2
exit 2 # invalid or excess argument(s)
;;
esac
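
Since the init script simply delegates to slurm-update-resources, it can be exercised by hand once the role has been applied — a quick sanity check (paths per the tasks below; the marker text comes from the blockinfile task in the generated playbook):

    # trigger the same update the service performs at boot
    sudo service slurm-resources start
    # confirm the managed block in slurm.conf was (re)written
    grep -A 2 'BEGIN ANSIBLE MANAGED NODES' /etc/slurm-llnl/slurm.conf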
14 changes: 14 additions & 0 deletions files/update-slurm-resources.sh
@@ -0,0 +1,14 @@
#!/bin/sh

if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
echo "This script calls ansible-playbook to update the slurm configuration"
echo "You can add options such as:"
echo ""
echo "- Restart SLURM (for changes to take affect): -e restart_slurm=true"
echo "- Restart SLURM only on changes: -e restart_on_change=true"
echo "- Change the maximum cpus allowed: -e slurm_max_cpus=2"
echo "- Change the verbosity with: -vv"
exit 0
fi

# pass any extra options (e.g. -e overrides, -v) straight through to ansible-playbook
/usr/bin/ansible-playbook /etc/slurm-llnl/update-playbook.yml "$@"
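
Taken together with the help text above, typical invocations look like:

    # rewrite the node definition, restarting SLURM only if it changed
    slurm-update-resources -e restart_on_change=true
    # cap the partition at 2 CPUs, with extra verbosity
    slurm-update-resources -e slurm_max_cpus=2 -vv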
@@ -10,6 +10,8 @@
vars:
- run_tests: true
- cloud_platform: docker
- slurm_hostname_service: true
- slurm_resources_service: true

roles:
- role: marvel-nccr.slurm
2 changes: 1 addition & 1 deletion tasks/hostname.yml
@@ -13,4 +13,4 @@
line: "\\1 {{ slurm_hostname }}"
backrefs: true
unsafe_writes: true
when: cloud_platform is defined and cloud_platform == 'docker'
when: slurm_set_hostname or (cloud_platform is defined and cloud_platform == 'docker')
24 changes: 24 additions & 0 deletions tasks/main.yml
@@ -1,4 +1,10 @@
---
- name: add ansible repository
become: true
apt_repository:
repo: 'ppa:ansible/ansible'
state: present

- name: Install apt packages
become: true
apt:
@@ -8,6 +14,7 @@
- slurm-wlm-basic-plugins-dev
- munge
- sendmail
- ansible
state: present
update_cache: true

@@ -43,6 +50,23 @@
src: slurm.conf
owner: "{{ slurm_user }}"
dest: /etc/slurm-llnl/slurm.conf
register: conf_template

- name: create configuration update playbook
become: true
template:
src: config-playbook.yml.j2
dest: /etc/slurm-llnl/update-playbook.yml

- name: copy configuration update executable
become: true
copy:
src: update-slurm-resources.sh
dest: /usr/bin/slurm-update-resources
mode: u=rwx,g=rx,o=rx

- include_tasks: resources_service.yml
when: slurm_resources_service

# munge key already created by apt-get install
# will not overwrite key due to 'creates' flag
22 changes: 22 additions & 0 deletions tasks/resources_service.yml
@@ -0,0 +1,22 @@
---
# Adds a system service that re-configures the slurm compute resources on startup
# Note: This is necessary when preparing an image that may start on different hardware than it was built on

- name: copy init.d template
become: true
copy:
src: "slurm-resources.sh"
dest: "/etc/init.d/slurm-resources"
mode: 0755

- name: enable service to set the slurm compute resources
become: true
command: update-rc.d slurm-resources defaults
args:
creates: "/etc/rc5.d/S??slurm-resources"

- name: start service to set the slurm compute resources
become: true
service:
name: slurm-resources
state: started
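
To verify the registration, one can check for the symlinks that update-rc.d installs (the 'creates' guard above keys off the runlevel-5 start link):

    # S??slurm-resources should appear among the runlevel-5 start links
    ls /etc/rc5.d/ | grep slurm-resources
    # the script's status action just prints a placeholder message
    sudo service slurm-resources status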
7 changes: 6 additions & 1 deletion tasks/tests.yml
@@ -1,5 +1,10 @@
---
- name: Creates directory
- name: run configuration update script
become: true
command: /usr/bin/slurm-update-resources -e restart_on_change=true
changed_when: false

- name: create test directory
file:
path: "{{ slurm_test_folder }}"
state: directory
38 changes: 38 additions & 0 deletions templates/config-playbook.yml.j2
@@ -0,0 +1,38 @@
- name: Update SLURM configuration
hosts: localhost

vars:
slurm_hostname: "{{ slurm_hostname }}"
slurm_partition_name: "{{ slurm_partition_name }}"
{% raw %}
slurm_conf_file: /etc/slurm-llnl/slurm.conf
slurm_max_cpus: "{{ ansible_processor_vcpus }}"
restart_on_change: false
restart_slurm: false

tasks:

- debug:
msg: "Run: {{ lookup('pipe', 'date +%Y-%m-%d-%H:%M:%S') }}"

- name: Update SLURM configuration
become: true
blockinfile:
path: "{{ slurm_conf_file }}"
marker: "# {mark} ANSIBLE MANAGED NODES"
block: |
NodeName={{ slurm_hostname }} Sockets={{ ansible_processor_count }} CoresPerSocket={{ ansible_processor_cores }} ThreadsPerCore={{ ansible_processor_threads_per_core }} State=UNKNOWN
PartitionName={{ slurm_partition_name }} Nodes={{ slurm_hostname }} Default=YES MaxTime=INFINITE State=UP MaxNodes=1 MaxCPUsPerNode={{ slurm_max_cpus }}
backup: yes
register: update

- name: Restart Slurm
when: (update.changed and restart_on_change | bool) or (restart_slurm | bool)
become: true
service:
name: "{{ item }}"
state: restarted
with_items:
- slurmctld
- slurmd
{% endraw %}
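
The Sockets/CoresPerSocket/ThreadsPerCore values come from Ansible's gathered hardware facts; to see what they resolve to on a given machine, the standard setup module can be queried directly:

    ansible localhost -m setup -a 'filter=ansible_processor*'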
11 changes: 7 additions & 4 deletions templates/slurm.conf
@@ -37,7 +37,9 @@ SchedulerType=sched/backfill
# cons_res: schedule individual cores
SelectType=select/cons_res
SelectTypeParameters=CR_Core
#
# this ensures submissions fail if they ask for more resources than are available on the partition
EnforcePartLimits=ALL
#
#
# LOGGING AND ACCOUNTING
AccountingStorageType=accounting_storage/none
@@ -50,6 +52,7 @@ SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log
SlurmdLogFile=/var/log/slurm-llnl/slurmd.log
#
#
# COMPUTE NODES
NodeName={{ slurm_hostname }} CPUs={{ slurm_cpus }} CoresPerSocket={{ slurm_cpus }} State=UNKNOWN
PartitionName={{ slurm_partition_name }} Nodes={{ slurm_hostname }} Default=YES MaxTime=INFINITE State=UP
# BEGIN ANSIBLE MANAGED NODES
NodeName={{ slurm_hostname }} Sockets={{ ansible_processor_count }} CoresPerSocket={{ ansible_processor_cores }} ThreadsPerCore={{ ansible_processor_threads_per_core }} State=UNKNOWN
PartitionName={{ slurm_partition_name }} Nodes={{ slurm_hostname }} Default=YES MaxTime=INFINITE State=UP MaxNodes=1 MaxCPUsPerNode={{ ansible_processor_vcpus }}
# END ANSIBLE MANAGED NODES
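
Once slurmctld and slurmd pick up the rewritten block, the topology SLURM actually registered can be checked with the usual client tools (qmobile being the role's default slurm_hostname):

    sinfo -N -l                   # node list with sockets/cores/threads
    scontrol show node qmobile    # full node record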
