From dab1e1ba8ee4230befec06d9932fefac77e471eb Mon Sep 17 00:00:00 2001 From: Xavier Pillons Date: Fri, 26 Apr 2024 15:57:07 +0200 Subject: [PATCH] Fix/1898 (#1899) * move update packages in scheduler's roles * move update package role to main slurm task * retry apt update * restart service after update * check pbs connection * stop and start pbs services * typo in sbatch instruction --- .../slurm/impi_pingpong/pingpong.sh | 2 +- playbooks/ood.yml | 2 +- playbooks/roles/pbsserver/tasks/main.yml | 34 ++++++++++++++++--- playbooks/roles/pkg_update/tasks/Ubuntu.yml | 13 +++++++ playbooks/roles/slurm/tasks/main.yml | 8 +++++ playbooks/roles/slurm/tasks/slurmserver.yml | 1 + playbooks/scheduler.yml | 7 ---- 7 files changed, 53 insertions(+), 14 deletions(-) diff --git a/playbooks/files/ood_templates/slurm/impi_pingpong/pingpong.sh b/playbooks/files/ood_templates/slurm/impi_pingpong/pingpong.sh index 6f5dba333..3f4222bbc 100644 --- a/playbooks/files/ood_templates/slurm/impi_pingpong/pingpong.sh +++ b/playbooks/files/ood_templates/slurm/impi_pingpong/pingpong.sh @@ -5,7 +5,7 @@ #SBATCH -p hpc #SBATCH -t 5 #SBATCH --export=NONE -#SBACTH --exclusive +#SBATCH --exclusive source /etc/profile.d/modules.sh module use /usr/share/Modules/modulefiles diff --git a/playbooks/ood.yml b/playbooks/ood.yml index 8d35e3738..220db6799 100644 --- a/playbooks/ood.yml +++ b/playbooks/ood.yml @@ -628,4 +628,4 @@ apply: become: true vars: - packages_to_exclude_from_upgrade: "{{ (['ondemand','amlfs', 'jetpack8'] if ( lustre.create | default(false)) else ['ondemand', 'jetpack8']) }}" + packages_to_exclude_from_upgrade: "{{ (['ondemand','amlfs'] if ( lustre.create | default(false)) else ['ondemand']) }}" diff --git a/playbooks/roles/pbsserver/tasks/main.yml b/playbooks/roles/pbsserver/tasks/main.yml index 4e24f8cc7..417bef285 100644 --- a/playbooks/roles/pbsserver/tasks/main.yml +++ b/playbooks/roles/pbsserver/tasks/main.yml @@ -117,13 +117,37 @@ args: chdir: /opt/cycle/pbspro -- name: Restart pbs-server - service: - name: pbs - state: restarted - - name: create cron entry to remove old accounting files cron: name: "remove PBS accounting files older than 90 days" special_time: daily job: "find /var/spool/pbs/server_logs -mtime +90 -type f -print -exec rm {} +" + +- name: Update Packages + include_role: + name: pkg_update + apply: + become: true + +- name: stop postgresql + service: + name: postgresql + state: stopped + +- name: stop pbs-server + service: + name: pbs + state: stopped + +- name: start postgresql + service: + name: postgresql + state: started + +- name: start pbs-server + service: + name: pbs + state: started + +- name: check pbs connection + command: qstat diff --git a/playbooks/roles/pkg_update/tasks/Ubuntu.yml b/playbooks/roles/pkg_update/tasks/Ubuntu.yml index ec65ed1a5..5747f4598 100644 --- a/playbooks/roles/pkg_update/tasks/Ubuntu.yml +++ b/playbooks/roles/pkg_update/tasks/Ubuntu.yml @@ -15,6 +15,19 @@ ansible.builtin.apt: name: "*" state: latest + # https://github.com/ansible/ansible/issues/51663 + # There has been an intermittent issue with this task where it would fail and print the error: + # + # Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), is another process + # using it? + # + # The reason for this is unclear. It's not from unattended-upgrades as that has already been + # uninstalled when creating the base image. The workaround for now is to simply retry this task + # several times in the event that it fails, with a small delay between each attempt. + register: result + until: result is not failed + retries: 5 + delay: 15 - name: Check if reboot is required stat: diff --git a/playbooks/roles/slurm/tasks/main.yml b/playbooks/roles/slurm/tasks/main.yml index a6b9df7ed..6c50d150b 100644 --- a/playbooks/roles/slurm/tasks/main.yml +++ b/playbooks/roles/slurm/tasks/main.yml @@ -14,3 +14,11 @@ - import_tasks: '{{slurm_role}}.yml' become: true + +- name: Update Packages + include_role: + name: pkg_update + apply: + become: true + vars: + packages_to_exclude_from_upgrade: "['jetpack8']" diff --git a/playbooks/roles/slurm/tasks/slurmserver.yml b/playbooks/roles/slurm/tasks/slurmserver.yml index 1a28a07d1..1ed494375 100644 --- a/playbooks/roles/slurm/tasks/slurmserver.yml +++ b/playbooks/roles/slurm/tasks/slurmserver.yml @@ -129,3 +129,4 @@ - import_tasks: pyxis.yml become: true tags: [ 'pyxis' ] + diff --git a/playbooks/scheduler.yml b/playbooks/scheduler.yml index 62f58f28a..72d8e99e1 100644 --- a/playbooks/scheduler.yml +++ b/playbooks/scheduler.yml @@ -44,10 +44,3 @@ cc_webserverpath: '{{cyclecloud.web_server_path | default("")}}' when: ( queue_manager is defined and queue_manager == "slurm" ) - - name: Update Packages - include_role: - name: pkg_update - apply: - become: true - vars: - packages_to_exclude_from_upgrade: "['jetpack8']"