From c17d51901c36b5b47498d100d01df75764fad54f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Juli=C3=A1n=20Espina?= Date: Fri, 19 Jul 2024 17:17:46 -0600 Subject: [PATCH] fix: set `AccountingStoragePass` to the correct value --- src/charm.py | 13 +++++++------ src/constants.py | 22 +++++++++++----------- src/slurmctld_ops.py | 6 +++--- tests/unit/test_charm.py | 13 ++++++++----- 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/src/charm.py b/src/charm.py index 315fad2..5d630dc 100755 --- a/src/charm.py +++ b/src/charm.py @@ -11,7 +11,7 @@ import charms.hpc_libs.v0.slurm_ops as slurm from charms.grafana_agent.v0.cos_agent import COSAgentProvider -from constants import CHARM_MAINTAINED_SLURM_CONF_PARAMETERS, SLURM_CONF_PATH +from constants import CHARM_MAINTAINED_SLURM_CONF_PARAMETERS, SLURM_CONF_PATH, SNAP_COMMON from interface_slurmd import ( PartitionAvailableEvent, PartitionUnavailableEvent, @@ -120,6 +120,7 @@ def _on_install(self, event: InstallEvent) -> None: self._legacy_manager.write_jwt_rsa(jwt_rsa) self._slurmctld.enable() + self._slurmctld.munge.enable() self._slurmctld.exporter.enable() self.slurm_installed = True except slurm.SlurmOpsError as e: @@ -306,7 +307,7 @@ def _assemble_slurmctld_parameters() -> str: accounting_params = { "AccountingStorageHost": slurmdbd_host, "AccountingStorageType": "accounting_storage/slurmdbd", - "AccountingStoragePass": "/var/run/munge/munge.socket.2", + "AccountingStoragePass": f"{SNAP_COMMON}/run/munge/munged.socket.2", "AccountingStoragePort": "6819", } @@ -361,9 +362,9 @@ def _check_status(self) -> bool: # noqa C901 # FIXME: Returns false because systemd is looking for the # `munge.service` file and not the snap one. - # if not self._legacy_manager.check_munged(): - # self.unit.status = BlockedStatus("Error configuring munge key") - # return False + if not self._legacy_manager.check_munged(): + self.unit.status = BlockedStatus("Error configuring munge key") + return False self.unit.status = ActiveStatus("") return True @@ -405,7 +406,7 @@ def new_nodes(self, new_nodes: List[Any]) -> None: @property def hostname(self) -> str: """Return the hostname.""" - return self._legacy_manager.hostname + return self._slurmctld.hostname @property def _slurmd_ingress_address(self) -> str: diff --git a/src/constants.py b/src/constants.py index e61d16b..8deec02 100644 --- a/src/constants.py +++ b/src/constants.py @@ -5,18 +5,18 @@ from pathlib import Path -_SNAP_COMMON = Path("/var/snap/slurm/common") +SNAP_COMMON = Path("/var/snap/slurm/common") SLURM_USER = "root" SLURM_GROUP = "root" -SLURM_CONF_PATH = _SNAP_COMMON / "etc/slurm/slurm.conf" -CGROUP_CONF_PATH = _SNAP_COMMON / "etc/slurm/cgroup.conf" -JWT_KEY_PATH = _SNAP_COMMON / "var/lib/slurm/slurmctld/jwt_hs256.key" +SLURM_CONF_PATH = SNAP_COMMON / "etc/slurm/slurm.conf" +CGROUP_CONF_PATH = SNAP_COMMON / "etc/slurm/cgroup.conf" +JWT_KEY_PATH = SNAP_COMMON / "var/lib/slurm/slurmctld/jwt_hs256.key" CHARM_MAINTAINED_SLURM_CONF_PARAMETERS = { "AuthAltParameters": f"jwt_key={JWT_KEY_PATH}", "AuthAltTypes": "auth/jwt", - "AuthInfo": f"{_SNAP_COMMON}/run/munge/munged.socket.2", + "AuthInfo": f"{SNAP_COMMON}/run/munge/munged.socket.2", "AuthType": "auth/munge", "GresTypes": "gpu", "HealthCheckInterval": "600", @@ -27,13 +27,13 @@ "SelectType": "select/cons_tres", "SlurmctldPort": "6817", "SlurmdPort": "6818", - "StateSaveLocation": f"{_SNAP_COMMON}/var/lib/slurm/slurmctld", - "SlurmdSpoolDir": f"{_SNAP_COMMON}/var/lib/slurm/slurmd", + "StateSaveLocation": f"{SNAP_COMMON}/var/lib/slurm/slurmctld", + "SlurmdSpoolDir": f"{SNAP_COMMON}/var/lib/slurm/slurmd", "SlurmctldParameters": "enable_configless", - "SlurmctldLogFile": f"{_SNAP_COMMON}/var/log/slurm/slurmctld.log", - "SlurmdLogFile": f"{_SNAP_COMMON}/var/log/slurm/slurmd.log", - "SlurmdPidFile": f"{_SNAP_COMMON}/run/slurmd.pid", - "SlurmctldPidFile": f"{_SNAP_COMMON}/run/slurmctld.pid", + "SlurmctldLogFile": f"{SNAP_COMMON}/var/log/slurm/slurmctld.log", + "SlurmdLogFile": f"{SNAP_COMMON}/var/log/slurm/slurmd.log", + "SlurmdPidFile": f"{SNAP_COMMON}/run/slurmd.pid", + "SlurmctldPidFile": f"{SNAP_COMMON}/run/slurmctld.pid", "SlurmUser": SLURM_USER, "SlurmdUser": "root", "RebootProgram": '"/usr/sbin/reboot --reboot"', diff --git a/src/slurmctld_ops.py b/src/slurmctld_ops.py index 9d90194..5435829 100644 --- a/src/slurmctld_ops.py +++ b/src/slurmctld_ops.py @@ -67,7 +67,7 @@ def generate_jwt_rsa(self) -> str: def check_munged(self) -> bool: """Check if munge is working correctly.""" - if not systemd.service_running("munge"): + if not systemd.service_running("snap.slurm.munged"): return False output = "" @@ -75,11 +75,11 @@ def check_munged(self) -> bool: try: logger.debug("## Testing if munge is working correctly") munge = subprocess.Popen( - ["munge", "-n"], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ["slurm.munge", "-n"], stdout=subprocess.PIPE, stderr=subprocess.PIPE ) if munge is not None: unmunge = subprocess.Popen( - ["unmunge"], stdin=munge.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ["slurm.unmunge"], stdin=munge.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE ) output = unmunge.communicate()[0].decode() if "Success" in output: diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index a8fe7e3..0453cdd 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -22,6 +22,7 @@ from charm import SlurmctldCharm from ops.model import BlockedStatus from ops.testing import Harness +from charms.hpc_libs.v0.slurm_ops import SlurmOpsError ops.testing.SIMULATE_CAN_CONNECT = True @@ -29,10 +30,11 @@ class TestCharm(unittest.TestCase): def setUp(self): self.harness = Harness(SlurmctldCharm) + self.harness.set_leader(is_leader=True) self.addCleanup(self.harness.cleanup) self.harness.begin() - @patch("slurmctld_ops.SlurmctldManager.hostname", return_val="localhost") + @patch("charms.hpc_libs.v0.slurm_ops.SlurmManagerBase.hostname", return_val="localhost") def test_hostname(self, hostname) -> None: """Test that the hostname property works.""" self.assertEqual(self.harness.charm.hostname, hostname) @@ -73,16 +75,17 @@ def test_install_success(self, *_) -> None: ) # @unittest.expectedFailure - @patch("slurmctld_ops.SlurmctldManager.install", return_value=False) - def test_install_fail(self, _) -> None: + @patch("charms.hpc_libs.v0.slurm_ops.install") + def test_install_fail(self, install) -> None: """Test that the on_install method works when slurmctld fails to install. Notes: This method is expected to fail due to the 'version' file missing. """ + install.side_effect = SlurmOpsError("err") self.harness.charm.on.install.emit() self.assertEqual( - self.harness.charm.unit.status, BlockedStatus("Error installing slurmctld") + self.harness.charm.unit.status, BlockedStatus("error installing slurmctld. check log for more info") ) def test_check_status_slurm_not_installed(self) -> None: @@ -96,7 +99,7 @@ def test_check_status_slurm_not_installed(self) -> None: res, False, msg="_check_status returned value True instead of expected value False." ) - @patch("slurmctld_ops.SlurmctldManager.check_munged", return_value=False) + @patch("slurmctld_ops.LegacySlurmctldManager.check_munged", return_value=False) def test_check_status_bad_munge(self, _) -> None: """Test that the check_status method works when munge encounters an error.""" setattr(self.harness.charm._stored, "slurm_installed", True) # Patch StoredState