Skip to content
This repository has been archived by the owner on Aug 9, 2024. It is now read-only.

Commit

Permalink
fix: set AccountingStoragePass to the correct value
Browse files Browse the repository at this point in the history
  • Loading branch information
jedel1043 committed Jul 19, 2024
1 parent 54b7a46 commit c17d519
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 25 deletions.
13 changes: 7 additions & 6 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import charms.hpc_libs.v0.slurm_ops as slurm
from charms.grafana_agent.v0.cos_agent import COSAgentProvider
from constants import CHARM_MAINTAINED_SLURM_CONF_PARAMETERS, SLURM_CONF_PATH
from constants import CHARM_MAINTAINED_SLURM_CONF_PARAMETERS, SLURM_CONF_PATH, SNAP_COMMON
from interface_slurmd import (
PartitionAvailableEvent,
PartitionUnavailableEvent,
Expand Down Expand Up @@ -120,6 +120,7 @@ def _on_install(self, event: InstallEvent) -> None:
self._legacy_manager.write_jwt_rsa(jwt_rsa)

self._slurmctld.enable()
self._slurmctld.munge.enable()
self._slurmctld.exporter.enable()
self.slurm_installed = True
except slurm.SlurmOpsError as e:
Expand Down Expand Up @@ -306,7 +307,7 @@ def _assemble_slurmctld_parameters() -> str:
accounting_params = {
"AccountingStorageHost": slurmdbd_host,
"AccountingStorageType": "accounting_storage/slurmdbd",
"AccountingStoragePass": "/var/run/munge/munge.socket.2",
"AccountingStoragePass": f"{SNAP_COMMON}/run/munge/munged.socket.2",
"AccountingStoragePort": "6819",
}

Expand Down Expand Up @@ -361,9 +362,9 @@ def _check_status(self) -> bool: # noqa C901

# NOTE: `check_munged` used to return False here because systemd was asked
# about `munge.service` rather than the snap unit `snap.slurm.munged`.
# if not self._legacy_manager.check_munged():
# self.unit.status = BlockedStatus("Error configuring munge key")
# return False
if not self._legacy_manager.check_munged():
self.unit.status = BlockedStatus("Error configuring munge key")
return False

self.unit.status = ActiveStatus("")
return True
Expand Down Expand Up @@ -405,7 +406,7 @@ def new_nodes(self, new_nodes: List[Any]) -> None:
@property
def hostname(self) -> str:
"""Return the hostname."""
return self._legacy_manager.hostname
return self._slurmctld.hostname

@property
def _slurmd_ingress_address(self) -> str:
Expand Down
22 changes: 11 additions & 11 deletions src/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,18 @@

from pathlib import Path

_SNAP_COMMON = Path("/var/snap/slurm/common")
SNAP_COMMON = Path("/var/snap/slurm/common")

SLURM_USER = "root"
SLURM_GROUP = "root"
SLURM_CONF_PATH = _SNAP_COMMON / "etc/slurm/slurm.conf"
CGROUP_CONF_PATH = _SNAP_COMMON / "etc/slurm/cgroup.conf"
JWT_KEY_PATH = _SNAP_COMMON / "var/lib/slurm/slurmctld/jwt_hs256.key"
SLURM_CONF_PATH = SNAP_COMMON / "etc/slurm/slurm.conf"
CGROUP_CONF_PATH = SNAP_COMMON / "etc/slurm/cgroup.conf"
JWT_KEY_PATH = SNAP_COMMON / "var/lib/slurm/slurmctld/jwt_hs256.key"

CHARM_MAINTAINED_SLURM_CONF_PARAMETERS = {
"AuthAltParameters": f"jwt_key={JWT_KEY_PATH}",
"AuthAltTypes": "auth/jwt",
"AuthInfo": f"{_SNAP_COMMON}/run/munge/munged.socket.2",
"AuthInfo": f"{SNAP_COMMON}/run/munge/munged.socket.2",
"AuthType": "auth/munge",
"GresTypes": "gpu",
"HealthCheckInterval": "600",
Expand All @@ -27,13 +27,13 @@
"SelectType": "select/cons_tres",
"SlurmctldPort": "6817",
"SlurmdPort": "6818",
"StateSaveLocation": f"{_SNAP_COMMON}/var/lib/slurm/slurmctld",
"SlurmdSpoolDir": f"{_SNAP_COMMON}/var/lib/slurm/slurmd",
"StateSaveLocation": f"{SNAP_COMMON}/var/lib/slurm/slurmctld",
"SlurmdSpoolDir": f"{SNAP_COMMON}/var/lib/slurm/slurmd",
"SlurmctldParameters": "enable_configless",
"SlurmctldLogFile": f"{_SNAP_COMMON}/var/log/slurm/slurmctld.log",
"SlurmdLogFile": f"{_SNAP_COMMON}/var/log/slurm/slurmd.log",
"SlurmdPidFile": f"{_SNAP_COMMON}/run/slurmd.pid",
"SlurmctldPidFile": f"{_SNAP_COMMON}/run/slurmctld.pid",
"SlurmctldLogFile": f"{SNAP_COMMON}/var/log/slurm/slurmctld.log",
"SlurmdLogFile": f"{SNAP_COMMON}/var/log/slurm/slurmd.log",
"SlurmdPidFile": f"{SNAP_COMMON}/run/slurmd.pid",
"SlurmctldPidFile": f"{SNAP_COMMON}/run/slurmctld.pid",
"SlurmUser": SLURM_USER,
"SlurmdUser": "root",
"RebootProgram": '"/usr/sbin/reboot --reboot"',
Expand Down
6 changes: 3 additions & 3 deletions src/slurmctld_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,19 +67,19 @@ def generate_jwt_rsa(self) -> str:

def check_munged(self) -> bool:
"""Check if munge is working correctly."""
if not systemd.service_running("munge"):
if not systemd.service_running("snap.slurm.munged"):
return False

output = ""
# check if munge is working, i.e., can use the credentials correctly
try:
logger.debug("## Testing if munge is working correctly")
munge = subprocess.Popen(
["munge", "-n"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
["slurm.munge", "-n"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
if munge is not None:
unmunge = subprocess.Popen(
["unmunge"], stdin=munge.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE
["slurm.unmunge"], stdin=munge.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
output = unmunge.communicate()[0].decode()
if "Success" in output:
Expand Down
13 changes: 8 additions & 5 deletions tests/unit/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,19 @@
from charm import SlurmctldCharm
from ops.model import BlockedStatus
from ops.testing import Harness
from charms.hpc_libs.v0.slurm_ops import SlurmOpsError

ops.testing.SIMULATE_CAN_CONNECT = True


class TestCharm(unittest.TestCase):
def setUp(self):
self.harness = Harness(SlurmctldCharm)
self.harness.set_leader(is_leader=True)
self.addCleanup(self.harness.cleanup)
self.harness.begin()

@patch("slurmctld_ops.SlurmctldManager.hostname", return_val="localhost")
@patch("charms.hpc_libs.v0.slurm_ops.SlurmManagerBase.hostname", return_val="localhost")
def test_hostname(self, hostname) -> None:
"""Test that the hostname property works."""
self.assertEqual(self.harness.charm.hostname, hostname)
Expand Down Expand Up @@ -73,16 +75,17 @@ def test_install_success(self, *_) -> None:
)

# @unittest.expectedFailure
@patch("slurmctld_ops.SlurmctldManager.install", return_value=False)
def test_install_fail(self, _) -> None:
@patch("charms.hpc_libs.v0.slurm_ops.install")
def test_install_fail(self, install) -> None:
"""Test that the on_install method works when slurmctld fails to install.
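Notes:
The install call is mocked to fail, so the unit should end up in a blocked status
(the old remark about a missing 'version' file no longer appears to apply — confirm).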
"""
install.side_effect = SlurmOpsError("err")
self.harness.charm.on.install.emit()
self.assertEqual(
self.harness.charm.unit.status, BlockedStatus("Error installing slurmctld")
self.harness.charm.unit.status, BlockedStatus("error installing slurmctld. check log for more info")
)

def test_check_status_slurm_not_installed(self) -> None:
Expand All @@ -96,7 +99,7 @@ def test_check_status_slurm_not_installed(self) -> None:
res, False, msg="_check_status returned value True instead of expected value False."
)

@patch("slurmctld_ops.SlurmctldManager.check_munged", return_value=False)
@patch("slurmctld_ops.LegacySlurmctldManager.check_munged", return_value=False)
def test_check_status_bad_munge(self, _) -> None:
"""Test that the check_status method works when munge encounters an error."""
setattr(self.harness.charm._stored, "slurm_installed", True) # Patch StoredState
Expand Down

0 comments on commit c17d519

Please sign in to comment.