Fbhuiyan2 -- adding Sophia to default configs #386

Open · wants to merge 13 commits into base: main
22 changes: 22 additions & 0 deletions balsam/config/defaults/alcf_crux/job-template.sh
@@ -0,0 +1,22 @@
#!/bin/bash
#PBS -l select={{ num_nodes }}:system=crux,place=scatter
#PBS -l walltime={{ wall_time_min//60 | int }}:{{ wall_time_min%60 | int }}:00
#PBS -l filesystems=home:grand:eagle
#PBS -A {{ project }}
#PBS -q {{ queue }}

export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
export http_proxy=http://proxy.alcf.anl.gov:3128
export https_proxy=http://proxy.alcf.anl.gov:3128
export ftp_proxy=http://proxy.alcf.anl.gov:3128

# removed: export PMI_NO_FORK=1
export BALSAM_SITE_PATH={{balsam_site_path}}
cd $BALSAM_SITE_PATH

echo "Starting balsam launcher at $(date)"
{{launcher_cmd}} -j {{job_mode}} -t {{wall_time_min - 2}} \
{% for k, v in filter_tags.items() %} --tag {{k}}={{v}} {% endfor %} \
{{partitions}}
echo "Balsam launcher done at $(date)"
21 changes: 21 additions & 0 deletions balsam/config/defaults/alcf_crux/settings.yml
@@ -0,0 +1,21 @@
title: "Crux (ALCF)"

compute_node: balsam.platform.compute_node.CruxNode
mpi_app_launcher: balsam.platform.app_run.CruxRun
local_app_launcher: balsam.platform.app_run.LocalAppRun
mpirun_allows_node_packing: true

serial_mode_startup_params:
cpu_affinity: none

scheduler_class: balsam.platform.scheduler.PBSScheduler
allowed_queues:
workq-route:
max_nodes: 512
max_queued_jobs: 20
max_walltime: 1440

allowed_projects:
- datascience

optional_batch_job_params: {}
22 changes: 22 additions & 0 deletions balsam/config/defaults/alcf_sophia/job-template.sh
@@ -0,0 +1,22 @@
#!/bin/bash
#PBS -l select={{ num_nodes }}:ncpus=1
#PBS -l walltime={{ wall_time_min//60 | int }}:{{ wall_time_min%60 | int }}:00
#PBS -l filesystems=home:grand:eagle
#PBS -A {{ project }}
#PBS -q {{ queue }}

export http_proxy="http://proxy:3128"
export https_proxy="http://proxy:3128"

# Load required modules
module load compilers/openmpi/5.0.3

# removed: export PMI_NO_FORK=1
export BALSAM_SITE_PATH={{balsam_site_path}}
cd $BALSAM_SITE_PATH

echo "Starting balsam launcher at $(date)"
{{launcher_cmd}} -j {{job_mode}} -t {{wall_time_min - 2}} \
{% for k, v in filter_tags.items() %} --tag {{k}}={{v}} {% endfor %} \
{{partitions}}
echo "Balsam launcher done at $(date)"
35 changes: 35 additions & 0 deletions balsam/config/defaults/alcf_sophia/settings.yml
@@ -0,0 +1,35 @@
title: "Sophia (ALCF)"

compute_node: balsam.platform.compute_node.SophiaNode
mpi_app_launcher: balsam.platform.app_run.SophiaRun
local_app_launcher: balsam.platform.app_run.LocalAppRun
mpirun_allows_node_packing: true

serial_mode_startup_params:
cpu_affinity: none

scheduler_class: balsam.platform.scheduler.PBSScheduler
allowed_queues:
by-gpu:
max_nodes: 1
max_walltime: 720
min_walltime: 5
max_queued_jobs: 20
by-node:
max_nodes: 8
max_walltime: 720
min_walltime: 5
max_queued_jobs: 20
bigmem:
max_nodes: 1
max_walltime: 720
min_walltime: 5
max_queued_jobs: 20

allowed_projects:
- datascience

optional_batch_job_params:
mig_count: "0" # Use -x mig_count='2' (or '3' or '7') to split GPUs N-ways

globus_endpoint_id: 08925f04-569f-11e7-bef8-22000b9a448b # The local Globus endpoint ID
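As an illustration of how the allowed_queues block constrains batch jobs, here is a minimal sketch (not Balsam's own validation, which is more thorough); the settings path and the requested values are hypothetical.

import yaml  # PyYAML

# Hypothetical batch-job request to check against the Sophia queue limits above.
requested = {"queue": "by-node", "num_nodes": 4, "wall_time_min": 60}

with open("balsam/config/defaults/alcf_sophia/settings.yml") as f:
    settings = yaml.safe_load(f)

queue = settings["allowed_queues"].get(requested["queue"])
if queue is None:
    raise ValueError(f"unknown queue: {requested['queue']}")
if requested["num_nodes"] > queue["max_nodes"]:
    raise ValueError("too many nodes for this queue")
if not queue["min_walltime"] <= requested["wall_time_min"] <= queue["max_walltime"]:
    raise ValueError("walltime outside the allowed range")
print("request fits the queue limits")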
4 changes: 4 additions & 0 deletions balsam/platform/app_run/__init__.py
@@ -8,6 +8,8 @@
from .summit import SummitJsrun
from .theta import ThetaAprun
from .theta_gpu import ThetaGPURun
from .sophia import SophiaRun
from .crux import CruxRun

__all__ = [
"AppRun",
@@ -21,4 +23,6 @@
"SummitJsrun",
"AuroraRun",
"PerlmutterRun",
"SophiaRun",
"CruxRun",
]
70 changes: 70 additions & 0 deletions balsam/platform/app_run/crux.py
@@ -0,0 +1,70 @@
import logging
import os

from .app_run import SubprocessAppRun

logger = logging.getLogger(__name__)


class CruxRun(SubprocessAppRun):
"""
Implements application launch for the Crux system.

This class constructs the appropriate command line for launching applications
using `mpiexec`, tailored for the Crux hardware and scheduler.

Crux Specifications:
- CPU-only system with dual AMD EPYC 7742 64-Core Processors per node.
- Each core supports up to two hyperthreads (total 256 threads per node).
- Uses PBS scheduler for job management.

Example mpiexec command from Crux submission script:
mpiexec -n total_ranks --ppn ranks_per_node --depth=depth --cpu-bind depth \
--env OMP_NUM_THREADS=num_threads --env OMP_PROC_BIND=true --env OMP_PLACES=cores \
executable
"""

def _build_cmdline(self) -> str:
        node_hostnames = list(self._node_spec.hostnames)
ntasks = self.get_num_ranks()
nranks_per_node = self._ranks_per_node
nthreads = self._threads_per_rank
cpus_per_rank = self.get_cpus_per_rank()
cpu_bind = self._launch_params.get("cpu_bind", "depth")

depth = nthreads
if cpu_bind == "core":
depth = cpus_per_rank

mpi_args = [
"mpiexec",
"-n", ntasks,
"--ppn", nranks_per_node,
"--hosts", ",".join(node_hostnames),
"--depth", depth,
"--cpu-bind", cpu_bind,
]

# Add any additional launch parameters
for key, value in self._launch_params.items():
            if key not in ["--ppn", "ppn", "--cpu-bind", "cpu-bind", "cpu_bind", "--depth", "depth"]:
mpi_args.append(str(key))
if value: # if value is not empty; like the flag --verbose has no value
mpi_args.append(value)

mpi_args.append(self._cmdline)

cmd = " ".join(str(arg) for arg in mpi_args)
return cmd

def _set_envs(self) -> None:
envs = os.environ.copy()
envs.update(self._envs)
        # Note: app_run.py also sets OMP_NUM_THREADS; it is set here explicitly along with the OpenMP binding defaults below.
envs["OMP_NUM_THREADS"] = str(self._threads_per_rank)
envs["OMP_PROC_BIND"] = "true"
envs["OMP_PLACES"] = "cores"
self._envs = envs
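As an illustration, a small standalone sketch (not Balsam code) of the command string that _build_cmdline above is expected to assemble; the hostnames and the application command line are made up.

# Hypothetical values: 2 Crux nodes, 32 ranks per node, depth 4, cpu-bind=depth.
hostnames = ["crux-node-01", "crux-node-02"]
ntasks, ranks_per_node, depth = 64, 32, 4
app = "./my_app --input in.dat"  # placeholder application command

mpi_args = [
    "mpiexec",
    "-n", ntasks,
    "--ppn", ranks_per_node,
    "--hosts", ",".join(hostnames),
    "--depth", depth,
    "--cpu-bind", "depth",
    app,
]
print(" ".join(str(a) for a in mpi_args))
# mpiexec -n 64 --ppn 32 --hosts crux-node-01,crux-node-02 --depth 4 --cpu-bind depth ./my_app --input in.dat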

31 changes: 31 additions & 0 deletions balsam/platform/app_run/sophia.py
@@ -0,0 +1,31 @@
import os

from .app_run import SubprocessAppRun


class SophiaRun(SubprocessAppRun):
"""
https://www.open-mpi.org/doc/v3.0/man1/mpiexec.1.php
"""

def _build_cmdline(self) -> str:
        node_ids = list(self._node_spec.hostnames)
env_args = [("-x", var) for var in self._envs.keys()]
nid_str = ",".join(map(str, node_ids))
hostfile = os.environ["PBS_NODEFILE"]
args = [
"mpirun",
"-hostfile",
hostfile,
"--oversubscribe",
"--bind-to none",
"-n",
self.get_num_ranks(),
"-npernode",
str(self._ranks_per_node),
*[arg for pair in env_args for arg in pair],
"--host",
nid_str,
self._cmdline,
]
return " ".join(str(arg) for arg in args)
4 changes: 4 additions & 0 deletions balsam/platform/compute_node/__init__.py
@@ -7,6 +7,8 @@
from .default import DefaultNode
from .nersc_perlmutter import PerlmutterNode
from .summit_node import SummitNode
from .alcf_sophia_node import SophiaNode
from .alcf_crux_node import CruxNode

__all__ = [
"DefaultNode",
@@ -17,5 +19,7 @@
"PerlmutterNode",
"PolarisNode",
"AuroraNode",
"SophiaNode",
"CruxNode",
"ComputeNode",
]
46 changes: 46 additions & 0 deletions balsam/platform/compute_node/alcf_crux_node.py
@@ -0,0 +1,46 @@
import logging
import os
from typing import List, Optional, Union

from .compute_node import ComputeNode

logger = logging.getLogger(__name__)
IntStr = Union[int, str]


class CruxNode(ComputeNode):
cpu_ids = list(range(128)) # Crux has 128 CPU cores
# No need to define gpu_ids; it will default to [] from ComputeNode

@classmethod
def get_job_nodelist(cls) -> List["CruxNode"]:
"""
Get all compute nodes allocated in the current job context.
"""
nodefile = os.environ.get("PBS_NODEFILE")
if not nodefile or not os.path.exists(nodefile):
logger.error("PBS_NODEFILE environment variable is not set or file does not exist.")
return []

# Read hostnames from the nodefile
with open(nodefile) as fp:
hostnames = [line.strip() for line in fp if line.strip()]

node_ids: Union[List[str], List[int]] = hostnames[:]
node_list = []
for nid, hostname in zip(node_ids, hostnames):
# Since Crux does not have GPUs, no need to pass gpu_ids
node_list.append(cls(nid, hostname))
return node_list

@staticmethod
def get_scheduler_id() -> Optional[int]:
job_id_str = os.environ.get("PBS_JOBID")
if job_id_str is not None:
try:
return int(job_id_str.split(".")[0])
except ValueError:
logger.error(f"Unable to parse PBS_JOBID: {job_id_str}")
return None
return None
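One quick way to exercise get_job_nodelist outside a real PBS job is to point PBS_NODEFILE at a temporary file; a minimal sketch follows, where the hostnames are made up and printing them assumes ComputeNode stores each hostname on a hostname attribute.

import os
import tempfile

from balsam.platform.compute_node import CruxNode

# Write a fake PBS nodefile with made-up hostnames.
with tempfile.NamedTemporaryFile("w", suffix=".nodefile", delete=False) as fp:
    fp.write("crux-node-01\ncrux-node-02\n")
    nodefile_path = fp.name

os.environ["PBS_NODEFILE"] = nodefile_path
nodes = CruxNode.get_job_nodelist()
print([n.hostname for n in nodes])  # expect ['crux-node-01', 'crux-node-02']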

57 changes: 57 additions & 0 deletions balsam/platform/compute_node/alcf_sophia_node.py
@@ -0,0 +1,57 @@
import logging
import os
from pathlib import Path
from typing import List, Optional, Union

from .compute_node import ComputeNode

logger = logging.getLogger(__name__)
IntStr = Union[int, str]


class SophiaNode(ComputeNode):
    # Sophia (NVIDIA DGX A100) nodes: 2x 64-core AMD EPYC 7742 CPUs and 8 A100 GPUs per node
    cpu_ids = list(range(128))  # physical cores only; hyperthreads are not enumerated here
    gpu_ids: List[IntStr] = list(range(8))

@classmethod
def get_job_nodelist(cls) -> List["SophiaNode"]:
"""
Get all compute nodes allocated in the current job context on Sophia
"""
nodefile = os.environ.get("PBS_NODEFILE")
if not nodefile:
raise EnvironmentError("PBS_NODEFILE environment variable is not set.")
with open(nodefile) as fp:
data = fp.read()
splitter = "," if "," in data else None
hostnames = data.split(splitter)
hostnames = [h.strip() for h in hostnames if h.strip()]
node_ids: Union[List[str], List[int]]
node_ids = hostnames[:]
node_list = []
for nid, hostname in zip(node_ids, hostnames):
gpu_ids = cls.discover_gpu_list(hostname)
assert isinstance(nid, str) or isinstance(nid, int)
node_list.append(cls(nid, hostname, gpu_ids=gpu_ids))
return node_list

@classmethod
def discover_gpu_list(cls, hostname: str) -> List[IntStr]:
gpu_file = Path(f"/var/tmp/balsam-{hostname}-gpulist.txt")
gpu_ids: List[IntStr]
if gpu_file.is_file():
tokens = gpu_file.read_text().split()
gpu_ids = [t[:-1] for t in tokens if t.startswith("MIG-GPU-")]
else:
gpu_ids = cls.gpu_ids
logger.info(f"{hostname} detected GPU IDs: {gpu_ids}")
return gpu_ids

@staticmethod
def get_scheduler_id() -> Optional[int]:
job_id = os.environ.get("PBS_JOBID")
if job_id is not None:
# PBS_JOBID might include a ".hostname" suffix; strip it off
return int(job_id.split('.')[0])
return None
9 changes: 9 additions & 0 deletions how to add Sophia to Balsam default configs.md
@@ -0,0 +1,9 @@
To add Sophia to the Balsam default configs, the following files have to be added:

- add `sophia.py` in `platform/app_run` and update the `__init__.py` in that folder
- add `alcf_sophia_node.py` in `platform/compute_node` and update the `__init__.py` in that folder
- add an `alcf_sophia` folder, containing the appropriate `job-template.sh` and `settings.yml` files, to `config/defaults`

The files are in my forked repo (fbhuiyan2-patch-1). Once these adjustments are made, Balsam will show Sophia as an option when creating a new site (a quick layout check is sketched below).

I have used the Sophia configuration to run VASP, LAMMPS, and Python jobs. Jobs execute properly, and node packing also works as expected.
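A minimal sketch of such a layout check, run from the repository root (illustrative only, not part of Balsam):

from pathlib import Path

# Confirm that every default site config ships both required files.
defaults = Path("balsam/config/defaults")
for site_dir in sorted(p for p in defaults.iterdir() if p.is_dir()):
    missing = [f for f in ("job-template.sh", "settings.yml") if not (site_dir / f).exists()]
    print(f"{site_dir.name}: {'ok' if not missing else 'missing ' + ', '.join(missing)}")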