Commit

use a generic way to apply the MPI wrapper script
sjsprecious committed Aug 8, 2023
1 parent de5476a commit f6c42fd
Showing 5 changed files with 8 additions and 47 deletions.
17 changes: 4 additions & 13 deletions CIME/case/case.py
@@ -2106,19 +2106,10 @@ def get_mpirun_cmd(self, job=None, allow_unresolved_envvars=True, overrides=None
             mpi_arg_string += " : "
 
         ngpus_per_node = self.get_value("NGPUS_PER_NODE")
-        if ngpus_per_node and ngpus_per_node > 0 and config.gpus_use_set_device_rank:
-            if self.get_value("MACH") == "gust" or self.get_value("MACH") == "derecho":
-                mpi_arg_string = mpi_arg_string + " get_local_rank "
-            else:
-                # this wrapper script only works with OpenMPI library
-                # has been tested on Casper
-                expect(
-                    self.get_value("MPILIB") == "openmpi",
-                    "The wrapper script only works with OpenMPI library; {} is currently used".format(self.get_value("MPILIB")),
-                )
-                rundir = self.get_value("RUNDIR")
-                output_name = rundir + "/set_device_rank.sh"
-                mpi_arg_string = mpi_arg_string + " " + output_name + " "
+        if ngpus_per_node and ngpus_per_node > 0:
+            mpi_gpu_run_script = self.get_value("MPI_GPU_WRAPPER_SCRIPT")
+            if mpi_gpu_run_script:
+                mpi_arg_string = mpi_arg_string + " " + mpi_gpu_run_script
 
         return self.get_resolved_value(
             "{} {} {} {}".format(
28 changes: 0 additions & 28 deletions CIME/case/case_setup.py
@@ -482,31 +482,3 @@ def case_setup(self, clean=False, test_mode=False, reset=False, keep=None):
         caseroot=caseroot,
         is_batch=is_batch,
     )
-
-    # put the following section here to make sure the rundir is generated first
-    machdir = self.get_value("MACHDIR")
-    mach = self.get_value("MACH")
-    ngpus_per_node = self.get_value("NGPUS_PER_NODE")
-    overrides = {}
-    overrides["ngpus_per_node"] = ngpus_per_node
-    input_template = os.path.join(machdir, "mpi_run_gpu.{}".format(mach))
-    if os.path.isfile(input_template):
-        # update the wrapper script that sets the device id for each MPI rank
-        output_text = transform_vars(
-            open(input_template, "r").read(), case=self, overrides=overrides
-        )
-
-        # write it out to the run dir
-        rundir = self.get_value("RUNDIR")
-        output_name = os.path.join(rundir, "set_device_rank.sh")
-        logger.info("Creating file {}".format(output_name))
-        with open(output_name, "w") as f:
-            f.write(output_text)
-
-        # make the wrapper script executable
-        if os.path.isfile(output_name):
-            os.system("chmod +x " + output_name)
-        else:
-            expect(
-                False, "The file {} is not written out correctly.".format(output_name)
-            )
5 changes: 0 additions & 5 deletions CIME/config.py
@@ -177,11 +177,6 @@ def __init__(self):
             False,
             desc="If set to `True` then COMP_ROOT_DIR_CPL is set using UFS_DRIVER if defined.",
         )
-        self._set_attribute(
-            "gpus_use_set_device_rank",
-            True,
-            desc="If set to `True` and NGPUS_PER_NODE > 0 then `$RUNDIR/set_device_rank.sh` or `get_local_rank` (a global script on Derecho/Gust) is appended when the MPI run command is generated.",
-        )
         self._set_attribute(
             "test_custom_project_machine",
             "melvin",
4 changes: 4 additions & 0 deletions CIME/data/config/xml_schemas/config_machines.xsd
@@ -61,6 +61,7 @@
<xs:element name="MAX_CPUTASKS_PER_GPU_NODE" type="AttrElement"/>
<xs:element name="GPU_TYPE" type="AttrElement"/>
<xs:element name="GPU_OFFLOAD" type="AttrElement"/>
<xs:element name="MPI_GPU_WRAPPER_SCRIPT" type="AttrElement"/>
<xs:element name="COSTPES_PER_NODE" type="xs:integer"/>
<xs:element name="PROJECT_REQUIRED" type="xs:NCName"/>
<xs:element name="executable" type="xs:string"/>
@@ -178,6 +179,9 @@
<xs:element ref="GPU_TYPE" minOccurs="0" maxOccurs="unbounded"/>
<!-- GPU_OFFLOAD: the GPU programming model used for GPU porting -->
<xs:element ref="GPU_OFFLOAD" minOccurs="0" maxOccurs="unbounded"/>
<!-- MPI_GPU_WRAPPER_SCRIPT: a wrapper script that will be attached to the MPI run
command and map different MPI ranks to different GPUs within the same node -->
<xs:element ref="MPI_GPU_WRAPPER_SCRIPT" minOccurs="0" maxOccurs="1"/>
<!-- Optional cost factor per node unit -->
<xs:element ref="COSTPES_PER_NODE" minOccurs="0" maxOccurs="1"/>
<!-- PROJECT_REQUIRED: Does this machine require a project to be specified to
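For illustration only (not part of this commit's diff): with the new schema element, a site can point CIME at its own rank-to-GPU binding script directly from config_machines.xml, rather than relying on the removed per-case set_device_rank.sh. The machine name, GPU settings, and script path below are hypothetical placeholders.

  <machine MACH="my_gpu_machine">
    <!-- other machine settings omitted -->
    <GPU_TYPE>a100</GPU_TYPE>
    <GPU_OFFLOAD>openacc</GPU_OFFLOAD>
    <!-- new element added by this commit; the path here is a made-up example of a
         site-provided script that maps each MPI rank to a GPU on its node -->
    <MPI_GPU_WRAPPER_SCRIPT>/opt/site/bin/set_gpu_rank.sh</MPI_GPU_WRAPPER_SCRIPT>
  </machine>

When NGPUS_PER_NODE is greater than zero, get_mpirun_cmd appends this path to the generated MPI run command, as shown in the case.py change above.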
1 change: 0 additions & 1 deletion doc/source/users_guide/cime-customize.rst
@@ -44,7 +44,6 @@ default_short_term_archiving True bool If set to `Tr
 driver_choices ('mct', 'nuopc') tuple Sets the available driver choices for the model.
 driver_default nuopc str Sets the default driver for the model.
 enable_smp True bool If set to `True` then `SMP=` is added to model compile command.
-gpus_use_set_device_rank True bool If set to `True` and NGPUS_PER_NODE > 0 then `$RUNDIR/set_device_rank.sh` or `get_local_rank` (a global script on Derecho/Gust) is appended when the MPI run command is generated.
 make_case_run_batch_script False bool If set to `True` and case is not a test then `case.run.sh` is created in case directory from `$MACHDIR/template.case.run.sh`.
 mct_path {srcroot}/libraries/mct str Sets the path to the mct library.
 serialize_sharedlib_builds True bool If set to `True` then the TestScheduler will use `proc_pool + 1` processors to build shared libraries otherwise a single processor is used.
