Skip to content

Commit

Permalink
Merge pull request #4692 from jedwards4b/add_mem_variables
Browse files Browse the repository at this point in the history
Add mem variables
  • Loading branch information
jedwards4b authored Oct 14, 2024
2 parents 2df4aa2 + f0b5c09 commit caeb01a
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 2 deletions.
27 changes: 25 additions & 2 deletions CIME/XML/env_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,12 +223,35 @@ def get_job_overrides(self, job, case):
overrides["tasks_per_node"] = tasks_per_node
if thread_count:
overrides["thread_count"] = thread_count
total_tasks = total_tasks * thread_count
else:
total_tasks = total_tasks * case.thread_count
else:
total_tasks = case.get_value("TOTALPES") * int(case.thread_count)
# Total PES accounts for threads as well as mpi tasks
total_tasks = case.get_value("TOTALPES")
thread_count = case.thread_count
if int(total_tasks) * int(thread_count) < case.get_value("MAX_TASKS_PER_NODE"):
if int(total_tasks) < case.get_value("MAX_TASKS_PER_NODE"):
overrides["max_tasks_per_node"] = int(total_tasks)

# When this was developed, the variable was only needed on derecho, but it is written
# to be general enough for other systems: define MEM_PER_TASK and MAX_MEM_PER_NODE in config_machines.xml
# and add {{ mem_per_node }} to config_batch.xml.
try:
mem_per_task = case.get_value("MEM_PER_TASK")
max_mem_per_node = case.get_value("MAX_MEM_PER_NODE")
mem_per_node = total_tasks

if mem_per_node < mem_per_task:
mem_per_node = mem_per_task
elif mem_per_node > max_mem_per_node:
mem_per_node = max_mem_per_node
overrides["mem_per_node"] = mem_per_node
except TypeError:
# ignore this, the variables are not defined for this machine
pass
except Exception as error:
print("An exception occured:", error)

overrides["ngpus_per_node"] = ngpus_per_node
overrides["mpirun"] = case.get_mpirun_cmd(job=job, overrides=overrides)
return overrides
Expand Down
9 changes: 9 additions & 0 deletions CIME/data/config/xml_schemas/config_machines.xsd
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@
<xs:element name="ALLOCATE_SPARE_NODES" type="upperBoolean"/>
<xs:element name="SUPPORTED_BY" type="xs:string"/>
<xs:element name="MAX_TASKS_PER_NODE" type="AttrElement"/>
<xs:element name="MEM_PER_TASK" type="AttrElement"/>
<xs:element name="MAX_MEM_PER_NODE" type="AttrElement"/>
<xs:element name="MAX_GPUS_PER_NODE" type="AttrElement"/>
<xs:element name="MAX_MPITASKS_PER_NODE" type="AttrElement"/>
<xs:element name="MAX_CPUTASKS_PER_GPU_NODE" type="AttrElement"/>
Expand Down Expand Up @@ -164,6 +166,13 @@
<!-- MAX_TASKS_PER_NODE: maximum number of threads*tasks per
shared memory node on this machine-->
<xs:element ref="MAX_TASKS_PER_NODE" minOccurs="1" maxOccurs="unbounded"/>

<!-- MEM_PER_TASK: the minimum memory to assign per mpi task (units assigned in config_batch.xml) -->
<xs:element ref="MEM_PER_TASK" minOccurs="0" maxOccurs="1"/>
<!-- MAX_MEM_PER_NODE: the maximum memory to assign per machine node -->
<xs:element ref="MAX_MEM_PER_NODE" minOccurs="0" maxOccurs="1"/>


<!-- MAX_GPUS_PER_NODE: maximum number of GPUs per node on this machine-->
<xs:element ref="MAX_GPUS_PER_NODE" minOccurs="0" maxOccurs="1"/>
<!-- MAX_MPITASKS_PER_NODE: number of physical PES per shared node on
Expand Down
8 changes: 8 additions & 0 deletions CIME/data/config/xml_schemas/config_machines_version3.xsd
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@
<xs:element name="ALLOCATE_SPARE_NODES" type="upperBoolean"/>
<xs:element name="SUPPORTED_BY" type="xs:string"/>
<xs:element name="MAX_TASKS_PER_NODE" type="AttrElement"/>
<xs:element name="MEM_PER_TASK" type="AttrElement"/>
<xs:element name="MAX_MEM_PER_NODE" type="AttrElement"/>
<xs:element name="MAX_GPUS_PER_NODE" type="AttrElement"/>
<xs:element name="MAX_MPITASKS_PER_NODE" type="AttrElement"/>
<xs:element name="MAX_CPUTASKS_PER_GPU_NODE" type="AttrElement"/>
Expand Down Expand Up @@ -181,6 +183,12 @@
<!-- MAX_TASKS_PER_NODE: maximum number of threads*tasks per
shared memory node on this machine-->
<xs:element ref="MAX_TASKS_PER_NODE" minOccurs="1" maxOccurs="unbounded"/>

<!-- MEM_PER_TASK: the minimum memory to assign per mpi task (units assigned in config_batch.xml) -->
<xs:element ref="MEM_PER_TASK" minOccurs="0" maxOccurs="1"/>
<!-- MAX_MEM_PER_NODE: the maximum memory to assign per machine node -->
<xs:element ref="MAX_MEM_PER_NODE" minOccurs="0" maxOccurs="1"/>

<!-- MAX_GPUS_PER_NODE: maximum number of GPUs per node on this machine-->
<xs:element ref="MAX_GPUS_PER_NODE" minOccurs="0" maxOccurs="1"/>
<!-- MAX_MPITASKS_PER_NODE: number of physical PES per shared node on
Expand Down

0 comments on commit caeb01a

Please sign in to comment.