Skip to content

Commit

Permalink
feat: cpu-function altered to support cpus-per-gpu, too (#28)
Browse files Browse the repository at this point in the history
This PR works in combination with
snakemake/snakemake-executor-plugin-slurm#173,
only. It changes the function that determines the CPU settings.

- it is possible to omit CPU settings upon submission, now. Required,
because apparently some clusters do not allow this for GPU jobs (which
is crazy, but the way we do it now should not break workflows)
- it is possible now to require CPUs using `--cpus-per-gpu`, too. (Only
for GPU jobs, of course.)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- **New Features**
  - Enhanced SLURM executor plugin with improved GPU job support
  - Added more flexible CPU and GPU resource allocation handling

- **Bug Fixes**
  - Improved resource specification logic for different cluster
configurations
  - Better handling of CPU allocation for GPU and non-GPU jobs

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
  • Loading branch information
cmeesters authored Feb 15, 2025
1 parent 13b72f5 commit 30fecc3
Showing 1 changed file with 34 additions and 13 deletions.
47 changes: 34 additions & 13 deletions snakemake_executor_plugin_slurm_jobstep/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ def __post_init__(self):
# These environment variables are set by SLURM.
# only needed for commented out jobstep handling below
self.jobid = os.getenv("SLURM_JOB_ID")
# we consider this job to be a GPU job, if a GPU has been reserved
self.gpu_job = os.getenv("SLURM_GPUS")

def run_job(self, job: JobExecutorInterface):
# Implement here how to run a job.
Expand Down Expand Up @@ -92,16 +94,12 @@ def run_job(self, job: JobExecutorInterface):
# # now: the last one
# # this way, we ensure that level jobs depending on the current level
# # get started
# jobsteps[level_list[-1]] = subprocess.Popen(
# get_call(level_list[-1], aux="--dependency=singleton"), shell=True
# )

if "mpi" in job.resources.keys():
# MPI job:
# No need to prepend `srun`, as this will happen inside of the job's shell
# command or script (!).
# The following call invokes snakemake, which in turn takes care of all
# auxilliary work around the actual command
# auxiliary work around the actual command
# like remote file support, benchmark setup, error handling, etc.
# AND there can be stuff around the srun call within the job, like any
# commands which should be executed before.
Expand All @@ -119,8 +117,8 @@ def run_job(self, job: JobExecutorInterface):
# has set the resources correctly.

call = "srun -n1 --cpu-bind=q "
call += f"--cpus-per-task {get_cpus_per_task(job)} "
call += f"{self.format_job_exec(job)}"
call += f" {get_cpu_setting(job, self.gpu_job)} "
call += f" {self.format_job_exec(job)}"

self.logger.debug(f"This job is a group job: {job.is_group()}")
self.logger.debug(f"The call for this job is: {call}")
Expand Down Expand Up @@ -155,14 +153,37 @@ def get_exec_mode(self) -> ExecMode:
return ExecMode.REMOTE


def get_cpu_setting(job: JobExecutorInterface, gpu: bool) -> str:
    """Return the srun CPU allocation flag for *job*, or "" to omit it.

    By default, Snakemake's ``threads`` value is used for the CPU request.
    An explicit ``cpus_per_task`` resource, or — for GPU jobs only — an
    explicit ``cpus_per_gpu`` resource, takes precedence. The two resources
    are mutually exclusive; ``cpus_per_task`` wins if both are set.

    Some clusters do not allow CPU settings at all (e.g. in GPU
    partitions); setting ``cpus_per_task`` to a negative value (or
    ``cpus_per_gpu`` to a non-positive value) therefore yields an empty
    string, so no CPU flag is passed to srun.

    Args:
        job: the job whose resources are inspected.
        gpu: truthy when this is a GPU job (a GPU has been reserved).

    Raises:
        WorkflowError: if an explicit resource value is not an integer.
    """
    # Fallback: Snakemake's thread count serves for either flag variant.
    cpus_per_task = cpus_per_gpu = job.threads
    if job.resources.get("cpus_per_task"):
        cpus_per_task = job.resources.cpus_per_task
        if not isinstance(cpus_per_task, int):
            raise WorkflowError(
                f"cpus_per_task must be an integer, but is {cpus_per_task}"
            )
        # If explicitly set to < 0, omit the flag entirely — some clusters
        # do not allow CPU settings (e.g. in GPU partitions).
        if cpus_per_task < 0:
            return ""
        # SLURM does not accept 0, so request at least 1 CPU.
        cpus_per_task = max(1, cpus_per_task)
        return f"--cpus-per-task={cpus_per_task}"
    elif gpu and job.resources.get("cpus_per_gpu"):
        cpus_per_gpu = job.resources.cpus_per_gpu
        if not isinstance(cpus_per_gpu, int):
            raise WorkflowError(
                f"cpus_per_gpu must be an integer, but is {cpus_per_gpu}"
            )
        # If explicitly set to <= 0, omit the flag — some clusters do not
        # allow CPU settings, and SLURM does not accept 0 here.
        if cpus_per_gpu <= 0:
            return ""
        return f"--cpus-per-gpu={cpus_per_gpu}"
    else:
        return f"--cpus-per-task={cpus_per_task}"

0 comments on commit 30fecc3

Please sign in to comment.