From e3f297839ba3b2cf6b6ad5c8ed5c7030818af4ac Mon Sep 17 00:00:00 2001 From: Justin Hiemstra Date: Wed, 4 Sep 2024 12:30:27 -0500 Subject: [PATCH] Add wrapper script for running long Snakemake workflows with HTCondor To avoid tying Snakemake execution to the current terminal (which is bad if you ever want to log out of the AP or let your computer fall asleep), the new script `snakemake_long.py` wraps Snakemake execution in an HTCondor local universe job. This job is submitted like a regular HTCondor job, but it runs on the AP in the context of the submit directory. Usage: snakemake_long.py --snakefile /path/to/Snakefile (OPTIONAL) --profile /path/to/profile (REQUIRED) --htcondor-jobdir /path/to/logging/directory (OPTIONAL) The only CLI option that might feel new here is `htcondor-jobdir`. This is actually the same CLI used by the HTCondor executor, and it specifies a directory in which logs are placed. I chose to keep names the same with this script so it feels more familiar to Snakemake users. --- docker-wrappers/SPRAS/README.md | 19 ++- docker-wrappers/SPRAS/example_config.yaml | 2 + docker-wrappers/SPRAS/snakemake_long.py | 134 ++++++++++++++++++ .../SPRAS/spras_profile/config.yaml | 3 +- 4 files changed, 155 insertions(+), 3 deletions(-) create mode 100755 docker-wrappers/SPRAS/snakemake_long.py diff --git a/docker-wrappers/SPRAS/README.md b/docker-wrappers/SPRAS/README.md index a163e2b8..ef0ffb87 100644 --- a/docker-wrappers/SPRAS/README.md +++ b/docker-wrappers/SPRAS/README.md @@ -68,6 +68,8 @@ git clone https://github.com/Reed-CompBio/spras.git **Note:** To work with SPRAS in HTCondor, it is recommended that you build an Apptainer image instead of using Docker. See [Converting Docker Images to Apptainer/Singularity Images](#converting-docker-images-to-apptainersingularity-images) for instructions. Importantly, the Apptainer image must be built for the linux/amd64 architecture. Most HTCondor APs will have `apptainer` installed, but they may not have `docker`. 
If this is the case, you can build the image with Docker on your local machine, push the image to Docker Hub, and then convert it to Apptainer's `sif` format on the AP. +**Note:** It is best practice to make sure that the Snakefile you copy for your workflow is the same version as the Snakefile baked into your workflow's container image. When this workflow runs, the Snakefile you just copied will be used during remote execution instead of the Snakefile from the container. As a result, difficult-to-diagnose versioning issues may occur if the version of SPRAS in the remote container doesn't support the Snakefile on your current branch. The safest bet is always to create your own image so you always know what's inside of it. + There are currently two options for running SPRAS with HTCondor. The first is to submit all SPRAS jobs to a single remote Execution Point (EP). The second is to use the Snakemake HTCondor executor to parallelize the workflow by submitting each job to its own EP. @@ -104,13 +106,26 @@ cp ../../Snakefile . && \ cp -r ../../input . ``` -**Note:** It is best practice to make sure that the Snakefile you copy for your workflow is the same version as the Snakefile baked into your workflow's container image. When this workflow runs, the Snakefile you just copied will be used during remote execution instead of the Snakefile from the container. As a result, difficult-to-diagnose versioning issues may occur if the version of SPRAS in the remote container doesn't support the Snakefile on your current branch. The safest bet is always to create your own image so you always know what's inside of it. +Instead of editing `spras.sub` to define the workflow, this scenario requires editing the SPRAS profile in `spras_profile/config.yaml`. Make sure you specify the correct container, and change any other config values needed by your workflow (defaults are fine in most cases). 
-To start the workflow with HTCondor in the CHTC pool, run: +Then, to start the workflow with HTCondor in the CHTC pool, there are two options: + +#### Snakemake From Your Own Terminal +The first option is to run Snakemake in a way that ties its execution to your terminal. This is good for testing short workflows and running short jobs. The downside is that closing your terminal causes the process to exit, removing any unfinished jobs. To use this option, invoke Snakemake directly by running: ```bash snakemake --profile spras_profile ``` +#### Long Running Snakemake Jobs (Managed by HTCondor) +The second option is to let HTCondor manage the Snakemake process, which allows the jobs to run as long as needed. Instead of seeing Snakemake output directly in your terminal, you'll be able to see it in a specified log file. To use this option, make sure `snakemake_long.py` is executable (you can run `chmod +x snakemake_long.py` from the AP to make sure it is), and then run: +``` +./snakemake_long.py --profile spras_profile --htcondor-jobdir /path/to/logging/directory +``` + +When run in this mode, all log files for the workflow will be placed into the path you provided for the logging directory. In particular, Snakemake's outputs with job progress can be found split between `/path/to/logging/directory/snakemake-long.err` and `/path/to/logging/directory/snakemake-long.out`. + +### Adjusting Resources + Resource requirements can be adjusted as needed in `spras_profile/config.yaml`, and HTCondor logs for this workflow can be found in `.snakemake/htcondor`. You can set a different log directory by adding `htcondor-jobdir: /path/to/dir` to the profile's configuration. 
diff --git a/docker-wrappers/SPRAS/example_config.yaml b/docker-wrappers/SPRAS/example_config.yaml
index 8b9c1edb..3b6264b1 100644
--- a/docker-wrappers/SPRAS/example_config.yaml
+++ b/docker-wrappers/SPRAS/example_config.yaml
@@ -149,3 +149,5 @@ analysis:
     linkage: 'ward'
     # 'euclidean', 'manhattan', 'cosine'
     metric: 'euclidean'
+  evaluation:
+    include: false
diff --git a/docker-wrappers/SPRAS/snakemake_long.py b/docker-wrappers/SPRAS/snakemake_long.py
new file mode 100755
index 00000000..787f30c8
--- /dev/null
+++ b/docker-wrappers/SPRAS/snakemake_long.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+
+"""
+A wrapper script that allows long-term Snakemake workflows to run on HTCondor. This works
+by submitting a local universe job responsible for overseeing the terminal session that
+runs the actual snakemake executable.
+"""
+
+import argparse
+import os
+import pathlib
+import shlex
+import subprocess
+import sys
+import time
+
+import htcondor
+
+
+def parseArgs(isLocal=False):
+    """
+    Parse various arguments for the script. Note that this script has two "modes" of operation which
+    need different arguments. The "top" mode is for submitting the HTCondor wrapper, and the "long" mode
+    (selected with isLocal=True) is for running the Snakemake command itself.
+    """
+    parser = argparse.ArgumentParser(description="A tool for long-running Snakemake jobs with HTCondor.")
+    # We add a special command to trigger allowing this script to execute the long-running Snakemake command.
+    if isLocal:
+        parser.add_argument("command", help="Helper command to run", choices=["long"])
+    parser.add_argument("--snakefile", help="The Snakefile to run.", required=False)
+    parser.add_argument("--profile", help="A path to a directory containing the desired Snakemake profile.", required=True)
+    # I'd love to change this to "logdir", but using the same name as Snakemake for consistency of feeling between this script
+    # and Snakemake proper.
+    parser.add_argument("--htcondor-jobdir", help="The directory Snakemake will write logs to.", required=False)
+    return parser.parse_args()
+
+
+def _condorQuote(arg):
+    """
+    Quote one argument for HTCondor's "new syntax" arguments string: wrap it in single quotes so that
+    embedded whitespace survives, and double any literal single or double quote characters.
+    """
+    return "'" + str(arg).replace("'", "''").replace('"', '""') + "'"
+
+
+def submitLocal(snakefile, profile, htcondor_jobdir):
+    """
+    Given a Snakefile, profile, and HTCondor job directory, submit a local universe job that runs
+    Snakemake from the context of the submission directory.
+    """
+    # Get the location of this script, which also serves as the executable for the condor job.
+    script_location = pathlib.Path(__file__).resolve()
+
+    # Quote each path so that whitespace inside it cannot split the job's argument list.
+    job_args = " ".join([
+        "long",
+        "--snakefile", _condorQuote(snakefile),
+        "--profile", _condorQuote(profile),
+        "--htcondor-jobdir", _condorQuote(htcondor_jobdir),
+    ])
+
+    submit_description = htcondor.Submit({
+        "executable": str(script_location),
+        # The enclosing double quotes select HTCondor's "new syntax" for arguments.
+        "arguments": f'"{job_args}"',
+        "universe": "local",
+        "request_disk": "512MB",
+        "request_cpus": 1,
+        "request_memory": 512,
+
+        # Set up logging
+        "log": f"{htcondor_jobdir}/snakemake-long.log",
+        "output": f"{htcondor_jobdir}/snakemake-long.out",
+        "error": f"{htcondor_jobdir}/snakemake-long.err",
+
+        # Specify `getenv` so that our script uses the appropriate environment
+        # when it runs in local universe. This allows the job to access
+        # modules we've installed in the submission environment (notably spras).
+        "getenv": "true",
+
+        "JobBatchName": f"spras-long-{time.strftime('%Y%m%d-%H%M%S')}",
+    })
+
+    schedd = htcondor.Schedd()
+    submit_result = schedd.submit(submit_description)
+
+    print(f"Snakemake management job was submitted with JobID {submit_result.cluster()}.0. "
+          f"Logs can be found in {htcondor_jobdir}")
+
+
+def topMain():
+    """
+    The top level function for the script that handles file creation/validation and triggers submission
+    of the wrapper job. Returns 0 once the job is submitted and 1 if validation fails.
+    """
+    args = parseArgs()
+
+    # If no Snakefile is provided, assume it's in the current directory. Either way, it has to exist.
+    if args.snakefile is None:
+        args.snakefile = pathlib.Path.cwd() / "Snakefile"
+    if not os.path.exists(args.snakefile):
+        print(f"Error: The Snakefile {args.snakefile} does not exist.", file=sys.stderr)
+        return 1
+
+    # Make sure the profile directory exists. It's harder to check if it's a valid profile at this level
+    # so that will be left to Snakemake.
+    if not os.path.exists(args.profile):
+        print(f"Error: The profile directory {args.profile} does not exist.", file=sys.stderr)
+        return 1
+
+    # Make sure we have a value for the log directory and that the directory exists.
+    if args.htcondor_jobdir is None:
+        args.htcondor_jobdir = pathlib.Path.cwd() / "snakemake-long-logs"
+    os.makedirs(args.htcondor_jobdir, exist_ok=True)
+
+    submitLocal(args.snakefile, args.profile, args.htcondor_jobdir)
+    return 0
+
+
+def longMain():
+    """
+    Run Snakemake itself. This is the "long" mode that the local universe job executes. Returns 0 if
+    the Snakemake command succeeds and 1 otherwise.
+    """
+    args = parseArgs(True)
+
+    # Build the Snakemake invocation, shell-quoting every value since the command is run by bash.
+    snakemake_args = ["snakemake"]
+    if args.snakefile is not None:
+        snakemake_args += ["-s", args.snakefile]
+    snakemake_args += ["--profile", args.profile]
+    if args.htcondor_jobdir is not None:
+        snakemake_args += ["--htcondor-jobdir", args.htcondor_jobdir]
+    snakemake_command = " ".join(shlex.quote(arg) for arg in snakemake_args)
+
+    # Command to activate conda environment and run Snakemake. Note that we need to unset APPTAINER_CACHEDIR
+    # in this case but not in the local terminal case because the wrapper HTCondor job has a different environment
+    # and populating this value causes Snakemake to fail when it tries to write to spool (a read-only filesystem from
+    # the perspective of the EP job).
+    command = (
+        'source "$(conda info --base)/etc/profile.d/conda.sh" && '
+        "conda activate spras && "
+        "unset APPTAINER_CACHEDIR && "
+        f"{snakemake_command}"
+    )
+
+    # Run the command in a single shell session
+    result = subprocess.run(command, shell=True, executable='/bin/bash')
+
+    # Return 0 for success and 1 for failure
+    return 0 if result.returncode == 0 else 1
+
+
+def main():
+    """
+    Run the "long" mode when it is requested as the first CLI argument, and the "top" mode otherwise.
+    """
+    if len(sys.argv) > 1 and sys.argv[1] == "long":
+        return longMain()
+
+    return topMain()
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/docker-wrappers/SPRAS/spras_profile/config.yaml b/docker-wrappers/SPRAS/spras_profile/config.yaml
index 0cfb2bba..04aca57d 100644
--- a/docker-wrappers/SPRAS/spras_profile/config.yaml
+++ b/docker-wrappers/SPRAS/spras_profile/config.yaml
@@ -15,7 +15,8 @@ shared-fs-usage: none
 default-resources:
   job_wrapper: "spras.sh"
   # If running in CHTC, this only works with apptainer images
-  container_image: "spras.sif"
+  # Note requirement for quotes around the image name
+  container_image: "'spras-v0.2.0.sif'"
   universe: "container"
   # The value for request_disk should be large enough to accommodate the runtime container
   # image, any additional PRM container images, and your input data.