Skip to content

Commit

Permalink
🔧 slurm batch system improvements (#62)
Browse files Browse the repository at this point in the history
* 🔧 change bash env in slurm

* 🔧 update slurm submission

* 🔧 update file mode

* 🔧 fix slurm efficiency printout job

* ✅ update GH travis token
  • Loading branch information
juanesarango authored Jul 9, 2024
1 parent 2418b80 commit 5507962
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 12 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ env:
- secure: fkbh052mzzgDi7Bf6m/5kkSlypyLDkTYyNvRJH3O/jjVKB3b4zX1WDXnRL+/8MJEYm240g2JGCu/TdAph5t2+R9I1LmELBIjP/d/uMqKFCjA0TLSmLgxKZUg5c7cT2dFQh5z6pgcJPAgPsFrWITkTXqWmKfreA27icDqQdiFi1FksEAkrfeT3Y+WnkR0XNYzQrWnemS1QWh8wti/vGUO+znLIBw5mP8C3Da1KJiL/Uo6+zb11ddmsCJw3a27EqtfwtcGdUco59av0gVLLNwyc2DQYiOQpCRlf+z44JC0n4buq9H1FuwIpuWRGbqLn0HupfXpt9yfV79y2MfCqibc9MQFeoC50fqzTbyQGtz8cROsQazZ3N3cmEBf00uPmxCJffQedwnn1z9QmrZhFHnJGZc1crYUFOvaAqpmg78BE6xv8j7wg51oqx7eVRjJJY/EGMCQoaSZoH0FGfLyADEp/eKpbIqphxCoyPDaIuLFBEonXMaQqgyAWw94aWmv1lB6V/gGJpnBG6zkczEPup5+1LF6UTrmogVeoGgMd4+g9pWp9OUJQAnAQ/m+PG7LEmmEnHxFFshZ+EOmmFYIgWX9BqXd+ajr7DjO0zrcAF5Zmm1Ja7xVA595l+X3WvHjuNzkyFLEDZrn3mLwGeKxk+4xbVa50dGKCi4NICXOAxbv/bA=
- secure: hgGml3e2YgUkgM76iQaDkeY5k/rk7+lr1beqYkwmfqn3SmNShrJ5Q3PQpqiRi+FkYVbNgbsWMP49/MzTn6K26S9JBG0cXrnHbvJvKOYyomE/jgumwyMj5zNjqHjeDfozYRWvR6K3DtC9eafaP4NTs/EQu3gsVFWwepFkltJXNsfP3hMao/C1dy/ObxVlMTkr7RsXTYaCmSKs53Vf3TDwoP5hh2hWbgCudKk0C8hJ2znSz+sQ+3HcRklQWaG5qPnBXPGjE/gTTBV2aL9ZjNsHK1B9n4+zskFYvSZt9OAC4LfERHGm6FstEJgof3+HeOuQanxryUsb9ZPtsWv85T5vjVu27PNYmXful7qVD4iWyhsV6G6reU3YZcu/DIAnVsQAyqhdZwCq4KIh3kFJFPiSi/3E7uMBhUont5+aV+FihlK3ENNc8NdjMe0C4SEhMDZ5i470CienAIUGrAdhl4HDmoWTzh0qTuGhGJwoMTOzHAHTO8enTFbFjmSt/B2zHAkYBfI0OBX9bsOu6JUKmMWm86GItdpQfSY8mlR7wa06Gqz0/sE7GoF78tb9wgUv5/yxIOinmhDHlwHFkCr+W98q9UZsWepEMjHPe6el0puegvdc5NYnc7T7+Fu2B66yMCJbmDbcB2Hn6JWKpTk7l/pM3tFco01Go3wsKebdCIj5ASk=
- secure: dXbqdNJZzG06SsWBHTAXSlqBqKnuyLVaM0Oath9Ni/i5ZbAqCC1I750WZ3pmsQwpjHLSWVPUH7DgQ5MH0frIeaBj5ehrFBJXRmWcYQK2x0S+SIc5kVsJxIZXbLZ7wTsZS5QsrTVe8sRcdJ1XUruIzUH23ZFmBnVSQofXS0VQVEsdL94DEjh6GxGEuJtprqNKkXIAjOA9uFwNqQluJKdJI0GvkZdyTXUYwsFh5b1n9tiq0JeFs74MZet5viW+KRddBnckZBjV7MYzG7Zx4AlPrVmB+ug8+zTyNHKa2rnzehS6tbtIK01LWxK8g76uBP/i4miy/EKdBWYwcZf8RgkLqwmIYkL9NYiDk7rXqYx3q+5JKaGxkDhwjF8OSzuRbmit0/O5HWGSsDYxqXXkDJFkZhJBWBkFYDrK4vuYF+fafRn2Dn3B1Iv6Gj1GWY8LTmTkI0QtlorrEJ4kKyNAlxcnvM1G2xg6ZXOfDsdO/y0Sjl8sTdJad3XJaSttTj4HqdOJnyUzmzCcT6t8QV4Q/6qWVoCs842inYeudFpnmF4wMJADx3zzfJ5zB9VMQg/CPPy4SMaF1BITtCI+yCshbzNUqhUK2y6/Rs6AeRMOoO/FuyPSMV+GqBy2/qKEG7SSerxPYPniVizgY1NoceviWP0CkINHtuYo9TybRvpk6xP3/mE=
- secure: gs6SipJHOWK48rZzW9WDJwJqFfrB+zO7TtZ5+W5JYE9JNxpWJjG2QK89Kd16ex+n6v3bN6h0EPAUNZV3HwZwb/deoHuOHvLiH51kSYHgloFvMUBbeyKRGGGfxL7w+seFulh6IIcafjTk5ugdAF2mW5o69GArfC3fa/Bms8uIc0Pp5UGubiLFbn6oiGwFjf1o0UPVh369rEVlMjFNyRe8zSFNzroKnn6TmBvyhmPL0uPhty4GxtJ2R51wTNvOzKDm8vix3FHk5d4NZrdQwBIzSu0r4ocRISu/KhXm+L7Db5bSXVIdeeWvHJNjMFZ8wt2WkhbRCi3r9HtiI1z51+YMdTDLjmJXggXVdmMCNfD1lkaRz0gXdj8tI84Nj/gfrtRAtQxHTOt/Aa0Sm/iUL2NWkQbzU34SFWvs07mGosVcNhfVLBqSvCQowCeiHNt7yfbjZgFwuSyzLCMFTuQ+6GIclsL+E1TQT7wF3nDlfezhQecdw3+GJicBTFBY5FM98Ael+5fq+PY2vG+zuei1Ia4Eo3wD3+BEijk5i/U87Ll18iZUeRylhlVlVjGDTntTpbtGKyD3YqVcimS7tZ6zYgR/uonuH9R/aiWuSWeqlMrdT4SCwL15WujPQJJhB8gOKGHg7vkUgF7c8Fj/UMGayYewca4assAY8CMNneMLnR+eyW8=
- secure: VeRd+1WNVHeYJ59fvgmllQskwbtyITrR2v7loUCc6nagoAeSmwZfoe5/7lySIMO6/uzIJfkiaxNbzf3vatYKhDYqFd/V5Swrdu3RUxT7xGgvGpOzaOrhrHi+CPR/1dk58/bgGHmy8q8S0uesId/45LeRELEbcCRiCtlWZFz4UuHFKq0qRJv6LdFZvWFk6F/BcWq7n8fC7SGT+Krn0lf0HR3zX4UyUNPrWKe15l0h+Z05RYcwOLi5cK+sqFx0os5SU44DfYkO2jIsRhqDMJc+ipC/YZfpYxC+V/AT4CieDIcq/JFB2DHx/bnknm8OE62a1IvZJ+hbl6Uv/Khx/QnoGf3wgVZ3guKWju+SnzeZTLU2aTQcyArKmTKVD0me/M95TzjbCLECwwCrNdTGpXuqTOlnLIrjXUvpJ1Q+5EVk+8ULrYbCeYhVqJ/+iYvDUjehmBd+yW59Mgvt0eCc4IeabYpNBgiZ44UCHSt57GfH95xbZQmjkW4k8Lm1uoyRypzD8tcUgi2bkwoiK/jNaNQ562AoxUqIEPnbzw+2EkadULkRr4nianbvAgACaMZsavLBqUpchi4x6vnRaWGUiz71h9FCHS5e8eRV/UFe2AS37CDt4ycuAyumaFfTd0eQdF6w4WomYkRz+U+q7AhkZvsBorQjbGS6jI0LPUuozFi2Yno=
37 changes: 26 additions & 11 deletions isabl_cli/batch_systems/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,9 @@ def submit_slurm_array(
Arguments:
commands (list): of (path to bash script, on exit command) tuples.
requirements (str): string of LSF requirements.
requirements (str): string of SLURM requirements.
jobname (str): slurm array jobname.
extra_args (str): extra LSF args.
extra_args (str): extra SLURM args.
throttle_by (int): max number of jobs running at same time.
wait (bool): if true, wait until clean command finishes.
Expand Down Expand Up @@ -122,41 +122,56 @@ def submit_slurm_array(
# submit a dependency job on failure
# important when the scheduler kills the head job
dependency = "${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}"
afternotok = (
after_not_ok_job = (
f"sbatch {extra_args} --depend=afternotok:{dependency} --kill-on-invalid-dep yes "
f'-o {join(rundir, "head_job.exit")} -J "EXIT: {dependency}" '
f'--export=TMP,TMPDIR,TMP_DIR -o {join(rundir, "head_job.exit")} -J "EXIT: {dependency}" '
f"<< EOF\n#!/bin/bash\n{exit_command}\nEOF\n"
)

# use random sleep to avoid parallel API hits
f.write(
f"#!/bin/sh\nsleep {random.uniform(0, 10):.3} && "
f"({afternotok}) && bash {command}"
f"#!/bin/bash\n\n"
f"sleep {random.uniform(0, 10):.3} && "
f"({after_not_ok_job}) && bash {command}"
)

for j in "log", "err", "exit":
for j in "log", "err", "exit", "slurm":
src = join(rundir, f"head_job.{j}")
dst = join(root, f"{j}.{index}")
open(src, "w").close()
utils.force_symlink(src, dst)

with open(join(root, "in.sh"), "w") as f:
f.write(f"#!/bin/sh\nbash {root}/in.$SLURM_ARRAY_TASK_ID")
f.write(f"#!/bin/bash\nbash {root}/in.$SLURM_ARRAY_TASK_ID")

with open(join(root, "clean.sh"), "w") as f:
f.write(f"#!/bin/sh\nrm -rf {root}")
f.write(f"#!/bin/bash\nrm -rf {root}")

# Main job array
cmd = (
f"sbatch {requirements} {extra_args} --array 1-{total}%{throttle_by} "
f"-o '{root}/log.%a' -e '{root}/err.%a' "
f'-J "ISABL: {jobname}" --parsable {root}/in.sh'
)

jobid = subprocess.check_output(cmd, shell=True).decode("utf-8").strip()

# Job to print out slurm job metrics upon main job completion
seff_jobids = []
for i in range(1, total + 1):
seff_cmd = (
f"sbatch {extra_args} --kill-on-invalid-dep=yes "
f"--dependency=afterany:{jobid}_{i} -o '{root}/slurm.{i}' -J 'SEFF: {jobname}' "
f"--wrap='seff {jobid}_{i}'"
)
seff_jobid = subprocess.check_output(seff_cmd, shell=True).decode("utf-8").strip()
seff_jobids.append(seff_jobid.split()[-1])

# Job to clean job array rundir
with open(join(root, "clean.sh"), "w") as f:
f.write(f"#!/bin/bash\nrm -rf {root}")
cmd = (
f"sbatch {extra_args} -J 'CLEAN: {jobname}' {wait} --kill-on-invalid-dep yes "
f"-o /dev/null -e /dev/null --depend=afterany:{jobid} --parsable {root}/clean.sh"
f"-o /dev/null -e /dev/null --depend=afterany:{':'.join(seff_jobids)} --parsable {root}/clean.sh"
)

return subprocess.check_output(cmd, shell=True).decode("utf-8").strip()
Empty file modified setup.py
100644 → 100755
Empty file.

0 comments on commit 5507962

Please sign in to comment.