Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🔧 patch for v3.24.0 latest for python 2.7 #1

Open
wants to merge 20 commits into
base: releases/3.24.x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
e25c888
🔧 add slurm updates from v7.0.0 on the latest py2 working version
juanesarango May 30, 2024
e1cb5b9
⚡️ attempt to add retry with OOM error
juanesarango May 30, 2024
00ef8ac
🐛 fix enum, import and *args errors
juanesarango May 31, 2024
4d6eba9
🐛 fix python2 syntax
juanesarango May 31, 2024
bc0f1a0
🐛 fix python2 syntax
juanesarango May 31, 2024
f6cfb00
🐛 fix python2 syntax
juanesarango May 31, 2024
9a4bfbd
🐛 remove job env from issueBatchJob
juanesarango May 31, 2024
4aea8ee
🐛 handle exit_code None
juanesarango May 31, 2024
fc6c1da
🐛 fix exit_code return
juanesarango May 31, 2024
32b6e66
🔧 catch additional errno codes when linking fileJobStore
juanesarango Jul 11, 2024
1c7a109
🔧 fix exit code return from slurm
juanesarango Jul 11, 2024
48ab174
🔧 add env TOIL_SLURM_PER_CPU to control memory in sbatch
juanesarango Jul 11, 2024
90ecfd0
🐛 fix toil -> slurm job dict to rerun memory retry
juanesarango Jul 12, 2024
991eda7
🔧 skip error code EPERM when kill pid fails
juanesarango Jul 12, 2024
e1f9159
🔧 allow to retry twice for oom retry. Log retry slurm job id and reso…
juanesarango Jul 15, 2024
c71ac1c
🔧 move instead of rename in fileJobStore
juanesarango Jul 22, 2024
5acca47
🔧 resubmit OOM failed if memory < MAX_MEMORY
juanesarango Jul 23, 2024
5f8373c
🐛 catch slurm sacct errors when not available
juanesarango Aug 2, 2024
d6bc2bc
🔧 add with_retries, tune logging and export job_ids
juanesarango Nov 15, 2024
8a75e03
🐛 catch proper Exception in call command
juanesarango Dec 13, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def runSetup():
addict = 'addict<=2.2.0'
sphinx = 'sphinx==1.7.5'
pathlib2 = 'pathlib2==2.3.2'
enum34 = 'enum34==1.1.10'

core_reqs = [
dill,
Expand All @@ -71,7 +72,8 @@ def runSetup():
subprocess32,
addict,
sphinx,
pathlib2]
pathlib2,
enum34]

aws_reqs = [
boto,
Expand Down
39 changes: 39 additions & 0 deletions src/toil/batchSystems/abstractBatchSystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
standard_library.install_aliases()
from future.utils import with_metaclass
from builtins import object
import enum
import os
import shutil
import logging
Expand All @@ -37,6 +38,44 @@

logger = logging.getLogger(__name__)

# Value to use as exitStatus in UpdatedBatchJobInfo.exitStatus when status is not available.
EXIT_STATUS_UNAVAILABLE_VALUE = 255

class BatchJobExitReason(enum.IntEnum):
    """
    Reasons a batch job can stop, as integer codes.

    Each member is followed by a string describing what it means. Because
    this is an ``IntEnum``, members compare equal to their plain ``int``
    values.
    """

    FINISHED = 1
    """Successfully finished."""
    FAILED = 2
    """Job finished, but failed."""
    LOST = 3
    """Preemptable failure (job's executing host went away)."""
    KILLED = 4
    """Job killed before finishing."""
    ERROR = 5
    """Internal error."""
    MEMLIMIT = 6
    """Job hit batch system imposed memory limit."""
    MISSING = 7
    """Job disappeared from the scheduler without actually stopping, so Toil killed it."""
    MAXJOBDURATION = 8
    """Job ran longer than --maxJobDuration, so Toil killed it."""
    PARTITION = 9
    """Job was not able to talk to the leader via the job store, so Toil declared it failed."""

    @classmethod
    def to_string(cls, value):
        """
        Convert an exit reason to a human-readable string.

        :param value: an enum member, or an int that may or may not be equal
            to the value of one of the enum's members.
        :return: the name of the matching member (e.g. ``"MEMLIMIT"``), or
            ``str(value)`` when no member has that value.
        """
        try:
            return cls(value).name
        except ValueError:
            # Not one of our codes: fall back to the raw value as text.
            return str(value)


# A class containing the information required for worker cleanup on shutdown of the batch system.
WorkerCleanupInfo = namedtuple('WorkerCleanupInfo', (
Expand Down
21 changes: 14 additions & 7 deletions src/toil/batchSystems/abstractGridEngineBatchSystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,24 +27,31 @@

from toil import subprocess
from toil.lib.objects import abstractclassmethod
from toil.lib.misc import CalledProcessErrorStderr

from toil.batchSystems.abstractBatchSystem import BatchSystemLocalSupport

logger = logging.getLogger(__name__)


def with_retries(operation, *args, **kwargs):
    """
    Call ``operation(*args, **kwargs)``, retrying with increasing delays.

    When the call raises ``CalledProcessErrorStderr`` the failure is logged,
    the function sleeps for the next delay in the schedule and tries again.
    Once every delay has been used, one last attempt is made and any
    exception it raises propagates to the caller.

    :param operation: callable to invoke.
    :param args: positional arguments forwarded to ``operation``.
    :param kwargs: keyword arguments forwarded to ``operation``.
    :return: whatever ``operation`` returns on its first successful call.
    :raises CalledProcessErrorStderr: if the final attempt still fails.
    """
    # Seconds to wait before each successive retry.
    retry_delays = (1, 5, 10, 60, 90, 120)

    for delay in retry_delays:
        try:
            return operation(*args, **kwargs)
        except CalledProcessErrorStderr as err:
            logger.error(
                "Operation %s failed with code %d: %s",
                operation,
                err.returncode,
                err.output,
            )
            logger.error("Retrying in %s", str(delay))
            time.sleep(delay)

    # Final attempt after the last announced delay. Previously the function
    # slept here and then raised without retrying, so the "Retrying" message
    # was untrue and the longest sleep was wasted.
    return operation(*args, **kwargs)


Expand Down Expand Up @@ -234,7 +241,7 @@ def run(self):
activity |= self.createJobs(newJob)
activity |= self.checkOnJobs()
if not activity:
logger.debug('No activity, sleeping for %is', self.boss.sleepSeconds())
pass # logger.debug('No activity, sleeping for %is', self.boss.sleepSeconds())

@abstractmethod
def prepareSubmission(self, cpu, memory, jobID, command, jobName):
Expand Down
Loading