Skip to content

Commit

Permalink
Fix job metric collection when job fails
Browse files Browse the repository at this point in the history
  • Loading branch information
mvdbeek committed Sep 13, 2024
1 parent a4e33aa commit 6f709cc
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 10 deletions.
16 changes: 15 additions & 1 deletion lib/galaxy/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1379,7 +1379,15 @@ def _fix_output_permissions(self):
util.umask_fix_perms(path, self.app.config.umask, 0o666, self.app.config.gid)

def fail(
self, message, exception=False, tool_stdout="", tool_stderr="", exit_code=None, job_stdout=None, job_stderr=None
self,
message,
exception=False,
tool_stdout="",
tool_stderr="",
exit_code=None,
job_stdout=None,
job_stderr=None,
job_metrics_directory=None,
):
"""
Indicate job failure by setting state and message on all output
Expand All @@ -1406,6 +1414,10 @@ def fail(
message = str(message)
working_directory_exists = self.working_directory_exists()

if not job.tasks:
# If job was composed of tasks, don't attempt to recollect statistics
self._collect_metrics(job, job_metrics_directory)

# if the job was deleted, don't fail it
if not job.state == job.states.DELETED:
# Check if the failure is due to an exception
Expand Down Expand Up @@ -1481,6 +1493,7 @@ def fail(
pjaa.post_job_action for pjaa in job.post_job_actions if pjaa.post_job_action.action_type == "EmailAction"
]:
ActionBox.execute(self.app, self.sa_session, pja, job)

# If the job was deleted, call tool specific fail actions (used for e.g. external metadata) and clean up
if self.tool:
try:
Expand Down Expand Up @@ -1841,6 +1854,7 @@ def fail(message=job.info, exception=None):
job_stdout=job_stdout,
job_stderr=job_stderr,
exception=exception,
job_metrics_directory=job_metrics_directory,
)

# TODO: After failing here, consider returning from the function.
Expand Down
11 changes: 2 additions & 9 deletions lib/galaxy/jobs/runners/pulsar.py
Original file line number Diff line number Diff line change
Expand Up @@ -689,16 +689,9 @@ def finish_job(self, job_state: JobState):
) as exit_code_file:
exit_code_file.write(str(exit_code))
self._handle_metadata_externally(job_wrapper, resolve_requirements=True)
job_metrics_directory = os.path.join(job_wrapper.working_directory, "metadata")
# Finish the job
try:
job_metrics_directory = os.path.join(job_wrapper.working_directory, "metadata")
# Following check is a hack for jobs started during 19.01 or earlier release
# and finishing with a 19.05 code base. Eliminate the hack in 19.09 or later
# along with hacks for legacy metadata compute strategy.
if not os.path.exists(job_metrics_directory) or not any(
"__instrument" in f for f in os.listdir(job_metrics_directory)
):
job_metrics_directory = job_wrapper.working_directory
job_wrapper.finish(
tool_stdout,
tool_stderr,
Expand All @@ -710,7 +703,7 @@ def finish_job(self, job_state: JobState):
)
except Exception:
log.exception("Job wrapper finish method failed")
job_wrapper.fail("Unable to finish job", exception=True)
job_wrapper.fail("Unable to finish job", exception=True, job_metrics_directory=job_metrics_directory)

def check_pid(self, pid):
try:
Expand Down

0 comments on commit 6f709cc

Please sign in to comment.