From 0747ab26239de1321ebfa704fe39d9a096e0f817 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Mon, 9 Sep 2024 15:04:02 +0200 Subject: [PATCH] Fix job metric collection when job fails --- lib/galaxy/jobs/__init__.py | 16 +++++++++++++++- lib/galaxy/jobs/runners/pulsar.py | 11 ++--------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/lib/galaxy/jobs/__init__.py b/lib/galaxy/jobs/__init__.py index d63ba30916df..939a529900ad 100644 --- a/lib/galaxy/jobs/__init__.py +++ b/lib/galaxy/jobs/__init__.py @@ -1379,7 +1379,15 @@ def _fix_output_permissions(self): util.umask_fix_perms(path, self.app.config.umask, 0o666, self.app.config.gid) def fail( - self, message, exception=False, tool_stdout="", tool_stderr="", exit_code=None, job_stdout=None, job_stderr=None + self, + message, + exception=False, + tool_stdout="", + tool_stderr="", + exit_code=None, + job_stdout=None, + job_stderr=None, + job_metrics_directory=None, ): """ Indicate job failure by setting state and message on all output @@ -1406,6 +1414,10 @@ def fail( message = str(message) working_directory_exists = self.working_directory_exists() + if not job.tasks: + # If job was composed of tasks, don't attempt to recollect statistics + self._collect_metrics(job, job_metrics_directory) + # if the job was deleted, don't fail it if not job.state == job.states.DELETED: # Check if the failure is due to an exception @@ -1481,6 +1493,7 @@ def fail( pjaa.post_job_action for pjaa in job.post_job_actions if pjaa.post_job_action.action_type == "EmailAction" ]: ActionBox.execute(self.app, self.sa_session, pja, job) + # If the job was deleted, call tool specific fail actions (used for e.g. external metadata) and clean up if self.tool: try: @@ -1841,6 +1854,7 @@ def fail(message=job.info, exception=None): job_stdout=job_stdout, job_stderr=job_stderr, exception=exception, + job_metrics_directory=job_metrics_directory, ) # TODO: After failing here, consider returning from the function. diff --git a/lib/galaxy/jobs/runners/pulsar.py b/lib/galaxy/jobs/runners/pulsar.py index 58cee82f3f29..4bfb47ae296d 100644 --- a/lib/galaxy/jobs/runners/pulsar.py +++ b/lib/galaxy/jobs/runners/pulsar.py @@ -689,16 +689,9 @@ def finish_job(self, job_state: JobState): ) as exit_code_file: exit_code_file.write(str(exit_code)) self._handle_metadata_externally(job_wrapper, resolve_requirements=True) + job_metrics_directory = os.path.join(job_wrapper.working_directory, "metadata") # Finish the job try: - job_metrics_directory = os.path.join(job_wrapper.working_directory, "metadata") - # Following check is a hack for jobs started during 19.01 or earlier release - # and finishing with a 19.05 code base. Eliminate the hack in 19.09 or later - # along with hacks for legacy metadata compute strategy. - if not os.path.exists(job_metrics_directory) or not any( - "__instrument" in f for f in os.listdir(job_metrics_directory) - ): - job_metrics_directory = job_wrapper.working_directory job_wrapper.finish( tool_stdout, tool_stderr, @@ -710,7 +703,7 @@ def finish_job(self, job_state: JobState): ) except Exception: log.exception("Job wrapper finish method failed") - job_wrapper.fail("Unable to finish job", exception=True) + job_wrapper.fail("Unable to finish job", exception=True, job_metrics_directory=job_metrics_directory) def check_pid(self, pid): try: