diff --git a/doc/changes/DM-44457.misc.rst b/doc/changes/DM-44457.misc.rst new file mode 100644 index 0000000..d7e4a2d --- /dev/null +++ b/doc/changes/DM-44457.misc.rst @@ -0,0 +1 @@ +Fixed held and deleted state_counts for reporting. diff --git a/doc/lsst.ctrl.bps.htcondor/userguide.rst b/doc/lsst.ctrl.bps.htcondor/userguide.rst index ff69c74..bd06d5d 100644 --- a/doc/lsst.ctrl.bps.htcondor/userguide.rst +++ b/doc/lsst.ctrl.bps.htcondor/userguide.rst @@ -115,6 +115,17 @@ See `bps report`_. .. Describe any plugin specific aspects of checking a submission status below if any. +In order to make the summary report (``bps report``) faster, the plugin +uses summary information available with the DAGMan job. For a running +DAG, this status can lag behind by a few minutes. Also, DAGMan tracks +deletion of individual jobs as failures (no separate counts for +deleted jobs). So the summary report flag column will show ``F`` when +there are either failed or deleted jobs. If getting a detailed report +(``bps report --id <id>``), the plugin reads detailed job information +from files. So, the detailed report can distinguish between failed and +deleted jobs, and thus will show ``D`` in the flag column for a running +workflow if there is a deleted job. + Occasionally, some jobs are put on hold by HTCondor. 
def testDeletedJob(self):
    """Check that an ERROR node reporting an aborted job maps to DELETED."""
    # The status details contain the ``ULOG_JOB_ABORTED`` marker that
    # ``_htc_node_status_to_wms_state`` looks for on ERROR nodes.
    aborted_node = dict(
        NodeStatus=NodeStatus.ERROR,
        StatusDetails="HTCondor reported ULOG_JOB_ABORTED event for job proc (1.0.0)",
        JobProcsQueued=0,
    )
    self.assertEqual(_htc_node_status_to_wms_state(aborted_node), WmsStates.DELETED)
class GetStateCountsFromDagJobTestCase(unittest.TestCase):
    """Test counting number of jobs per WMS state.

    The DAGMan job information is passed as a plain dict, so the test
    needs no fixtures; the default ``setUp``/``tearDown`` are sufficient.
    """

    def testCounts(self):
        """Check the total and per-state counts built from ``DAG_*`` keys."""
        # Distinct non-zero values make a swapped key/state pairing visible.
        # ``DAG_NodesQueued`` is supplied but has no entry in the expected
        # mapping below.
        job = {
            "DAG_NodesDone": 1,
            "DAG_JobsHeld": 2,
            "DAG_NodesFailed": 3,
            "DAG_NodesFutile": 4,
            "DAG_NodesQueued": 5,
            "DAG_NodesReady": 0,
            "DAG_NodesUnready": 7,
            "DAG_NodesTotal": 22,
        }

        # HELD is expected to come from ``DAG_JobsHeld``.
        truth = {
            WmsStates.SUCCEEDED: 1,
            WmsStates.HELD: 2,
            WmsStates.UNREADY: 7,
            WmsStates.READY: 0,
            WmsStates.FAILED: 3,
            WmsStates.PRUNED: 4,
            WmsStates.MISFIT: 0,
        }

        total, result = _get_state_counts_from_dag_job(job)
        self.assertEqual(total, 22)
        self.assertEqual(result, truth)