Skip to content

Commit

Permalink
Merge pull request #39 from lsst/tickets/DM-44457
Browse files Browse the repository at this point in the history
DM-44457: Fix held and deleted state_counts for reporting.
  • Loading branch information
MichelleGower authored Jul 26, 2024
2 parents b1d423d + 8f8f81f commit c28e532
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/changes/DM-44457.misc.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixed held and deleted state_counts for reporting.
11 changes: 11 additions & 0 deletions doc/lsst.ctrl.bps.htcondor/userguide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,17 @@ See `bps report`_.
.. Describe any plugin specific aspects of checking a submission status below
if any.
In order to make the summary report (``bps report``) faster, the plugin
uses summary information available with the DAGMan job. For a running
DAG, this status can lag behind by a few minutes. Also, DAGMan tracks
deletion of individual jobs as failures (no separate counts for
deleted jobs). So the summary report flag column will show ``F`` when
there are either failed or deleted jobs. When a detailed report is
requested (``bps report --id <id>``), the plugin reads detailed job
information from files. The detailed report can therefore distinguish
between failed and deleted jobs, and will show ``D`` in the flag column
for a running workflow if there is a deleted job.

Occasionally, some jobs are put on hold by HTCondor. To see the reason why
jobs are being held, use

Expand Down
4 changes: 3 additions & 1 deletion python/lsst/ctrl/bps/htcondor/htcondor_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -1527,7 +1527,7 @@ def _get_state_counts_from_dag_job(job):
state_counts = {
WmsStates.UNREADY: job.get("DAG_NodesUnready", 0),
WmsStates.READY: job.get("DAG_NodesReady", 0),
WmsStates.HELD: job.get("JobProcsHeld", 0),
WmsStates.HELD: job.get("DAG_JobsHeld", 0),
WmsStates.SUCCEEDED: job.get("DAG_NodesDone", 0),
WmsStates.FAILED: job.get("DAG_NodesFailed", 0),
WmsStates.PRUNED: job.get("DAG_NodesFutile", 0),
Expand Down Expand Up @@ -1660,6 +1660,8 @@ def _htc_node_status_to_wms_state(job):
# Use job exit status instead of post script exit status.
if "DAGMAN error 0" in job["StatusDetails"]:
wms_state = WmsStates.SUCCEEDED
elif "ULOG_JOB_ABORTED" in job["StatusDetails"]:
wms_state = WmsStates.DELETED
else:
wms_state = WmsStates.FAILED
case NodeStatus.FUTILE:
Expand Down
46 changes: 46 additions & 0 deletions tests/test_htcondor_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
JobStatus,
NodeStatus,
_get_exit_code_summary,
_get_state_counts_from_dag_job,
_htc_node_status_to_wms_state,
_htc_status_to_wms_state,
_translate_job_cmds,
Expand Down Expand Up @@ -233,6 +234,15 @@ def testFutile(self):
result = _htc_node_status_to_wms_state(job)
self.assertEqual(result, WmsStates.PRUNED)

def testDeletedJob(self):
    """Check that an aborted node in the ERROR state maps to DELETED.

    The node's ``StatusDetails`` text contains ``ULOG_JOB_ABORTED``, so
    the expected WMS state is ``WmsStates.DELETED``.
    """
    aborted_node = {
        "JobProcsQueued": 0,
        "StatusDetails": "HTCondor reported ULOG_JOB_ABORTED event for job proc (1.0.0)",
        "NodeStatus": NodeStatus.ERROR,
    }
    self.assertEqual(_htc_node_status_to_wms_state(aborted_node), WmsStates.DELETED)


class TweakJobInfoTestCase(unittest.TestCase):
"""Test the function responsible for massaging job information."""
Expand Down Expand Up @@ -413,3 +423,39 @@ def testRetryUnlessBad(self):
with self.assertRaises(ValueError) as cm:
_ = _translate_job_cmds(self.cached_vals, None, gwjob)
self.assertIn("retryUnlessExit", str(cm.exception))


class GetStateCountsFromDagJobTestCase(unittest.TestCase):
    """Test counting number of jobs per WMS state."""

    def setUp(self):
        """No fixtures are needed for these tests."""

    def tearDown(self):
        """Nothing to clean up."""

    def testCounts(self):
        """Check the total and the per-state counts read from a DAG job.

        The input dictionary carries ``DAG_*`` summary counters; the
        function under test is expected to return the value stored under
        ``DAG_NodesTotal`` together with a mapping from ``WmsStates``
        members to counts.
        """
        dag_ad = {
            "DAG_NodesTotal": 22,
            "DAG_NodesUnready": 7,
            "DAG_NodesReady": 0,
            "DAG_NodesQueued": 5,
            "DAG_JobsHeld": 2,
            "DAG_NodesDone": 1,
            "DAG_NodesFailed": 3,
            "DAG_NodesFutile": 4,
        }

        expected_counts = {
            WmsStates.UNREADY: 7,
            WmsStates.READY: 0,
            WmsStates.HELD: 2,
            WmsStates.SUCCEEDED: 1,
            WmsStates.FAILED: 3,
            WmsStates.PRUNED: 4,
            WmsStates.MISFIT: 0,
        }

        total_jobs, state_counts = _get_state_counts_from_dag_job(dag_ad)
        self.assertEqual(total_jobs, 22)
        self.assertEqual(state_counts, expected_counts)

0 comments on commit c28e532

Please sign in to comment.