diff --git a/data/SConscript b/data/SConscript index 9b0b8e2..f97e81f 100644 --- a/data/SConscript +++ b/data/SConscript @@ -117,6 +117,8 @@ rc2 = ( clobber_outputs=True, ) .add("step2a", where="skymap='ci_mw'") + # Test fail-and-rescue workflow by configuring one task fail on the first + # try, and then recover from that with a new run and --skip-existing-in. .add("step2b", where="skymap='ci_mw' AND tract=0") .add("step2cde") .add( @@ -125,18 +127,20 @@ rc2 = ( fail=[ # Configure one quantum to fail on the first try and succeed on the # second, simulating an out-of-memory failure and automatic retry. - '''assembleCoadd::"skymap='ci_mw' AND tract=0 AND patch=2 AND band='r'":6GB''' + """assembleCoadd::"skymap='ci_mw' AND tract=0 AND patch=2 AND band='r'":6GB""" ], auto_retry_mem=("4GB", "8GB"), ) .add("step4") - .add("step5", where="skymap='ci_mw' AND tract=0") + .add( + "step5", + where="skymap='ci_mw' AND tract=0", + group="attempt1", + fail=['''consolidateForcedSourceTable::"skymap='ci_mw' AND tract=0"'''], + ) + .add("step5", where="skymap='ci_mw' AND tract=0", group="attempt2", skip_existing_in_last=True) .add("step6") .add("step7") - # Test fail-and-rescue workflow by configuring one task fail on the first - # try, and then recover from that with a new run and --skip-existing-in. - .add("step8", group="attempt1", fail=['''analyzeObjectTableCore::"skymap='ci_mw' AND tract=0"''']) - .add("step8", group="attempt2", skip_existing_in_last=True) .finish() ) state.targets["data"].extend(rc2) @@ -175,8 +179,6 @@ prod = ( .add("step4", group="r", where="band='r'") .add("step4", group="i", where="band='i'") .add("step7") - # step8 in DRP-Prod is not actually valid, because it has tasks that should - # be partitioned in different ways in order to scale up. This is DM-39314. .finish() ) state.targets["data"].extend(prod) diff --git a/tests/test_rc2_outputs.py b/tests/test_rc2_outputs.py index b8914eb..4dc57b9 100644 --- a/tests/test_rc2_outputs.py +++ b/tests/test_rc2_outputs.py @@ -106,8 +106,8 @@ def test_property_set_metadata_direct(self) -> None: def test_property_set_metadata_qbb(self) -> None: self.qbb.check_property_set_metadata(self) - def check_step8_rescue(self, helper: OutputRepoTests) -> None: - """Test that the fail-and-recover attempts in step8 worked as expected, + def check_step5_rescue(self, helper: OutputRepoTests) -> None: + """Test that the fail-and-recover attempts in step5 worked as expected, by running all tasks but one in the first attempt. """ # This task should have failed in attempt1 and should have been @@ -116,10 +116,12 @@ def check_step8_rescue(self, helper: OutputRepoTests) -> None: [ ref.run for ref in set( - helper.butler.registry.queryDatasets(get_mock_name("analyzeObjectTableCore_metadata")) + helper.butler.registry.queryDatasets( + get_mock_name("consolidateForcedSourceTable_metadata") + ) ) ], - ["HSC/runs/RC2/step8-attempt2"], + ["HSC/runs/RC2/step5-attempt2"], ) # This task should have succeeded in attempt1 and should not have been # included in attempt2. @@ -127,29 +129,27 @@ def check_step8_rescue(self, helper: OutputRepoTests) -> None: [ ref.run for ref in set( - helper.butler.registry.queryDatasets( - get_mock_name("analyzeObjectTableSurveyCore_metadata") - ) + helper.butler.registry.queryDatasets(get_mock_name("transformForcedSourceTable_metadata")) ) ], - ["HSC/runs/RC2/step8-attempt1"], + ["HSC/runs/RC2/step5-attempt1"] * 4, # one for each patch, ) - def test_step8_rescue_direct(self) -> None: - self.check_step8_rescue(self.direct) + def test_step5_rescue_direct(self) -> None: + self.check_step5_rescue(self.direct) # The attempt1 QG should have quanta for both tasks (and others, but we # won't list them all to avoid breaking if new ones are added). - qg_1 = self.direct.get_quantum_graph("step8", "attempt1") - qg_2 = self.direct.get_quantum_graph("step8", "attempt2") + qg_1 = self.direct.get_quantum_graph("step5", "attempt1") + qg_2 = self.direct.get_quantum_graph("step5", "attempt2") tasks_with_quanta_1 = {q.taskDef.label for q in qg_1} tasks_with_quanta_2 = {q.taskDef.label for q in qg_2} - self.assertIn(get_mock_name("analyzeObjectTableCore"), tasks_with_quanta_1) - self.assertIn(get_mock_name("analyzeObjectTableCore"), tasks_with_quanta_2) - self.assertIn(get_mock_name("analyzeObjectTableSurveyCore"), tasks_with_quanta_1) - self.assertNotIn(get_mock_name("analyzeObjectTableSurveyCore"), tasks_with_quanta_2) + self.assertIn(get_mock_name("consolidateForcedSourceTable"), tasks_with_quanta_1) + self.assertIn(get_mock_name("consolidateForcedSourceTable"), tasks_with_quanta_2) + self.assertIn(get_mock_name("transformForcedSourceTable"), tasks_with_quanta_1) + self.assertNotIn(get_mock_name("transformForcedSourceTable"), tasks_with_quanta_2) def test_step8_rescue_qbb(self) -> None: - self.check_step8_rescue(self.qbb) + self.check_step5_rescue(self.qbb) def test_fgcm_refcats(self) -> None: """Test that FGCM does not get refcats that don't overlap any of its