Skip to content

Commit

Permalink
Remove RC2/step8 and move resubmit test to step5.
Browse files Browse the repository at this point in the history
  • Loading branch information
TallJimbo committed Sep 7, 2024
1 parent afc067c commit acae850
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 25 deletions.
18 changes: 10 additions & 8 deletions data/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ rc2 = (
clobber_outputs=True,
)
.add("step2a", where="skymap='ci_mw'")
# Test fail-and-rescue workflow by configuring one task fail on the first
# try, and then recover from that with a new run and --skip-existing-in.
.add("step2b", where="skymap='ci_mw' AND tract=0")
.add("step2cde")
.add(
Expand All @@ -125,18 +127,20 @@ rc2 = (
fail=[
# Configure one quantum to fail on the first try and succeed on the
# second, simulating an out-of-memory failure and automatic retry.
'''assembleCoadd::"skymap='ci_mw' AND tract=0 AND patch=2 AND band='r'":6GB'''
"""assembleCoadd::"skymap='ci_mw' AND tract=0 AND patch=2 AND band='r'":6GB"""
],
auto_retry_mem=("4GB", "8GB"),
)
.add("step4")
.add("step5", where="skymap='ci_mw' AND tract=0")
.add(
"step5",
where="skymap='ci_mw' AND tract=0",
group="attempt1",
fail=['''consolidateForcedSourceTable::"skymap='ci_mw' AND tract=0"'''],
)
.add("step5", where="skymap='ci_mw' AND tract=0", group="attempt2", skip_existing_in_last=True)
.add("step6")
.add("step7")
# Test fail-and-rescue workflow by configuring one task fail on the first
# try, and then recover from that with a new run and --skip-existing-in.
.add("step8", group="attempt1", fail=['''analyzeObjectTableCore::"skymap='ci_mw' AND tract=0"'''])
.add("step8", group="attempt2", skip_existing_in_last=True)
.finish()
)
state.targets["data"].extend(rc2)
Expand Down Expand Up @@ -175,8 +179,6 @@ prod = (
.add("step4", group="r", where="band='r'")
.add("step4", group="i", where="band='i'")
.add("step7")
# step8 in DRP-Prod is not actually valid, because it has tasks that should
# be partitioned in different ways in order to scale up. This is DM-39314.
.finish()
)
state.targets["data"].extend(prod)
Expand Down
34 changes: 17 additions & 17 deletions tests/test_rc2_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,8 @@ def test_property_set_metadata_direct(self) -> None:
def test_property_set_metadata_qbb(self) -> None:
self.qbb.check_property_set_metadata(self)

def check_step8_rescue(self, helper: OutputRepoTests) -> None:
"""Test that the fail-and-recover attempts in step8 worked as expected,
def check_step5_rescue(self, helper: OutputRepoTests) -> None:
"""Test that the fail-and-recover attempts in step5 worked as expected,
by running all tasks but one in the first attempt.
"""
# This task should have failed in attempt1 and should have been
Expand All @@ -116,40 +116,40 @@ def check_step8_rescue(self, helper: OutputRepoTests) -> None:
[
ref.run
for ref in set(
helper.butler.registry.queryDatasets(get_mock_name("analyzeObjectTableCore_metadata"))
helper.butler.registry.queryDatasets(
get_mock_name("consolidateForcedSourceTable_metadata")
)
)
],
["HSC/runs/RC2/step8-attempt2"],
["HSC/runs/RC2/step5-attempt2"],
)
# This task should have succeeded in attempt1 and should not have been
# included in attempt2.
self.assertCountEqual(
[
ref.run
for ref in set(
helper.butler.registry.queryDatasets(
get_mock_name("analyzeObjectTableSurveyCore_metadata")
)
helper.butler.registry.queryDatasets(get_mock_name("transformForcedSourceTable_metadata"))
)
],
["HSC/runs/RC2/step8-attempt1"],
["HSC/runs/RC2/step5-attempt1"] * 4, # one for each patch,
)

def test_step8_rescue_direct(self) -> None:
self.check_step8_rescue(self.direct)
def test_step5_rescue_direct(self) -> None:
self.check_step5_rescue(self.direct)
# The attempt1 QG should have quanta for both tasks (and others, but we
# won't list them all to avoid breaking if new ones are added).
qg_1 = self.direct.get_quantum_graph("step8", "attempt1")
qg_2 = self.direct.get_quantum_graph("step8", "attempt2")
qg_1 = self.direct.get_quantum_graph("step5", "attempt1")
qg_2 = self.direct.get_quantum_graph("step5", "attempt2")
tasks_with_quanta_1 = {q.taskDef.label for q in qg_1}
tasks_with_quanta_2 = {q.taskDef.label for q in qg_2}
self.assertIn(get_mock_name("analyzeObjectTableCore"), tasks_with_quanta_1)
self.assertIn(get_mock_name("analyzeObjectTableCore"), tasks_with_quanta_2)
self.assertIn(get_mock_name("analyzeObjectTableSurveyCore"), tasks_with_quanta_1)
self.assertNotIn(get_mock_name("analyzeObjectTableSurveyCore"), tasks_with_quanta_2)
self.assertIn(get_mock_name("consolidateForcedSourceTable"), tasks_with_quanta_1)
self.assertIn(get_mock_name("consolidateForcedSourceTable"), tasks_with_quanta_2)
self.assertIn(get_mock_name("transformForcedSourceTable"), tasks_with_quanta_1)
self.assertNotIn(get_mock_name("transformForcedSourceTable"), tasks_with_quanta_2)

def test_step8_rescue_qbb(self) -> None:
self.check_step8_rescue(self.qbb)
self.check_step5_rescue(self.qbb)

def test_fgcm_refcats(self) -> None:
"""Test that FGCM does not get refcats that don't overlap any of its
Expand Down

0 comments on commit acae850

Please sign in to comment.