Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-46145: Remove RC2/step8 and move resubmit test to step5. #26

Merged
merged 1 commit into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions data/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ rc2 = (
clobber_outputs=True,
)
.add("step2a", where="skymap='ci_mw'")
# Test fail-and-rescue workflow by configuring one task fail on the first
# try, and then recover from that with a new run and --skip-existing-in.
.add("step2b", where="skymap='ci_mw' AND tract=0")
.add("step2cde")
.add(
Expand All @@ -125,18 +127,20 @@ rc2 = (
fail=[
# Configure one quantum to fail on the first try and succeed on the
# second, simulating an out-of-memory failure and automatic retry.
'''assembleCoadd::"skymap='ci_mw' AND tract=0 AND patch=2 AND band='r'":6GB'''
"""assembleCoadd::"skymap='ci_mw' AND tract=0 AND patch=2 AND band='r'":6GB"""
],
auto_retry_mem=("4GB", "8GB"),
)
.add("step4")
.add("step5", where="skymap='ci_mw' AND tract=0")
.add(
"step5",
where="skymap='ci_mw' AND tract=0",
group="attempt1",
fail=['''consolidateForcedSourceTable::"skymap='ci_mw' AND tract=0"'''],
)
.add("step5", where="skymap='ci_mw' AND tract=0", group="attempt2", skip_existing_in_last=True)
.add("step6")
.add("step7")
# Test fail-and-rescue workflow by configuring one task fail on the first
# try, and then recover from that with a new run and --skip-existing-in.
.add("step8", group="attempt1", fail=['''analyzeObjectTableCore::"skymap='ci_mw' AND tract=0"'''])
.add("step8", group="attempt2", skip_existing_in_last=True)
.finish()
)
state.targets["data"].extend(rc2)
Expand Down Expand Up @@ -175,8 +179,6 @@ prod = (
.add("step4", group="r", where="band='r'")
.add("step4", group="i", where="band='i'")
.add("step7")
# step8 in DRP-Prod is not actually valid, because it has tasks that should
# be partitioned in different ways in order to scale up. This is DM-39314.
.finish()
)
state.targets["data"].extend(prod)
Expand Down
34 changes: 17 additions & 17 deletions tests/test_rc2_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,8 @@ def test_property_set_metadata_direct(self) -> None:
def test_property_set_metadata_qbb(self) -> None:
self.qbb.check_property_set_metadata(self)

def check_step8_rescue(self, helper: OutputRepoTests) -> None:
"""Test that the fail-and-recover attempts in step8 worked as expected,
def check_step5_rescue(self, helper: OutputRepoTests) -> None:
"""Test that the fail-and-recover attempts in step5 worked as expected,
by running all tasks but one in the first attempt.
"""
# This task should have failed in attempt1 and should have been
Expand All @@ -116,40 +116,40 @@ def check_step8_rescue(self, helper: OutputRepoTests) -> None:
[
ref.run
for ref in set(
helper.butler.registry.queryDatasets(get_mock_name("analyzeObjectTableCore_metadata"))
helper.butler.registry.queryDatasets(
get_mock_name("consolidateForcedSourceTable_metadata")
)
)
],
["HSC/runs/RC2/step8-attempt2"],
["HSC/runs/RC2/step5-attempt2"],
)
# This task should have succeeded in attempt1 and should not have been
# included in attempt2.
self.assertCountEqual(
[
ref.run
for ref in set(
helper.butler.registry.queryDatasets(
get_mock_name("analyzeObjectTableSurveyCore_metadata")
)
helper.butler.registry.queryDatasets(get_mock_name("transformForcedSourceTable_metadata"))
)
],
["HSC/runs/RC2/step8-attempt1"],
["HSC/runs/RC2/step5-attempt1"] * 4, # one for each patch,
)

def test_step8_rescue_direct(self) -> None:
self.check_step8_rescue(self.direct)
def test_step5_rescue_direct(self) -> None:
self.check_step5_rescue(self.direct)
# The attempt1 QG should have quanta for both tasks (and others, but we
# won't list them all to avoid breaking if new ones are added).
qg_1 = self.direct.get_quantum_graph("step8", "attempt1")
qg_2 = self.direct.get_quantum_graph("step8", "attempt2")
qg_1 = self.direct.get_quantum_graph("step5", "attempt1")
qg_2 = self.direct.get_quantum_graph("step5", "attempt2")
tasks_with_quanta_1 = {q.taskDef.label for q in qg_1}
tasks_with_quanta_2 = {q.taskDef.label for q in qg_2}
self.assertIn(get_mock_name("analyzeObjectTableCore"), tasks_with_quanta_1)
self.assertIn(get_mock_name("analyzeObjectTableCore"), tasks_with_quanta_2)
self.assertIn(get_mock_name("analyzeObjectTableSurveyCore"), tasks_with_quanta_1)
self.assertNotIn(get_mock_name("analyzeObjectTableSurveyCore"), tasks_with_quanta_2)
self.assertIn(get_mock_name("consolidateForcedSourceTable"), tasks_with_quanta_1)
self.assertIn(get_mock_name("consolidateForcedSourceTable"), tasks_with_quanta_2)
self.assertIn(get_mock_name("transformForcedSourceTable"), tasks_with_quanta_1)
self.assertNotIn(get_mock_name("transformForcedSourceTable"), tasks_with_quanta_2)

def test_step8_rescue_qbb(self) -> None:
self.check_step8_rescue(self.qbb)
self.check_step5_rescue(self.qbb)

def test_fgcm_refcats(self) -> None:
"""Test that FGCM does not get refcats that don't overlap any of its
Expand Down
Loading