diff --git a/CHANGELOG.md b/CHANGELOG.md index 4cd39cb889..e682f38254 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## v2.2.2 - 2023-02-10 + +### Fixes +- Critical bug fix: Fix bug causing `CalcJob`s to except after restarting daemon [[#5886]](https://github.com/aiidateam/aiida-core/pull/5886) + + ## v2.2.1 - 2022-12-22 ### Fixes diff --git a/aiida/__init__.py b/aiida/__init__.py index 566177ed43..0781d904d3 100644 --- a/aiida/__init__.py +++ b/aiida/__init__.py @@ -28,7 +28,7 @@ 'For further information please visit http://www.aiida.net/. All rights reserved.' ) __license__ = 'MIT license, see LICENSE.txt file.' -__version__ = '2.2.1.post0' +__version__ = '2.2.2.post0' __authors__ = 'The AiiDA team.' __paper__ = ( 'S. P. Huber et al., "AiiDA 1.0, a scalable computational infrastructure for automated reproducible workflows and ' diff --git a/aiida/engine/processes/calcjobs/tasks.py b/aiida/engine/processes/calcjobs/tasks.py index a6d47d73f6..b8f681bcf2 100644 --- a/aiida/engine/processes/calcjobs/tasks.py +++ b/aiida/engine/processes/calcjobs/tasks.py @@ -430,7 +430,7 @@ async def do_kill(): return result -@plumpy.persistence.auto_persist('msg', 'data', '_monitor_result') +@plumpy.persistence.auto_persist('msg', 'data', '_command', '_monitor_result') class Waiting(plumpy.process_states.Waiting): """The waiting state for the `CalcJob` process.""" @@ -451,15 +451,26 @@ def __init__( self._monitor_result: CalcJobMonitorResult | None = None self._monitors: CalcJobMonitors | None = None - if 'monitors' in self.process.node.inputs: - self._monitors = CalcJobMonitors(self.process.node.inputs.monitors) - if isinstance(self.data, dict): self._command = self.data['command'] self._monitor_result = self.data.get('monitor_result', None) else: self._command = self.data + @property + def monitors(self) -> CalcJobMonitors | None: + """Return the collection of monitors if specified in the inputs. 
+ + :return: Instance of ``CalcJobMonitors`` containing monitors if specified in the process' input. + """ + if not hasattr(self, '_monitors'): + self._monitors = None + + if self._monitors is None and 'monitors' in self.process.node.inputs: + self._monitors = CalcJobMonitors(self.process.node.inputs.monitors) + + return self._monitors + @property def process(self) -> 'CalcJob': """ @@ -510,7 +521,7 @@ async def execute(self) -> plumpy.process_states.State: # type: ignore[override process_status = f'Monitoring scheduler: job state {scheduler_state_string}' node.set_process_status(process_status) job_done = await self._launch_task(task_update_job, node, self.process.runner.job_manager) - monitor_result = await self._monitor_job(node, transport_queue, self._monitors) + monitor_result = await self._monitor_job(node, transport_queue, self.monitors) if monitor_result and monitor_result.action is CalcJobMonitorAction.KILL: await self._kill_job(node, transport_queue) diff --git a/tests/engine/processes/calcjobs/test_calc_job.py b/tests/engine/processes/calcjobs/test_calc_job.py index aa44aacb72..25f58f7e5b 100644 --- a/tests/engine/processes/calcjobs/test_calc_job.py +++ b/tests/engine/processes/calcjobs/test_calc_job.py @@ -1192,6 +1192,41 @@ def _parse_submit_output(self, *args): # pylint: disable=unused-argument assert node.exit_status == 418 +def test_restart_after_daemon_reset(get_calcjob_builder, daemon_client, submit_and_await): + """Test that a job can be restarted when it is launched and the daemon is restarted. + + This is a regression test for https://github.com/aiidateam/aiida-core/issues/5882. + """ + import time + + import plumpy + + daemon_client.start_daemon() + + # Launch a job with a one second sleep to ensure it doesn't finish before we get the chance to restart the daemon. + # A monitor is added to ensure that those are properly reinitialized in the ``Waiting`` state of the process. 
+ builder = get_calcjob_builder() + builder.metadata.options.sleep = 1 + builder.monitors = {'monitor': orm.Dict({'entry_point': 'core.always_kill', 'disabled': True})} + node = submit_and_await(builder, plumpy.ProcessState.WAITING) + + daemon_client.restart_daemon(wait=True) + + start_time = time.time() + timeout = 10 + + while node.process_state not in [plumpy.ProcessState.FINISHED, plumpy.ProcessState.EXCEPTED]: + + if node.is_excepted: + raise AssertionError(f'The process excepted: {node.exception}') + + if time.time() - start_time >= timeout: + raise AssertionError(f'process failed to terminate within timeout, current state: {node.process_state}') + + assert node.is_finished, node.process_state + assert node.is_finished_ok, node.exit_status + + class TestImport: """Test the functionality to import existing calculations completed outside of AiiDA."""