From 6ebfe8dd215bac588c86a954fdb31c95e962a941 Mon Sep 17 00:00:00 2001
From: Julian Geiger
Date: Thu, 11 Apr 2024 15:45:07 +0200
Subject: [PATCH 01/30] CLI: Add dumping functionality.

---
 src/aiida/cmdline/commands/cmd_calcjob.py |  77 ++++
 src/aiida/cmdline/commands/cmd_devel.py   |   1 +
 src/aiida/cmdline/commands/cmd_process.py | 164 +++++++-
 src/aiida/cmdline/params/options/main.py  |  63 +++
 src/aiida/engine/daemon/execmanager.py    |   4 +-
 src/aiida/tools/dumping/__init__.py       |   0
 src/aiida/tools/dumping/processes.py      | 455 ++++++++++++++++++++++
 tests/conftest.py                         | 137 +++++++
 tests/tools/dumping/test_processes.py     | 235 +++++++++++
 9 files changed, 1121 insertions(+), 15 deletions(-)
 create mode 100644 src/aiida/tools/dumping/__init__.py
 create mode 100644 src/aiida/tools/dumping/processes.py
 create mode 100644 tests/tools/dumping/test_processes.py

diff --git a/src/aiida/cmdline/commands/cmd_calcjob.py b/src/aiida/cmdline/commands/cmd_calcjob.py
index df49ffc358..e798200adc 100644
--- a/src/aiida/cmdline/commands/cmd_calcjob.py
+++ b/src/aiida/cmdline/commands/cmd_calcjob.py
@@ -348,3 +348,80 @@ def get_remote_and_path(calcjob, path=None):
             f'nor does its associated process class `{calcjob.process_class.__class__.__name__}`\n'
             'Please specify a path explicitly.'
         )
+
+
+@verdi_calcjob.command('dump')
+@arguments.CALCULATION('calcjob', type=CalculationParamType(sub_classes=('aiida.node:process.calculation.calcjob',)))
+@options.PATH()
+@options.NO_NODE_INPUTS()
+@options.INCLUDE_ATTRIBUTES()
+@options.INCLUDE_EXTRAS()
+@options.USE_PRESUBMIT()
+@options.OVERWRITE()
+def dump(
+    calcjob,
+    path,
+    no_node_inputs,
+    include_attributes,
+    include_extras,
+    use_presubmit,
+    overwrite,
+) -> None:
+    """Dump files involved in the execution of a `CalcJob`.
+
+    Child simulations/workflows (also called `CalcJob`s and `WorkChain`s in AiiDA jargon) run by the parent workflow are
+    contained in the directory tree as sub-folders and are sorted by their creation time. The directory tree thus
+    mirrors the logical execution of the workflow, which can also be queried by running `verdi process status
+    <pk>` on the command line.
+
+    By default, input and output files of each simulation can be found in the corresponding "raw_inputs" and
+    "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution
+    settings). Additional input files (depending on the type of calculation) are placed in the "node_inputs" directory.
+
+    When using the `--use-presubmit` command line option, the folder created for each individual simulation should, in
+    principle, allow for direct resubmission, as it mirrors the (remote) folder that was created by AiiDA to execute the
+    job. However, this option requires the relevant AiiDA plugin to be installed, so it is disabled by default. Also
+    note that intermediate files might be missing, so for a multi-step workflow, each step would still have to be run
+    separately.
+
+    Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant AiiDA
+    node data for further inspection.
+ """ + + from aiida.tools.dumping.processes import ( + ProcessNodeYamlDumper, + calcjob_dump, + generate_default_dump_path, + make_dump_readme, + ) + + # Generate default parent folder + if str(path) == '.': + output_path = generate_default_dump_path(process_node=calcjob) + else: + output_path = path.resolve() + + # Instantiate YamlDumper + calcjobnode_dumper = ProcessNodeYamlDumper(include_attributes=include_attributes, include_extras=include_extras) + + calcjob_dumped = calcjob_dump( + calcjob_node=calcjob, + output_path=output_path, + no_node_inputs=no_node_inputs, + use_presubmit=use_presubmit, + node_dumper=calcjobnode_dumper, + overwrite=overwrite, + ) + + # Create README in parent directory + # Done after dumping, so that dumping directory is there. Dumping directory is created within the calcjob_dump and + # workchain_dump files such that they can also be used from within the Python API, not just via verdi + make_dump_readme(output_path=output_path, process_node=calcjob) + + # Communicate success/failure of dumping + if calcjob_dumped: + echo.echo_success( + f'Raw files for {calcjob.__class__.__name__} <{calcjob.pk}> dumped successfully in `{output_path}`.' + ) + else: + echo.echo_report(f'Problem dumping {calcjob.__class__.__name__} <{calcjob.pk}>.') diff --git a/src/aiida/cmdline/commands/cmd_devel.py b/src/aiida/cmdline/commands/cmd_devel.py index d42f0f1451..25fa2cb6d0 100644 --- a/src/aiida/cmdline/commands/cmd_devel.py +++ b/src/aiida/cmdline/commands/cmd_devel.py @@ -39,6 +39,7 @@ def devel_check_load_time(): from aiida.manage import get_manager loaded_aiida_modules = [key for key in sys.modules if key.startswith('aiida.')] + # print(loaded_aiida_modules) aiida_modules_str = '\n- '.join(sorted(loaded_aiida_modules)) echo.echo_info(f'aiida modules loaded:\n- {aiida_modules_str}') diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index 3f37645e5e..5997897179 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -114,7 +114,11 @@ def process_list( builder = CalculationQueryBuilder() filters = builder.get_filters(all_entries, process_state, process_label, paused, exit_status, failed) query_set = builder.get_query_set( - relationships=relationships, filters=filters, order_by={order_by: order_dir}, past_days=past_days, limit=limit + relationships=relationships, + filters=filters, + order_by={order_by: order_dir}, + past_days=past_days, + limit=limit, ) projected = builder.get_projected(query_set, projections=project) headers = projected.pop(0) @@ -161,7 +165,10 @@ def process_list( slots_per_worker = ctx.obj.config.get_option('daemon.worker_process_slots', scope=ctx.obj.profile.name) active_processes = ( QueryBuilder() - .append(ProcessNode, filters={'attributes.process_state': {'in': ('created', 'waiting', 'running')}}) + .append( + ProcessNode, + filters={'attributes.process_state': {'in': ('created', 'waiting', 'running')}}, + ) .count() ) available_slots = active_workers * slots_per_worker @@ -227,7 +234,13 @@ def process_call_root(processes): @verdi_process.command('report') @arguments.PROCESSES() @options.MOST_RECENT_NODE() -@click.option('-i', '--indent-size', type=int, default=2, help='Set the number of spaces to indent each level by.') +@click.option( + '-i', + '--indent-size', + type=int, + default=2, + help='Set the number of spaces to indent each level by.', +) @click.option( '-l', '--levelname', @@ -236,12 +249,21 @@ def process_call_root(processes): help='Filter the 
results by name of the log level.', ) @click.option( - '-m', '--max-depth', 'max_depth', type=int, default=None, help='Limit the number of levels to be printed.' + '-m', + '--max-depth', + 'max_depth', + type=int, + default=None, + help='Limit the number of levels to be printed.', ) @decorators.with_dbenv() def process_report(processes, most_recent_node, levelname, indent_size, max_depth): """Show the log report for one or multiple processes.""" - from aiida.cmdline.utils.common import get_calcjob_report, get_process_function_report, get_workchain_report + from aiida.cmdline.utils.common import ( + get_calcjob_report, + get_process_function_report, + get_workchain_report, + ) from aiida.orm import CalcFunctionNode, CalcJobNode, WorkChainNode, WorkFunctionNode if processes and most_recent_node: @@ -266,9 +288,20 @@ def process_report(processes, most_recent_node, levelname, indent_size, max_dept @verdi_process.command('status') @options.MOST_RECENT_NODE() -@click.option('-c', '--call-link-label', 'call_link_label', is_flag=True, help='Include the call link label if set.') @click.option( - '-m', '--max-depth', 'max_depth', type=int, default=None, help='Limit the number of levels to be printed.' + '-c', + '--call-link-label', + 'call_link_label', + is_flag=True, + help='Include the call link label if set.', +) +@click.option( + '-m', + '--max-depth', + 'max_depth', + type=int, + default=None, + help='Limit the number of levels to be printed.', ) @arguments.PROCESSES() def process_status(call_link_label, most_recent_node, max_depth, processes): @@ -300,7 +333,10 @@ def process_kill(processes, all_entries, timeout, wait): from aiida.engine.processes import control if processes and all_entries: - raise click.BadOptionUsage('all', 'cannot specify individual processes and the `--all` flag at the same time.') + raise click.BadOptionUsage( + 'all', + 'cannot specify individual processes and the `--all` flag at the same time.', + ) if all_entries: click.confirm('Are you sure you want to kill all processes?', abort=True) @@ -308,7 +344,13 @@ def process_kill(processes, all_entries, timeout, wait): with capture_logging() as stream: try: message = 'Killed through `verdi process kill`' - control.kill_processes(processes, all_entries=all_entries, timeout=timeout, wait=wait, message=message) + control.kill_processes( + processes, + all_entries=all_entries, + timeout=timeout, + wait=wait, + message=message, + ) except control.ProcessTimeoutException as exception: echo.echo_critical(f'{exception}\n{REPAIR_INSTRUCTIONS}') @@ -327,12 +369,21 @@ def process_pause(processes, all_entries, timeout, wait): from aiida.engine.processes import control if processes and all_entries: - raise click.BadOptionUsage('all', 'cannot specify individual processes and the `--all` flag at the same time.') + raise click.BadOptionUsage( + 'all', + 'cannot specify individual processes and the `--all` flag at the same time.', + ) with capture_logging() as stream: try: message = 'Paused through `verdi process pause`' - control.pause_processes(processes, all_entries=all_entries, timeout=timeout, wait=wait, message=message) + control.pause_processes( + processes, + all_entries=all_entries, + timeout=timeout, + wait=wait, + message=message, + ) except control.ProcessTimeoutException as exception: echo.echo_critical(f'{exception}\n{REPAIR_INSTRUCTIONS}') @@ -351,7 +402,10 @@ def process_play(processes, all_entries, timeout, wait): from aiida.engine.processes import control if processes and all_entries: - raise click.BadOptionUsage('all', 
'cannot specify individual processes and the `--all` flag at the same time.')
+        raise click.BadOptionUsage(
+            'all',
+            'cannot specify individual processes and the `--all` flag at the same time.',
+        )
 
     with capture_logging() as stream:
         try:
@@ -426,7 +480,11 @@ def process_repair(manager, broker, dry_run):
     daemon worker to complete it and will effectively be "stuck". Any process task that does not correspond to an active
     process is useless and should be discarded. Finally, duplicate process tasks are also problematic and are discarded.
     """
-    from aiida.engine.processes.control import get_active_processes, get_process_tasks, iterate_process_tasks
+    from aiida.engine.processes.control import (
+        get_active_processes,
+        get_process_tasks,
+        iterate_process_tasks,
+    )
 
     active_processes = get_active_processes(project='id')
     process_tasks = get_process_tasks(broker)
@@ -481,3 +539,85 @@
         if pid not in set_process_tasks:
             process_controller.continue_process(pid)
             echo.echo_report(f'Revived process `{pid}`')
+
+
+@verdi_process.command('dump')
+@arguments.PROCESS()
+@options.PATH()
+@options.NO_NODE_INPUTS()
+@options.INCLUDE_ATTRIBUTES()
+@options.INCLUDE_EXTRAS()
+@options.USE_PRESUBMIT()
+@options.OVERWRITE()
+def dump(
+    process,
+    path,
+    no_node_inputs,
+    include_attributes,
+    include_extras,
+    use_presubmit,
+    overwrite,
+) -> None:
+    """Dump files involved in the execution of a process.
+
+    Child simulations/workflows (also called `CalcJob`s and `WorkChain`s in AiiDA jargon) run by the parent workflow are
+    contained in the directory tree as sub-folders and are sorted by their creation time. The directory tree thus
+    mirrors the logical execution of the workflow, which can also be queried by running `verdi process status
+    <pk>` on the command line.
+
+    By default, input and output files of each simulation can be found in the corresponding "raw_inputs" and
+    "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution
+    settings). Additional input files (depending on the type of calculation) are placed in the "node_inputs" directory.
+
+    When using the `--use-presubmit` command line option, the folder created for each individual simulation should, in
+    principle, allow for direct resubmission, as it mirrors the (remote) folder that was created by AiiDA to execute the
+    job. However, this option requires the relevant AiiDA plugin to be installed, so it is disabled by default. Also
+    note that intermediate files might be missing, so for a multi-step workflow, each step would still have to be run
+    separately.
+
+    Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant AiiDA
+    node data for further inspection.
+    """
+
+    from aiida.orm import CalcJobNode
+    from aiida.tools.dumping.processes import (
+        ProcessNodeYamlDumper,
+        generate_default_dump_path,
+        make_dump_readme,
+        workchain_dump,
+    )
+
+    # Allow the command to be run with `CalcJob`s and `WorkChain`s
+    if isinstance(process, CalcJobNode):
+        echo.echo_warning('Command called on `CalcJob`. Will dump, but you can also use `verdi calcjob dump` instead.')
+
+    # Generate default parent folder
+    if str(path) == '.':
+        output_path = generate_default_dump_path(process_node=process)
+    else:
+        output_path = path.resolve()
+
+    # Instantiate YamlDumper
+    processnode_dumper = ProcessNodeYamlDumper(include_attributes=include_attributes, include_extras=include_extras)
+
+    process_dumped = workchain_dump(
+        process_node=process,
+        output_path=output_path,
+        no_node_inputs=no_node_inputs,
+        use_presubmit=use_presubmit,
+        node_dumper=processnode_dumper,
+        overwrite=overwrite,
+    )
+
+    # Create README in parent directory
+    # Done after dumping, so that the dumping directory exists. The dumping directory is created within the
+    # calcjob_dump and workchain_dump functions such that they can also be used from within the Python API, not just
+    # via verdi
+    make_dump_readme(output_path=output_path, process_node=process)
+
+    # Communicate success/failure of dumping
+    if process_dumped:
+        echo.echo_success(
+            f'Raw files for {process.__class__.__name__} <{process.pk}> dumped successfully in `{output_path}`.'
+        )
+    else:
+        echo.echo_report(f'Problem dumping {process.__class__.__name__} <{process.pk}>.')
diff --git a/src/aiida/cmdline/params/options/main.py b/src/aiida/cmdline/params/options/main.py
index 72545b2a9f..4bb5beb4b9 100644
--- a/src/aiida/cmdline/params/options/main.py
+++ b/src/aiida/cmdline/params/options/main.py
@@ -8,6 +8,8 @@
 ###########################################################################
 """Module with pre-defined reusable commandline options that can be used as `click` decorators."""
 
+import pathlib
+
 import click
 
 from aiida.brokers.rabbitmq.defaults import BROKER_DEFAULTS
@@ -66,6 +68,8 @@
     'GROUP_CLEAR',
     'HOSTNAME',
     'IDENTIFIER',
+    'INCLUDE_ATTRIBUTES',
+    'INCLUDE_EXTRAS',
     'INPUT_FORMAT',
     'INPUT_PLUGIN',
     'LABEL',
@@ -73,10 +77,13 @@
     'MOST_RECENT_NODE',
     'NODE',
     'NODES',
+    'NO_NODE_INPUTS',
     'NON_INTERACTIVE',
     'OLDER_THAN',
     'ORDER_BY',
     'ORDER_DIRECTION',
+    'OVERWRITE',
+    'PATH',
     'PAST_DAYS',
     'PAUSED',
     'PORT',
@@ -102,6 +109,7 @@
     'USER_FIRST_NAME',
     'USER_INSTITUTION',
     'USER_LAST_NAME',
+    'USE_PRESUBMIT',
     'VERBOSITY',
     'VISUALIZATION_FORMAT',
     'WAIT',
@@ -743,3 +751,58 @@ def set_log_level(_ctx, _param, value):
     is_flag=True,
     help='Print the full traceback in case an exception is raised.',
 )
+
+PATH = OverridableOption(
+    '-p',
+    '--path',
+    type=click.Path(path_type=pathlib.Path),
+    default=pathlib.Path(),
+    show_default=False,
+    help='The directory in which the dumping folder will be created.',
+)
+
+NO_NODE_INPUTS = OverridableOption(
+    '--no-node-inputs',
+    '-n',
+    is_flag=True,
+    default=False,
+    show_default=True,
+    help='Turn off dumping of the input nodes of the `CalcJob`(s).',
+)
+
+INCLUDE_ATTRIBUTES = OverridableOption(
+    '-a',
+    '--include-attributes',
+    is_flag=True,
+    default=False,
+    show_default=True,
+    help='Include attributes in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.',
+)
+
+INCLUDE_EXTRAS = OverridableOption(
+    '-e',
+    '--include-extras',
+    is_flag=True,
+    default=False,
+    show_default=True,
+    help='Include extras in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.',
+)
+
+USE_PRESUBMIT = OverridableOption(
+    '--use-presubmit',
+    '-u',
+    is_flag=True,
+    default=False,
+    show_default=True,
+    help="""Use the `presubmit` method for dumping the files of the `CalcJob`. 
Note: this requires the corresponding + aiida-plugin to be installed.""", +) + +OVERWRITE = OverridableOption( + '--overwrite', + '-o', + is_flag=True, + default=False, + show_default=True, + help="""Overwrite directory if it already exists.""", +) diff --git a/src/aiida/engine/daemon/execmanager.py b/src/aiida/engine/daemon/execmanager.py index 305dd174b7..2eb6a5ff33 100644 --- a/src/aiida/engine/daemon/execmanager.py +++ b/src/aiida/engine/daemon/execmanager.py @@ -25,7 +25,7 @@ from aiida.common import AIIDA_LOGGER, exceptions from aiida.common.datastructures import CalcInfo, FileCopyOperation -from aiida.common.folders import SandboxFolder +from aiida.common.folders import Folder, SandboxFolder from aiida.common.links import LinkType from aiida.engine.processes.exit_code import ExitCode from aiida.manage.configuration import get_config_option @@ -66,7 +66,7 @@ def upload_calculation( node: CalcJobNode, transport: Transport, calc_info: CalcInfo, - folder: SandboxFolder, + folder: Folder, inputs: Optional[MappingType[str, Any]] = None, dry_run: bool = False, ) -> RemoteData | None: diff --git a/src/aiida/tools/dumping/__init__.py b/src/aiida/tools/dumping/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py new file mode 100644 index 0000000000..0bc76162d8 --- /dev/null +++ b/src/aiida/tools/dumping/processes.py @@ -0,0 +1,455 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +"""Functions for dumping of workchains and calcjobs.""" + +from __future__ import annotations + +import textwrap +from pathlib import Path +from typing import Union + +import yaml + +from aiida.cmdline.utils import echo +from aiida.common import LinkType +from aiida.common.exceptions import NotExistentAttributeError +from aiida.common.folders import Folder +from aiida.engine.daemon.execmanager import upload_calculation +from aiida.engine.processes.calcjobs import CalcJob +from aiida.engine.utils import instantiate_process +from aiida.manage import get_manager +from aiida.orm import ( + CalcFunctionNode, + CalcJobNode, + FolderData, + ProcessNode, + SinglefileData, + WorkChainNode, + WorkFunctionNode, +) +from aiida.orm.utils import LinkTriple +from aiida.transports.plugins.local import LocalTransport + + +class ProcessNodeYamlDumper: + """Utility class to dump selected `ProcessNode` properties and, optionally, attributes and extras to yaml.""" + + NODE_PROPERTIES = [ + 'label', + 'description', + 'pk', + 'uuid', + 'ctime', + 'mtime', + 'node_type', + 'process_type', + 'is_finished_ok', + ] + + USER_PROPERTIES = ('first_name', 'last_name', 'email', 'institution') + + COMPUTER_PROPERTIES = ('label', 'hostname', 'scheduler_type', 'transport_type') + + def __init__(self, include_attributes: bool = True, include_extras: bool = True): + self.include_attributes = include_attributes + self.include_extras = include_extras + + def dump_yaml( + self, + process_node: ProcessNode, + output_path: Path, + output_filename: str = '.aiida_node_metadata.yaml', + ) -> None: + """ + Dump the selected `ProcessNode` properties, 
attributes, and extras to a yaml file. + + :param process_node: The ProcessNode to dump. + :param output_path: The path to the directory where the yaml file will be saved. + :param output_filename: The name of the output yaml file. Defaults to `.aiida_node_metadata.yaml`. + :return: None + """ + + node_dict = {} + metadata_dict = {} + + # Add actual node `@property`s to dictionary + for metadata_property in self.NODE_PROPERTIES: + metadata_dict[metadata_property] = getattr(process_node, metadata_property) + + node_dict['Node data'] = metadata_dict + + # Add user data + try: + node_dbuser = process_node.user + user_dict = {} + for user_property in self.USER_PROPERTIES: + user_dict[user_property] = getattr(node_dbuser, user_property) + node_dict['User data'] = user_dict + except AttributeError: + pass + + # Add computer data + try: + node_dbcomputer = process_node.computer + computer_dict = {} + for computer_property in self.COMPUTER_PROPERTIES: + computer_dict[computer_property] = getattr(node_dbcomputer, computer_property) + node_dict['Computer data'] = computer_dict + except AttributeError: + pass + + # Add node attributes + if self.include_attributes is True: + node_attributes = process_node.base.attributes.all + node_dict['Node attributes'] = node_attributes + + # Add node extras + if self.include_extras is True: + node_extras = process_node.base.extras.all + if node_extras: + node_dict['Node extras'] = node_extras + + # Dump to file + output_file = output_path.resolve() / output_filename + with open(output_file, 'w') as handle: + yaml.dump(node_dict, handle, sort_keys=False) + + +# Utility functions +def make_dump_readme(process_node: ProcessNode, output_path: Path): + """Generate README file in main dumping directory. + + :param process_node: CalcJob or WorkChain Node. + :type process_node: ProcessNode + :param output_path: Output path for dumping. + :type output_path: Path + + """ + _readme_string = textwrap.dedent( + f"""\ + This directory contains the files involved in the simulation/workflow `{process_node.process_label} <{process_node.pk}>` run with AiiDA. + + Child simulations/workflows (also called `CalcJob`s and `WorkChain`s in AiiDA jargon) run by the parent workflow are + contained in the directory tree as sub-folders and are sorted by their creation time. The directory tree thus + mirrors the logical execution of the workflow, which can also be queried by running `verdi process status + {process_node.pk}` on the command line. + + By default, input and output files of each simulation can be found in the corresponding "raw_inputs" and + "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution + settings). Additional input files (depending on the type of calculation) are placed in the "node_inputs". + + When using the `--use-presubmit` command line option, the folder created for each individual simulation should, in + principle, allow for direct resubmission, as it mirrors the (remote) folder that was created by AiiDA to execute the + job. However, this option requires the relevant AiiDA plugin to be installed, so it is disabled by default. Also + note that intermediate files might be missing, so for a multi-step workflow, each step would still have to be run + separately. 
+ + Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant AiiDA + node data for further inspection.""" # noqa: E501 + ) + + # TODO: Add outputs of `verdi process (status|report|show?)` + + with open(output_path / 'README', 'w') as handle: + handle.write(_readme_string) + + +def generate_default_dump_path(process_node: Union[WorkChainNode, CalcJobNode]) -> Path: + """Simple helper function to generate the default parent-dumping directory if none given. + + This function is not called for the sub-calls of `calcjob_dump` or during the recursive `workchain_dump` as it just + creates the default parent folder for the dumping, if no name is given. + + :param process_node: The `ProcessNode` for which the directory is created. + :type process_node: Union[WorkChainNode, CalcJobNode] + :return: The created parent dump path. + :rtype: Path + """ + + try: + return Path(f'dump-{process_node.process_label}-{process_node.pk}') + except AttributeError: + # ? This case came up during testing, not sure how relevant it actually is + return Path(f'dump-{process_node.process_type}-{process_node.pk}') + + +# ? Could move code from `generate_calcjob_node_io` here to normal function +def attach_files_to_calcjob(): + pass + + +# ? Could move this to `cmdline/utils` +def validate_make_dump_path( + path: Path = Path(), + overwrite: bool = False, +) -> Path: + """ + Create default dumping directory for a given process node and return it as absolute path. + + :param process_node: The `ProcessNode` for which the directory is created. + :type process_node: Union[WorkChainNode, CalcJobNode] + :param path: The base path for the dump. Defaults to the current directory. + :type path: Path + :return: The created dump path. + :rtype: Path + """ + import shutil + + output_path = path.resolve() + + # ? Use of `echo.echo_` only when running via `verdi`? -> I only see it used in the `cmd_` files. + if str(path) == '.' and overwrite: + echo.echo_critical('Path not set, defaults to CWD. Will not delete here for safety.') + return output_path + + if output_path.is_dir() and any(output_path.iterdir()): + if overwrite: + # ? This might be a bit dangerous -> Check for it not being CWD enough? + # ? Added check for README in folder to decrease chances of deleting some other path + if (output_path / 'README').is_file(): # Also check for presence of README for safety + echo.echo_report(f'Overwrite set to true, will overwrite directory `{output_path}`.') + shutil.rmtree(output_path) + else: + echo.echo_critical(f'Something went wrong. 
Manually remove existing `{output_path}` and dump again.') + # echo.echo_critical(f'No README present in "{output_path}" Manually remove and dump again.') + else: + echo.echo_critical(f'Path `{output_path}` already exists and overwrite set to False.') + + output_path.mkdir(parents=True, exist_ok=False) + + return output_path + + +def generate_node_input_label(index: int, link_triple: LinkTriple) -> str: + """Small helper function to generate the directory label for node inputs.""" + node = link_triple.node + link_label = link_triple.link_label + + # Generate directories with naming scheme akin to `verdi process status` + if link_label != 'CALL' and not link_label.startswith('iteration_'): + node_label = f'{index:02d}-{link_label}' + else: + node_label = f'{index:02d}' + + try: + process_label = node.process_label + if process_label is not None: + node_label += f'-{process_label}' + + except AttributeError: + process_type = node.process_type + if process_type is not None: + node_label += f'-{process_type}' + + # ? Add pk also to the sub-steps or only the parent dumping directory? + # node_label += f'-{node.pk}' + + return node_label + + +def calcjob_dump( + calcjob_node: CalcJobNode, + output_path: Path = Path(), + no_node_inputs: bool = False, + use_presubmit: bool = False, + node_dumper: ProcessNodeYamlDumper | None = None, + overwrite: bool = True, +) -> bool: + """ + Dump the contents of a CalcJobNode to a specified output path. + + :param calcjob_node: The CalcJobNode to be dumped. + :param output_path: The path where the dumped contents will be stored. + :param no_node_inputs: If True, do not dump the inputs of the CalcJobNode. + :param use_presubmit: If True, use the `prepare_for_submission` method to prepare the calculation for + submission. If False, use the retrieved outputs and raw inputs. + :return: None + """ + + output_path = validate_make_dump_path(path=output_path, overwrite=overwrite) + + if not use_presubmit: + # ? Outputs obtained via retrieved and should not be present when using `prepare_for_submission` as it puts the + # ? calculation in a state to be submitted ?! + calcjob_node.base.repository.copy_tree(output_path / Path('raw_inputs')) + try: + calcjob_node.outputs.retrieved.copy_tree(output_path / Path('raw_outputs')) + + # Might not have an output with link label `retrieved` + except NotExistentAttributeError: + pass + + if not no_node_inputs: + calcjob_node_inputs_dump(calcjob_node=calcjob_node, output_path=output_path) + + else: + try: + calcjob_presubmit_dump(calcjob_node=calcjob_node, output_path=output_path) + except ValueError: + # raise + echo.echo_error( + 'Error when trying to get a restart-builder. Do you have the relevant aiida-plugin installed?' + ) + return False + + # This will eventually be replaced once pydantic backend PR merged + if node_dumper is None: + node_dumper = ProcessNodeYamlDumper() + node_dumper.dump_yaml(process_node=calcjob_node, output_path=output_path) + + return True + + +def workchain_dump( + process_node: Union[WorkChainNode, CalcJobNode], + output_path: Path = Path(), + no_node_inputs: bool = False, + use_presubmit: bool = False, + node_dumper: ProcessNodeYamlDumper | None = None, + overwrite: bool = True, +) -> bool: + """ + Dumps all data involved in a `WorkChainNode`, including its outgoing links. + + Note that if an outgoing link is again a `WorkChainNode`, the function recursively calls itself, while files are + only actually created when a `CalcJobNode` is reached. 
+
+    :param process_node: The parent process node to be dumped. It can be either a `WorkChainNode` or a `CalcJobNode`.
+    :param output_path: The main output path where the directory tree will be created.
+    :param no_node_inputs: If True, do not include file or folder inputs in the dump. Defaults to False.
+    :param use_presubmit: If True, use the `prepare_for_submission` method to get the inputs of the
+        CalcJobNode. Defaults to False.
+    :param node_dumper: The ProcessNodeYamlDumper instance to use for dumping node metadata. If not provided, a new
+        instance will be created. Defaults to None.
+    :return: True if all child processes were dumped successfully, False otherwise.
+    """
+
+    # Keep track of dumping success to be able to communicate it outwards
+    all_processes_dumped = True
+
+    # ? Realized during testing: If no path is provided, an additional directory is only created for the sub-workchain,
+    # ? but one should also be created here, if the function is imported normally and used in Python scripts
+    output_path = validate_make_dump_path(path=output_path, overwrite=overwrite)
+
+    # This will eventually be replaced once pydantic backend PR merged
+    if node_dumper is None:
+        node_dumper = ProcessNodeYamlDumper()
+    node_dumper.dump_yaml(process_node=process_node, output_path=output_path.resolve())
+
+    # Don't increment index for `ProcessNodes` that don't (always?) have file IO
+    # (`CalcFunctionNodes`/`WorkFunctionNodes`), such as `create_kpoints_from_distance`
+    called_links = process_node.base.links.get_outgoing(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all()
+    called_links = [
+        called_link
+        for called_link in called_links
+        if not isinstance(called_link.node, (CalcFunctionNode, WorkFunctionNode))
+    ]
+
+    for index, link_triple in enumerate(sorted(called_links, key=lambda link_triple: link_triple.node.ctime), start=1):
+        child_node = link_triple.node
+        child_label = generate_node_input_label(index=index, link_triple=link_triple)
+
+        output_path_child = output_path.resolve() / child_label
+
+        # Recursive function call for `WorkChainNode`
+        if isinstance(child_node, WorkChainNode):
+            process_dumped = workchain_dump(
+                process_node=child_node,
+                output_path=output_path_child,
+                no_node_inputs=no_node_inputs,
+                use_presubmit=use_presubmit,
+                node_dumper=node_dumper,
+            )
+
+        # Dump for `CalcJobNode`
+        elif isinstance(child_node, CalcJobNode):
+            process_dumped = calcjob_dump(
+                calcjob_node=child_node,
+                output_path=output_path_child,
+                no_node_inputs=no_node_inputs,
+                use_presubmit=use_presubmit,
+                node_dumper=node_dumper,
+            )
+
+        else:
+            process_dumped = False
+
+        all_processes_dumped = all_processes_dumped and process_dumped
+
+    return all_processes_dumped
+
+
+# Separate functions for CalcJob dumping using pre_submit, as well as for the node_inputs
+def calcjob_node_inputs_dump(calcjob_node: CalcJobNode, output_path: Path, inputs_relpath: Path = Path('node_inputs')):
+    """
+    Dump inputs of a `CalcJobNode` of type `SinglefileData` and `FolderData`.
+
+    :param calcjob_node: The `CalcJobNode` whose inputs will be dumped.
+    :type calcjob_node: CalcJobNode
+    :param output_path: The path where the inputs will be dumped.
+    :type output_path: Path
+    """
+    dump_types = (SinglefileData, FolderData)
+
+    # ? Not using the `node_class` argument of `get_incoming`, as it does not actually retrieve, e.g. a `UpfData` node
+    # ? (due to planned deprecation?)
+    # ? 
Instead, check for isinstance of `SinglefileData` + input_node_triples = calcjob_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC) + + for input_node_triple in input_node_triples: + # Select only repositories that hold objects and are of the selected dump_types + if len(input_node_triple.node.base.repository.list_objects()) > 0 and isinstance( + input_node_triple.node, dump_types + ): + # input_node_path = output_path / Path('node_inputs') / Path(input_node_triple.link_label) + input_node_path = output_path / inputs_relpath / Path(*input_node_triple.link_label.split('__')) + input_node_triple.node.base.repository.copy_tree(input_node_path) + + +def calcjob_presubmit_dump(calcjob_node: CalcJobNode, output_path: Path): + """ + Dump inputs of a `CalcJobNode` using the `presubmit` function. + + :param process: The `CalcJobNode` whose inputs need to be dumped. + :type process: CalcJobNode + :param output_path: The path where the inputs will be dumped. + :type output_path: Path + """ + + builder_restart = calcjob_node.get_builder_restart() + runner = get_manager().get_runner() + calcjob_process: CalcJob = instantiate_process(runner, builder_restart) + + # `presubmit` calls `prepare_for_submission` internally + calc_info = calcjob_process.presubmit(folder=Folder(abspath=output_path)) + + try: + # Hackish way to modify local copy list so that the pseudos are actually dumped where I want them to. Otherwise + # they + # end up in home... + local_copy_list = calc_info['local_copy_list'].copy() + # print('LOCAL_COPY_LIST', local_copy_list) + new_local_copy_list = [tuple(list(local_copy_list[0][:2]) + [str(output_path / local_copy_list[0][-1])])] + calc_info['local_copy_list'] = new_local_copy_list + # print('NEW_LOCAL_COPY_LIST', new_local_copy_list) + except IndexError: + # This happens if `local_copy_list` is empty + pass + + local_transport = LocalTransport().open() + new_calcjob_node = calcjob_process.node # type: ignore + new_calcjob_node: CalcJobNode = calcjob_process.node + upload_calculation( + node=new_calcjob_node, + transport=local_transport, + calc_info=calc_info, + folder=Folder(abspath=output_path), + inputs=calcjob_process.inputs, + dry_run=False, + ) diff --git a/tests/conftest.py b/tests/conftest.py index 936794b5e2..c2fa03dd01 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,6 +16,7 @@ import copy import dataclasses +import io import os import pathlib import types @@ -25,7 +26,9 @@ import click import pytest from aiida import get_profile +from aiida.common.links import LinkType from aiida.manage.configuration import Profile, get_config, load_profile +from aiida.orm import FolderData, SinglefileData if t.TYPE_CHECKING: from aiida.manage.configuration.config import Config @@ -671,3 +674,137 @@ def reset_log_level(): log.CLI_ACTIVE = None log.CLI_LOG_LEVEL = None log.configure_logging(with_orm=True) + + +@pytest.fixture +def generate_calcjob_node_io(): + def _generate_calcjob_node_io( + attach_repository: bool = True, attach_node_inputs: bool = True, attach_outputs: bool = True, entry_point=None + ): + # `entry_point='aiida.calculations:core.arithmetic.add'` leads to error due to the unknown names of the attached + # nodes, which are not defined in the `Calculation` itself + # calcjob_node = generate_calculation_node(entry_point=entry_point) + + from aiida.orm import CalcJobNode + + calcjob_node = CalcJobNode() + + filename: str = 'file.txt' + + # Attach raw inputs to node repository + if attach_repository: + 
calcjob_node.base.repository.put_object_from_filelike(io.StringIO('a'), path=filename) + + # Attach node inputs + if attach_node_inputs: + # Set up labels + singlefiledata_linklabel: str = 'singlefile_input' + folderdata_linklabel: str = 'folderdata_input' + + # Generate SinglefileData, Folderdata, and CalcJobNode nodes + singlefiledata_node = SinglefileData.from_string(content='a', filename=filename) + singlefiledata_node.store() + + folderdata_node = FolderData() + folderdata_node.put_object_from_filelike(io.StringIO('a'), path=pathlib.Path('relative_path') / filename) + folderdata_node.store() + + calcjob_node.base.links.add_incoming( + singlefiledata_node, + link_type=LinkType.INPUT_CALC, + link_label=singlefiledata_linklabel, + ) + + calcjob_node.base.links.add_incoming( + folderdata_node, link_type=LinkType.INPUT_CALC, link_label=folderdata_linklabel + ) + + # Attach `retrieved` outputs + if attach_outputs: + # Not storing gives: + # `aiida.common.exceptions.ModificationNotAllowed: Cannot store because source node of link triple + # LinkTriple(...) is not stored`` + calcjob_node.store() + + retrieved_node = FolderData() + retrieved_node.put_object_from_filelike(io.StringIO('a'), path=filename) + retrieved_node.base.links.add_incoming(calcjob_node, link_type=LinkType.CREATE, link_label='retrieved') + retrieved_node.store() + + return calcjob_node + + return _generate_calcjob_node_io + + +@pytest.fixture +def generate_arithmetic_add_node(): + def _generate_arithmetic_add_node(computer): + from aiida.engine import run_get_node + from aiida.orm import InstalledCode, Int + from aiida.plugins import CalculationFactory + + arithmetic_add = CalculationFactory('core.arithmetic.add') + + add_inputs = { + 'x': Int(1), + 'y': Int(2), + 'code': InstalledCode(computer=computer, filepath_executable='/bin/bash'), + } + + _, add_node = run_get_node(arithmetic_add, **add_inputs) + + return add_node + + return _generate_arithmetic_add_node + + +@pytest.fixture +def generate_multiply_add_node(): + def _generate_multiply_add_node(computer): + from aiida.engine import run_get_node + from aiida.orm import InstalledCode, Int + from aiida.plugins import WorkflowFactory + + multiplyaddworkchain = WorkflowFactory('core.arithmetic.multiply_add') + + multiply_add_inputs = { + 'x': Int(1), + 'y': Int(2), + 'z': Int(3), + 'code': InstalledCode(computer=computer, filepath_executable='/bin/bash'), + } + + _, multiply_add_node = run_get_node(multiplyaddworkchain, **multiply_add_inputs) + + return multiply_add_node + + return _generate_multiply_add_node + + +@pytest.fixture +def generate_work_chain_io(generate_calcjob_node_io): + def _generate_work_chain_io(): + """Generate an instance of a `WorkChain` that contains a sub-`WorkChain` and a `CalcJob` with file io.""" + from aiida.orm import WorkChainNode + + wc_node = WorkChainNode() + wc_node_sub = WorkChainNode() + cj_node = generate_calcjob_node_io(attach_outputs=False) + + # wc_node.process_label = 'main' + # wc_node_sub.process_label = 'sub' + # cj_node.process_label = 'calc' + + # Add sub-workchain that calls a calcjob + wc_node_sub.base.links.add_incoming(wc_node, link_type=LinkType.CALL_WORK, link_label='sub_workchain') + cj_node.base.links.add_incoming(wc_node_sub, link_type=LinkType.CALL_CALC, link_label='calcjob') + + # Need to store nodes so that the relationships are being picked up in the `get_outgoing` call (for + # `get_incoming` they work without being stored) + wc_node.store() + wc_node_sub.store() + cj_node.store() + + return wc_node + + return 
_generate_work_chain_io diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py new file mode 100644 index 0000000000..f1ddf04997 --- /dev/null +++ b/tests/tools/dumping/test_processes.py @@ -0,0 +1,235 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +"""Tests for the dumping of ProcessNode data to disk.""" + +# ? For testing the dumping, one either needs to cd into the tmp_path, or pass the tmp_path as argument, otherwise, the +# ? files are dumped into src -> CWD from where the script is run. +# ? However, when one passes tmp_dir as output_path, no automatic path is created, as not at the default value is set, +# ? so str(output_path) == '.' is False + +import pathlib +import shutil +from pathlib import Path + +from aiida.tools.dumping.processes import ( + calcjob_dump, + calcjob_node_inputs_dump, + generate_default_dump_path, + generate_node_input_label, + workchain_dump, +) + +filename = 'file.txt' + +# Define some variables used for the dumping +node_inputs_relpath = 'node_inputs' +singlefiledata_linklabel = 'singlefile_input' +folderdata_linklabel = 'folderdata_input' +singlefiledata_path = pathlib.Path(f'{node_inputs_relpath}/{singlefiledata_linklabel}') +folderdata_path = pathlib.Path(f'{node_inputs_relpath}/{folderdata_linklabel}/relative_path') + + +# ? Move this somewhere else +def clean_tmp_path(tmp_path: Path): + """ + Recursively delete files and directories in a path, e.g. a temporary path used by pytest. + # ? 
This empties the directory, as intended for the general dumping directory, but doesn't delete it itself + """ + + for item in tmp_path.iterdir(): + if item.is_dir(): + shutil.rmtree(item) + else: + item.unlink() + + +def test_calcjob_node_inputs_dump(tmp_path, generate_calcjob_node_io): + """Test that dumping of CalcJob node inputs works correctly.""" + + calcjob_node = generate_calcjob_node_io() + + # Run the dumping + calcjob_node_inputs_dump(calcjob_node=calcjob_node, output_path=tmp_path, inputs_relpath=node_inputs_relpath) + + # Test the dumping results + singlefiledata_outputpath = pathlib.Path(tmp_path / singlefiledata_path) + singlefiledata_outputfile = singlefiledata_outputpath / filename + + folderdata_outputpath = pathlib.Path(tmp_path / folderdata_path) + folderdata_outputfile = folderdata_outputpath / filename + + assert singlefiledata_outputpath.is_dir() + assert singlefiledata_outputfile.is_file() + with open(singlefiledata_outputfile, 'r') as handle: + assert handle.read() == 'a' + + assert folderdata_outputpath.is_dir() + assert folderdata_outputfile.is_file() + with open(folderdata_outputfile, 'r') as handle: + assert handle.read() == 'a' + + +def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): + dump_path = tmp_path / 'calcjob_dump_io' + + singlefiledata_outputpath = pathlib.Path(dump_path / singlefiledata_path) + singlefiledata_outputfile = singlefiledata_outputpath / filename + + calcjob_node = generate_calcjob_node_io() + raw_input_file = dump_path / 'raw_inputs' / filename + raw_output_file = dump_path / 'raw_outputs' / filename + + # Normal dumping + result = calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path) + assert singlefiledata_outputfile.is_file() + + # Checking the actual content should be handled by `test_copy_tree` + assert raw_input_file.is_file() + assert raw_output_file.is_file() + assert result + + clean_tmp_path(tmp_path=tmp_path) + + # Don't dump the connected node inputs + result = calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path, no_node_inputs=True) + assert not singlefiledata_outputfile.is_file() + assert result + + clean_tmp_path(tmp_path=tmp_path) + + # use_presubmit -> Depends on implementation from aiida-plugin, so I cannot test specifically inside aiida-core + # Assert that `False` is returned if `use_presubmit` used, but no `process_type` has been set. 
This is the test case + # one gets from the fixture (f'no process type for Node<{self.pk}>: cannot recreate process class') + result = calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path, use_presubmit=True) + assert result is False + + clean_tmp_path(tmp_path=tmp_path) + + +def test_calcjob_dump_arithmetic_add(tmp_path, aiida_localhost, generate_arithmetic_add_node): + dump_path = tmp_path / 'calcjob_dump_arithmetic_add' + + # Now, generate `CalcJobNode` from ArithmeticAddCalculation + add_node = generate_arithmetic_add_node(computer=aiida_localhost) + + # Normal dumping of ArithmeticAddCalculation node + result = calcjob_dump(calcjob_node=add_node, output_path=dump_path) + assert result + + raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] + raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] + raw_input_files = [dump_path / 'raw_inputs' / raw_input_file for raw_input_file in raw_input_files] + raw_output_files = [dump_path / 'raw_outputs' / raw_output_file for raw_output_file in raw_output_files] + + assert all([raw_input_file.is_file() for raw_input_file in raw_input_files]) + assert all([raw_output_file.is_file() for raw_output_file in raw_output_files]) + + clean_tmp_path(tmp_path=tmp_path) + + # Dumping with `use_presubmit` -> Directory structure is different and output file not dumped + result = calcjob_dump(calcjob_node=add_node, output_path=dump_path, use_presubmit=True) + + assert result + assert Path(dump_path / '_aiidasubmit.sh').is_file() + assert Path(dump_path / 'aiida.in').is_file() + assert not Path(dump_path / 'aiida.out').is_file() + + +def test_workchain_dump_io(generate_work_chain_io, tmp_path): + wc_node = generate_work_chain_io() + + dump_parent_path = tmp_path / 'wc-dump-test' + + raw_input_path = '01-sub_workchain/01-calcjob/raw_inputs/file.txt' + singlefiledata_path = '01-sub_workchain/01-calcjob/node_inputs/singlefile_input/file.txt' + folderdata_path = '01-sub_workchain/01-calcjob/node_inputs/folderdata_input/relative_path/file.txt' + node_metadata_paths = [ + '.aiida_node_metadata.yaml', + '01-sub_workchain/.aiida_node_metadata.yaml', + '01-sub_workchain/01-calcjob/.aiida_node_metadata.yaml', + ] + # Don't test for `README` here, as this is only created when dumping is done via `verdi` + expected_files = [raw_input_path, singlefiledata_path, folderdata_path, *node_metadata_paths] + expected_files = [dump_parent_path / expected_file for expected_file in expected_files] + + # Here, when setting `output_path=tmp_path`, no parent directory for the parent workchain is created + # Therefore, go into tmp-directory used for testing, without specifying output path -> Closer to how people might + # actually use the function + result = workchain_dump(process_node=wc_node, output_path=dump_parent_path) + + assert result + assert all([expected_file.is_file() for expected_file in expected_files]) + + +def test_workchain_dump_multiply_add(tmp_path, generate_multiply_add_node, aiida_localhost): + # Still set directory fixed to make dump directory reproducible (it should be anyway, but contains e.g. 
the pk) + dump_parent_path = tmp_path / 'multiply_add-dump-test' + + # Now test for output from running MultiplyAddWorkChain + multiply_add_node = generate_multiply_add_node(computer=aiida_localhost) + + result = workchain_dump(process_node=multiply_add_node, output_path=dump_parent_path) + assert result + + raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] + raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] + raw_input_files = [ + dump_parent_path / '01-ArithmeticAddCalculation' / 'raw_inputs' / raw_input_file + for raw_input_file in raw_input_files + ] + raw_output_files = [ + dump_parent_path / '01-ArithmeticAddCalculation' / 'raw_outputs' / raw_output_file + for raw_output_file in raw_output_files + ] + + assert all([raw_input_file.is_file() for raw_input_file in raw_input_files]) + assert all([raw_output_file.is_file() for raw_output_file in raw_output_files]) + + +def test_generate_default_dump_path(generate_arithmetic_add_node, generate_multiply_add_node, aiida_localhost): + add_node = generate_arithmetic_add_node(computer=aiida_localhost) + multiply_add_node = generate_multiply_add_node(computer=aiida_localhost) + add_path = generate_default_dump_path(process_node=add_node) + multiply_add_path = generate_default_dump_path(process_node=multiply_add_node) + + # ? Possible to reset db here to make pks reproducible? + assert str(add_path) == f'dump-ArithmeticAddCalculation-{add_node.pk}' + assert str(multiply_add_path) == f'dump-MultiplyAddWorkChain-{multiply_add_node.pk}' + + +def test_generate_node_input_label(generate_multiply_add_node, generate_work_chain_io, aiida_localhost): + # Test with MultiplyAddWorkChain inputs and outputs + multiply_add_node = generate_multiply_add_node(computer=aiida_localhost) + input_triples = multiply_add_node.base.links.get_incoming().all() + input_labels = [generate_node_input_label(_, input_node) for _, input_node in enumerate(input_triples)] + assert input_labels == ['00-x', '01-y', '02-z', '03-code'] + + output_triples = multiply_add_node.base.links.get_outgoing().all() + output_labels = [generate_node_input_label(_, output_node) for _, output_node in enumerate(output_triples)] + assert output_labels == ['00-multiply', '01-ArithmeticAddCalculation', '02-result'] + + # Test with manually constructed, more complex workchain + wc_node = generate_work_chain_io() + wc_output_triples = wc_node.base.links.get_outgoing().all() + sub_wc_node = wc_output_triples[0].node + + output_triples = wc_output_triples + sub_wc_node.base.links.get_outgoing().all() + + output_labels = [generate_node_input_label(_, output_node) for _, output_node in enumerate(output_triples)] + assert output_labels == ['00-sub_workchain', '01-calcjob'] + + # ? Not really testing for more complex cases that actually contain 'CALL' or 'iteration_' here + + +def test_validate_make_dump_path(chdir_tmp_path): + pass + + +def test_dump_yaml(): + pass From d8bb81845a1ad2405d04f6ceb6084a6dffa5bcf0 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 11 Apr 2024 16:07:29 +0200 Subject: [PATCH 02/30] CLI: Add dumping functionality. 
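

Besides the new `verdi calcjob dump` and `verdi process dump` commands, the
dumping functions can also be imported from `aiida.tools.dumping.processes`
and used from the Python API. A minimal, illustrative sketch (the pk `1234`
is a placeholder for a terminated `WorkChainNode` in the loaded profile):

    from pathlib import Path

    from aiida import load_profile, orm
    from aiida.tools.dumping.processes import workchain_dump

    load_profile()
    # Placeholder pk: replace with the pk of an actual terminated workflow
    node = orm.load_node(1234)
    # Dump raw inputs/outputs and node inputs into an explicit directory;
    # returns True if all child processes were dumped successfully
    success = workchain_dump(process_node=node, output_path=Path('dump-example'), overwrite=True)
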
--- docs/source/reference/command_line.rst | 2 ++ src/aiida/cmdline/commands/cmd_devel.py | 1 - src/aiida/tools/dumping/processes.py | 3 +-- tests/tools/dumping/test_processes.py | 8 -------- 4 files changed, 3 insertions(+), 11 deletions(-) diff --git a/docs/source/reference/command_line.rst b/docs/source/reference/command_line.rst index 6822df9f0f..6dafac22c3 100644 --- a/docs/source/reference/command_line.rst +++ b/docs/source/reference/command_line.rst @@ -48,6 +48,7 @@ Below is a list with all available subcommands. Commands: cleanworkdir Clean all content of all output remote folders of calcjobs. + dump Dump files involved in the execution of a `CalcJob`. gotocomputer Open a shell in the remote folder on the calcjob. inputcat Show the contents of one of the calcjob input files. inputls Show the list of the generated calcjob input files. @@ -367,6 +368,7 @@ Below is a list with all available subcommands. Commands: call-root Show root process of the call stack for the given processes. + dump Dump files involved in the execution of a process. kill Kill running processes. list Show a list of running or terminated processes. pause Pause running processes. diff --git a/src/aiida/cmdline/commands/cmd_devel.py b/src/aiida/cmdline/commands/cmd_devel.py index 25fa2cb6d0..d42f0f1451 100644 --- a/src/aiida/cmdline/commands/cmd_devel.py +++ b/src/aiida/cmdline/commands/cmd_devel.py @@ -39,7 +39,6 @@ def devel_check_load_time(): from aiida.manage import get_manager loaded_aiida_modules = [key for key in sys.modules if key.startswith('aiida.')] - # print(loaded_aiida_modules) aiida_modules_str = '\n- '.join(sorted(loaded_aiida_modules)) echo.echo_info(f'aiida modules loaded:\n- {aiida_modules_str}') diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 0bc76162d8..f1422a5125 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -424,7 +424,7 @@ def calcjob_presubmit_dump(calcjob_node: CalcJobNode, output_path: Path): builder_restart = calcjob_node.get_builder_restart() runner = get_manager().get_runner() - calcjob_process: CalcJob = instantiate_process(runner, builder_restart) + calcjob_process: CalcJob = instantiate_process(runner, builder_restart) # type: ignore[assignment] # `presubmit` calls `prepare_for_submission` internally calc_info = calcjob_process.presubmit(folder=Folder(abspath=output_path)) @@ -443,7 +443,6 @@ def calcjob_presubmit_dump(calcjob_node: CalcJobNode, output_path: Path): pass local_transport = LocalTransport().open() - new_calcjob_node = calcjob_process.node # type: ignore new_calcjob_node: CalcJobNode = calcjob_process.node upload_calculation( node=new_calcjob_node, diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index f1ddf04997..ed3f47c4f0 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -205,14 +205,6 @@ def test_generate_default_dump_path(generate_arithmetic_add_node, generate_multi def test_generate_node_input_label(generate_multiply_add_node, generate_work_chain_io, aiida_localhost): # Test with MultiplyAddWorkChain inputs and outputs - multiply_add_node = generate_multiply_add_node(computer=aiida_localhost) - input_triples = multiply_add_node.base.links.get_incoming().all() - input_labels = [generate_node_input_label(_, input_node) for _, input_node in enumerate(input_triples)] - assert input_labels == ['00-x', '01-y', '02-z', '03-code'] - - output_triples = 
multiply_add_node.base.links.get_outgoing().all() - output_labels = [generate_node_input_label(_, output_node) for _, output_node in enumerate(output_triples)] - assert output_labels == ['00-multiply', '01-ArithmeticAddCalculation', '02-result'] # Test with manually constructed, more complex workchain wc_node = generate_work_chain_io() From 19922fd39b5a5e172257a93787478678cd215602 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 11 Apr 2024 18:00:20 +0200 Subject: [PATCH 03/30] Fix process dump and docstring formatting --- src/aiida/cmdline/commands/cmd_process.py | 35 +++++++++++++++-------- src/aiida/tools/__init__.py | 1 + src/aiida/tools/dumping/__init__.py | 13 +++++++++ src/aiida/tools/dumping/processes.py | 18 ++++-------- 4 files changed, 43 insertions(+), 24 deletions(-) diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index 5997897179..bd4024adef 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -582,15 +582,12 @@ def dump( from aiida.orm import CalcJobNode from aiida.tools.dumping.processes import ( ProcessNodeYamlDumper, + calcjob_dump, generate_default_dump_path, make_dump_readme, workchain_dump, ) - # Allow the command to be run with `CalcJob`s and `WorkChain`s - if isinstance(process, CalcJobNode): - echo.echo_warning('Command called on `CalcJob`. Will dump, but you can also use `verdi calcjob dump` instead.') - # Generate default parent folder if str(path) == '.': output_path = generate_default_dump_path(process_node=process) @@ -598,14 +595,28 @@ def dump( # Instantiate YamlDumper processnode_dumper = ProcessNodeYamlDumper(include_attributes=include_attributes, include_extras=include_extras) - process_dumped = workchain_dump( - process_node=process, - output_path=output_path, - no_node_inputs=no_node_inputs, - use_presubmit=use_presubmit, - node_dumper=processnode_dumper, - overwrite=overwrite, - ) + # Allow the command to be run with `CalcJob`s and `WorkChain`s + if isinstance(process, CalcJobNode): + echo.echo_warning('Command called on `CalcJob`. Will dump, but you can also use `verdi calcjob dump` instead.') + + process_dumped = calcjob_dump( + calcjob_node=process, + output_path=output_path, + no_node_inputs=no_node_inputs, + use_presubmit=use_presubmit, + node_dumper=processnode_dumper, + overwrite=overwrite, + ) + + else: + process_dumped = workchain_dump( + process_node=process, + output_path=output_path, + no_node_inputs=no_node_inputs, + use_presubmit=use_presubmit, + node_dumper=processnode_dumper, + overwrite=overwrite, + ) # Create README in parent directory # Done after dumping, so that dumping directory is there. Dumping directory is created within the calcjob_dump and diff --git a/src/aiida/tools/__init__.py b/src/aiida/tools/__init__.py index a9ab4e5762..9c238fd858 100644 --- a/src/aiida/tools/__init__.py +++ b/src/aiida/tools/__init__.py @@ -24,6 +24,7 @@ from .calculations import * from .data import * +from .dumping import * from .graph import * from .groups import * from .visualization import * diff --git a/src/aiida/tools/dumping/__init__.py b/src/aiida/tools/dumping/__init__.py index e69de29bb2..068e5a8291 100644 --- a/src/aiida/tools/dumping/__init__.py +++ b/src/aiida/tools/dumping/__init__.py @@ -0,0 +1,13 @@ +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. 
# +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +"""Modules related to the dumping of AiiDA data.""" + +__all__ = ('processes',) + +# from .processes import * diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index f1422a5125..63f19a7455 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -12,7 +12,6 @@ import textwrap from pathlib import Path -from typing import Union import yaml @@ -66,8 +65,7 @@ def dump_yaml( output_path: Path, output_filename: str = '.aiida_node_metadata.yaml', ) -> None: - """ - Dump the selected `ProcessNode` properties, attributes, and extras to a yaml file. + """Dump the selected `ProcessNode` properties, attributes, and extras to a yaml file. :param process_node: The ProcessNode to dump. :param output_path: The path to the directory where the yaml file will be saved. @@ -160,14 +158,14 @@ def make_dump_readme(process_node: ProcessNode, output_path: Path): handle.write(_readme_string) -def generate_default_dump_path(process_node: Union[WorkChainNode, CalcJobNode]) -> Path: +def generate_default_dump_path(process_node: WorkChainNode | CalcJobNode) -> Path: """Simple helper function to generate the default parent-dumping directory if none given. This function is not called for the sub-calls of `calcjob_dump` or during the recursive `workchain_dump` as it just creates the default parent folder for the dumping, if no name is given. :param process_node: The `ProcessNode` for which the directory is created. - :type process_node: Union[WorkChainNode, CalcJobNode] + :type process_node: WorkChainNode | CalcJobNode :return: The created parent dump path. :rtype: Path """ @@ -192,8 +190,6 @@ def validate_make_dump_path( """ Create default dumping directory for a given process node and return it as absolute path. - :param process_node: The `ProcessNode` for which the directory is created. - :type process_node: Union[WorkChainNode, CalcJobNode] :param path: The base path for the dump. Defaults to the current directory. :type path: Path :return: The created dump path. @@ -307,15 +303,14 @@ def calcjob_dump( def workchain_dump( - process_node: Union[WorkChainNode, CalcJobNode], + process_node: WorkChainNode | CalcJobNode, output_path: Path = Path(), no_node_inputs: bool = False, use_presubmit: bool = False, node_dumper: ProcessNodeYamlDumper | None = None, overwrite: bool = True, ) -> bool: - """ - Dumps all data involved in a `WorkChainNode`, including its outgoing links. + """Dumps all data involved in a `WorkChainNode`, including its outgoing links. Note that if an outgoing link is again a `WorkChainNode`, the function recursively calls itself, while files are only actually created when a `CalcJobNode` is reached. @@ -387,8 +382,7 @@ def workchain_dump( # Separate functions for CalcJob dumping using pre_submit, as well as for the node_inputs def calcjob_node_inputs_dump(calcjob_node: CalcJobNode, output_path: Path, inputs_relpath: Path = Path('node_inputs')): - """ - Dump inputs of a `CalcJobNode` of type `SinglefileData` and `FolderData`. + """Dump inputs of a `CalcJobNode` of type `SinglefileData` and `FolderData`. :param calcjob_node: The `CalcJobNode` whose inputs will be dumped. 
:type calcjob_node: CalcJobNode

From 863cc61ff14ef8dd60a1499e45efe8b043cf20cd Mon Sep 17 00:00:00 2001
From: Julian Geiger
Date: Mon, 15 Apr 2024 10:45:28 +0200
Subject: [PATCH 04/30] Echo missing plugin for `--use-presubmit`

---
 src/aiida/tools/dumping/processes.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py
index 63f19a7455..3c19c08111 100644
--- a/src/aiida/tools/dumping/processes.py
+++ b/src/aiida/tools/dumping/processes.py
@@ -289,8 +289,12 @@ def calcjob_dump(
             calcjob_presubmit_dump(calcjob_node=calcjob_node, output_path=output_path)
         except ValueError:
             # raise
+            # missing_plugin = str(calcjob_node.process_class).split(' ')[1].split('.')[0][1:] -> .process_class leads
+            # to exception without plugin installed
+            missing_plugin = f'aiida-{calcjob_node.process_type.split(":")[1].split(".")[0]}'
             echo.echo_error(
-                'Error when trying to get a restart-builder. Do you have the relevant aiida-plugin installed?'
+                f'Error when trying to get a restart-builder. Do you have the relevant '
+                f'plugin `{missing_plugin}` installed?'
             )
             return False

From c2244af9c07f1dfc9878ca865b6f397ec8facf4e Mon Sep 17 00:00:00 2001
From: Julian Geiger
Date: Thu, 18 Apr 2024 18:49:58 +0200
Subject: [PATCH 05/30] Some cleanup and refactor

- Reverted formatting changes in `cmd_process`
- Removed unnecessary comments
- Removed `:type` if type annotations present
- `no_node_inputs` -> `include_inputs` to be consistent with `include_attributes`/`include_extras`
- Don't set a default path (rather than defaulting to the CWD) and check for `None` instead

---
 src/aiida/cmdline/commands/cmd_calcjob.py | 20 ++---
 src/aiida/cmdline/commands/cmd_process.py | 99 +++++------------------
 src/aiida/cmdline/params/options/main.py  | 13 ++-
 src/aiida/tools/dumping/processes.py      | 42 +++-------
 tests/tools/dumping/test_processes.py     |  2 +-
 5 files changed, 43 insertions(+), 133 deletions(-)

diff --git a/src/aiida/cmdline/commands/cmd_calcjob.py b/src/aiida/cmdline/commands/cmd_calcjob.py
index e798200adc..1d122caa5a 100644
--- a/src/aiida/cmdline/commands/cmd_calcjob.py
+++ b/src/aiida/cmdline/commands/cmd_calcjob.py
@@ -353,7 +353,7 @@ def get_remote_and_path(calcjob, path=None):
 @verdi_calcjob.command('dump')
 @arguments.CALCULATION('calcjob', type=CalculationParamType(sub_classes=('aiida.node:process.calculation.calcjob',)))
 @options.PATH()
-@options.NO_NODE_INPUTS()
+@options.INCLUDE_INPUTS()
 @options.INCLUDE_ATTRIBUTES()
 @options.INCLUDE_EXTRAS()
 @options.USE_PRESUBMIT()
 @options.OVERWRITE()
 def dump(
     calcjob,
     path,
-    no_node_inputs,
+    include_inputs,
     include_attributes,
     include_extras,
     use_presubmit,
     overwrite,
 ) -> None:
     """Dump files involved in the execution of a `CalcJob`.

-    Child simulations/workflows (also called `CalcJob`s and `WorkChain`s in AiiDA jargon) run by the parent workflow are
-    contained in the directory tree as sub-folders and are sorted by their creation time. The directory tree thus
-    mirrors the logical execution of the workflow, which can also be queried by running `verdi process status
-    <pk>` on the command line.
-
-    By default, input and output files of each simulation can be found in the corresponding "raw_inputs" and
+    By default, input and output files can be found in the corresponding "raw_inputs" and
     "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution
     settings).
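A note on the `process_type` parsing introduced in PATCH 04 above: process types are entry-point strings of the form `aiida.calculations:<plugin>.<name>`, so the component between the colon and the first dot is used to guess the name of the missing plugin package. A minimal sketch of that heuristic (the entry-point string below is a hypothetical example, and the `aiida-<prefix>` mapping is a naming convention, not a guarantee):

```python
def guess_plugin_package(process_type: str) -> str:
    """Guess the distribution that provides a process type.

    E.g. 'aiida.calculations:quantumespresso.pw' -> 'aiida-quantumespresso'.
    """
    prefix = process_type.split(':')[1].split('.')[0]
    return f'aiida-{prefix}'


assert guess_plugin_package('aiida.calculations:quantumespresso.pw') == 'aiida-quantumespresso'
```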
Additional input files (depending on the type of calculation) are placed in the "node_inputs". @@ -395,19 +390,17 @@ def dump( make_dump_readme, ) - # Generate default parent folder - if str(path) == '.': + if path is None: output_path = generate_default_dump_path(process_node=calcjob) else: output_path = path.resolve() - # Instantiate YamlDumper calcjobnode_dumper = ProcessNodeYamlDumper(include_attributes=include_attributes, include_extras=include_extras) calcjob_dumped = calcjob_dump( calcjob_node=calcjob, output_path=output_path, - no_node_inputs=no_node_inputs, + include_inputs=include_inputs, use_presubmit=use_presubmit, node_dumper=calcjobnode_dumper, overwrite=overwrite, @@ -418,10 +411,9 @@ def dump( # workchain_dump files such that they can also be used from within the Python API, not just via verdi make_dump_readme(output_path=output_path, process_node=calcjob) - # Communicate success/failure of dumping if calcjob_dumped: echo.echo_success( - f'Raw files for {calcjob.__class__.__name__} <{calcjob.pk}> dumped successfully in `{output_path}`.' + f'Raw files for {calcjob.__class__.__name__} <{calcjob.pk}> dumped in `{output_path}`.' ) else: echo.echo_report(f'Problem dumping {calcjob.__class__.__name__} <{calcjob.pk}>.') diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index bd4024adef..f3f6794d91 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -114,11 +114,7 @@ def process_list( builder = CalculationQueryBuilder() filters = builder.get_filters(all_entries, process_state, process_label, paused, exit_status, failed) query_set = builder.get_query_set( - relationships=relationships, - filters=filters, - order_by={order_by: order_dir}, - past_days=past_days, - limit=limit, + relationships=relationships, filters=filters, order_by={order_by: order_dir}, past_days=past_days, limit=limit ) projected = builder.get_projected(query_set, projections=project) headers = projected.pop(0) @@ -165,10 +161,7 @@ def process_list( slots_per_worker = ctx.obj.config.get_option('daemon.worker_process_slots', scope=ctx.obj.profile.name) active_processes = ( QueryBuilder() - .append( - ProcessNode, - filters={'attributes.process_state': {'in': ('created', 'waiting', 'running')}}, - ) + .append(ProcessNode, filters={'attributes.process_state': {'in': ('created', 'waiting', 'running')}}) .count() ) available_slots = active_workers * slots_per_worker @@ -234,13 +227,7 @@ def process_call_root(processes): @verdi_process.command('report') @arguments.PROCESSES() @options.MOST_RECENT_NODE() -@click.option( - '-i', - '--indent-size', - type=int, - default=2, - help='Set the number of spaces to indent each level by.', -) +@click.option('-i', '--indent-size', type=int, default=2, help='Set the number of spaces to indent each level by.') @click.option( '-l', '--levelname', @@ -249,21 +236,12 @@ def process_call_root(processes): help='Filter the results by name of the log level.', ) @click.option( - '-m', - '--max-depth', - 'max_depth', - type=int, - default=None, - help='Limit the number of levels to be printed.', + '-m', '--max-depth', 'max_depth', type=int, default=None, help='Limit the number of levels to be printed.' 
) @decorators.with_dbenv() def process_report(processes, most_recent_node, levelname, indent_size, max_depth): """Show the log report for one or multiple processes.""" - from aiida.cmdline.utils.common import ( - get_calcjob_report, - get_process_function_report, - get_workchain_report, - ) + from aiida.cmdline.utils.common import get_calcjob_report, get_process_function_report, get_workchain_report from aiida.orm import CalcFunctionNode, CalcJobNode, WorkChainNode, WorkFunctionNode if processes and most_recent_node: @@ -288,20 +266,9 @@ def process_report(processes, most_recent_node, levelname, indent_size, max_dept @verdi_process.command('status') @options.MOST_RECENT_NODE() +@click.option('-c', '--call-link-label', 'call_link_label', is_flag=True, help='Include the call link label if set.') @click.option( - '-c', - '--call-link-label', - 'call_link_label', - is_flag=True, - help='Include the call link label if set.', -) -@click.option( - '-m', - '--max-depth', - 'max_depth', - type=int, - default=None, - help='Limit the number of levels to be printed.', + '-m', '--max-depth', 'max_depth', type=int, default=None, help='Limit the number of levels to be printed.' ) @arguments.PROCESSES() def process_status(call_link_label, most_recent_node, max_depth, processes): @@ -333,10 +300,7 @@ def process_kill(processes, all_entries, timeout, wait): from aiida.engine.processes import control if processes and all_entries: - raise click.BadOptionUsage( - 'all', - 'cannot specify individual processes and the `--all` flag at the same time.', - ) + raise click.BadOptionUsage('all', 'cannot specify individual processes and the `--all` flag at the same time.') if all_entries: click.confirm('Are you sure you want to kill all processes?', abort=True) @@ -344,13 +308,7 @@ def process_kill(processes, all_entries, timeout, wait): with capture_logging() as stream: try: message = 'Killed through `verdi process kill`' - control.kill_processes( - processes, - all_entries=all_entries, - timeout=timeout, - wait=wait, - message=message, - ) + control.kill_processes(processes, all_entries=all_entries, timeout=timeout, wait=wait, message=message) except control.ProcessTimeoutException as exception: echo.echo_critical(f'{exception}\n{REPAIR_INSTRUCTIONS}') @@ -369,21 +327,12 @@ def process_pause(processes, all_entries, timeout, wait): from aiida.engine.processes import control if processes and all_entries: - raise click.BadOptionUsage( - 'all', - 'cannot specify individual processes and the `--all` flag at the same time.', - ) + raise click.BadOptionUsage('all', 'cannot specify individual processes and the `--all` flag at the same time.') with capture_logging() as stream: try: message = 'Paused through `verdi process pause`' - control.pause_processes( - processes, - all_entries=all_entries, - timeout=timeout, - wait=wait, - message=message, - ) + control.pause_processes(processes, all_entries=all_entries, timeout=timeout, wait=wait, message=message) except control.ProcessTimeoutException as exception: echo.echo_critical(f'{exception}\n{REPAIR_INSTRUCTIONS}') @@ -402,10 +351,7 @@ def process_play(processes, all_entries, timeout, wait): from aiida.engine.processes import control if processes and all_entries: - raise click.BadOptionUsage( - 'all', - 'cannot specify individual processes and the `--all` flag at the same time.', - ) + raise click.BadOptionUsage('all', 'cannot specify individual processes and the `--all` flag at the same time.') with capture_logging() as stream: try: @@ -480,11 +426,7 @@ def 
process_repair(manager, broker, dry_run): daemon worker to complete it and will effectively be "stuck". Any process task that does not correspond to an active process is useless and should be discarded. Finally, duplicate process tasks are also problematic and are discarded. """ - from aiida.engine.processes.control import ( - get_active_processes, - get_process_tasks, - iterate_process_tasks, - ) + from aiida.engine.processes.control import get_active_processes, get_process_tasks, iterate_process_tasks active_processes = get_active_processes(project='id') process_tasks = get_process_tasks(broker) @@ -544,7 +486,7 @@ def process_repair(manager, broker, dry_run): @verdi_process.command('dump') @arguments.PROCESS() @options.PATH() -@options.NO_NODE_INPUTS() +@options.INCLUDE_INPUTS() @options.INCLUDE_ATTRIBUTES() @options.INCLUDE_EXTRAS() @options.USE_PRESUBMIT() @@ -552,7 +494,7 @@ def process_repair(manager, broker, dry_run): def dump( process, path, - no_node_inputs, + include_inputs, include_attributes, include_extras, use_presubmit, @@ -589,10 +531,11 @@ def dump( ) # Generate default parent folder - if str(path) == '.': + if path is None: output_path = generate_default_dump_path(process_node=process) + else: + output_path = path.resolve() - # Instantiate YamlDumper processnode_dumper = ProcessNodeYamlDumper(include_attributes=include_attributes, include_extras=include_extras) # Allow the command to be run with `CalcJob`s and `WorkChain`s @@ -602,7 +545,7 @@ def dump( process_dumped = calcjob_dump( calcjob_node=process, output_path=output_path, - no_node_inputs=no_node_inputs, + no_node_inputs=include_inputs, use_presubmit=use_presubmit, node_dumper=processnode_dumper, overwrite=overwrite, @@ -612,7 +555,7 @@ def dump( process_dumped = workchain_dump( process_node=process, output_path=output_path, - no_node_inputs=no_node_inputs, + no_node_inputs=include_inputs, use_presubmit=use_presubmit, node_dumper=processnode_dumper, overwrite=overwrite, @@ -629,4 +572,4 @@ def dump( f'Raw files for {process.__class__.__name__} <{process.pk}> dumped successfully in `{output_path}`.' 
) else: - echo.echo_report(f'Problem dumping {process.__class__.__name__} <{process.pk}>.') + echo.echo_critical(f'Problem dumping {process.__class__.__name__} <{process.pk}>.') diff --git a/src/aiida/cmdline/params/options/main.py b/src/aiida/cmdline/params/options/main.py index 4bb5beb4b9..f6069a7eeb 100644 --- a/src/aiida/cmdline/params/options/main.py +++ b/src/aiida/cmdline/params/options/main.py @@ -69,6 +69,7 @@ 'HOSTNAME', 'IDENTIFIER', 'INCLUDE_ATTRIBUTES', + 'INCLUDE_INPUTS', 'INCLUDE_EXTRAS', 'INPUT_FORMAT', 'INPUT_PLUGIN', @@ -77,7 +78,6 @@ 'MOST_RECENT_NODE', 'NODE', 'NODES', - 'NO_NODE_INPUTS', 'NON_INTERACTIVE', 'OLDER_THAN', 'ORDER_BY', @@ -756,18 +756,15 @@ def set_log_level(_ctx, _param, value): '-p', '--path', type=click.Path(path_type=pathlib.Path), - default=pathlib.Path(), show_default=False, help='The directory in which the dumping folder will be created.', ) -NO_NODE_INPUTS = OverridableOption( - '--no-node-inputs', - '-n', - is_flag=True, - default=False, +INCLUDE_INPUTS = OverridableOption( + '--include-inputs/--exclude-inputs', + default=True, show_default=True, - help='Turn off dumping of the input nodes of the `CalcJob`(s).', + help='Include the input nodes of the `CalcJob`(s).', ) INCLUDE_ATTRIBUTES = OverridableOption( diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 3c19c08111..828e6d7424 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -113,7 +113,6 @@ def dump_yaml( if node_extras: node_dict['Node extras'] = node_extras - # Dump to file output_file = output_path.resolve() / output_filename with open(output_file, 'w') as handle: yaml.dump(node_dict, handle, sort_keys=False) @@ -124,9 +123,7 @@ def make_dump_readme(process_node: ProcessNode, output_path: Path): """Generate README file in main dumping directory. :param process_node: CalcJob or WorkChain Node. - :type process_node: ProcessNode :param output_path: Output path for dumping. - :type output_path: Path """ _readme_string = textwrap.dedent( @@ -154,8 +151,7 @@ def make_dump_readme(process_node: ProcessNode, output_path: Path): # TODO: Add outputs of `verdi process (status|report|show?)` - with open(output_path / 'README', 'w') as handle: - handle.write(_readme_string) + (output_path / 'README').write_text(_readme_string) def generate_default_dump_path(process_node: WorkChainNode | CalcJobNode) -> Path: @@ -165,9 +161,7 @@ def generate_default_dump_path(process_node: WorkChainNode | CalcJobNode) -> Pat creates the default parent folder for the dumping, if no name is given. :param process_node: The `ProcessNode` for which the directory is created. - :type process_node: WorkChainNode | CalcJobNode :return: The created parent dump path. - :rtype: Path """ try: @@ -177,11 +171,6 @@ def generate_default_dump_path(process_node: WorkChainNode | CalcJobNode) -> Pat return Path(f'dump-{process_node.process_type}-{process_node.pk}') -# ? Could move code from `generate_calcjob_node_io` here to normal function -def attach_files_to_calcjob(): - pass - - # ? Could move this to `cmdline/utils` def validate_make_dump_path( path: Path = Path(), @@ -191,16 +180,14 @@ def validate_make_dump_path( Create default dumping directory for a given process node and return it as absolute path. :param path: The base path for the dump. Defaults to the current directory. - :type path: Path :return: The created dump path. - :rtype: Path """ import shutil output_path = path.resolve() # ? Use of `echo.echo_` only when running via `verdi`? 
-> I only see it used in the `cmd_` files. - if str(path) == '.' and overwrite: + if path is None and overwrite: echo.echo_critical('Path not set, defaults to CWD. Will not delete here for safety.') return output_path @@ -208,12 +195,11 @@ def validate_make_dump_path( if overwrite: # ? This might be a bit dangerous -> Check for it not being CWD enough? # ? Added check for README in folder to decrease chances of deleting some other path - if (output_path / 'README').is_file(): # Also check for presence of README for safety + if (output_path / 'README').is_file(): echo.echo_report(f'Overwrite set to true, will overwrite directory `{output_path}`.') shutil.rmtree(output_path) else: echo.echo_critical(f'Something went wrong. Manually remove existing `{output_path}` and dump again.') - # echo.echo_critical(f'No README present in "{output_path}" Manually remove and dump again.') else: echo.echo_critical(f'Path `{output_path}` already exists and overwrite set to False.') @@ -243,16 +229,13 @@ def generate_node_input_label(index: int, link_triple: LinkTriple) -> str: if process_type is not None: node_label += f'-{process_type}' - # ? Add pk also to the sub-steps or only the parent dumping directory? - # node_label += f'-{node.pk}' - return node_label def calcjob_dump( calcjob_node: CalcJobNode, output_path: Path = Path(), - no_node_inputs: bool = False, + include_inputs: bool = False, use_presubmit: bool = False, node_dumper: ProcessNodeYamlDumper | None = None, overwrite: bool = True, @@ -262,7 +245,7 @@ def calcjob_dump( :param calcjob_node: The CalcJobNode to be dumped. :param output_path: The path where the dumped contents will be stored. - :param no_node_inputs: If True, do not dump the inputs of the CalcJobNode. + :param include_inputs: If True, do not dump the inputs of the CalcJobNode. :param use_presubmit: If True, use the `prepare_for_submission` method to prepare the calculation for submission. If False, use the retrieved outputs and raw inputs. :return: None @@ -281,7 +264,7 @@ def calcjob_dump( except NotExistentAttributeError: pass - if not no_node_inputs: + if not include_inputs: calcjob_node_inputs_dump(calcjob_node=calcjob_node, output_path=output_path) else: @@ -309,7 +292,7 @@ def calcjob_dump( def workchain_dump( process_node: WorkChainNode | CalcJobNode, output_path: Path = Path(), - no_node_inputs: bool = False, + include_inputs: bool = False, use_presubmit: bool = False, node_dumper: ProcessNodeYamlDumper | None = None, overwrite: bool = True, @@ -321,7 +304,7 @@ def workchain_dump( :param process_node: The parent process node to be dumped. It can be either a `WorkChainNode` or a `CalcJobNode`. :param output_path: The main output path where the directory tree will be created. - :param no_node_inputs: If True, do not include file or folder inputs in the dump. Defaults to False. + :param include_inputs: If True, include file or folder inputs in the dump. Defaults to True. :param use_presubmit: If True, use the `prepare_for_submission` method to get the inputs of the CalcJobNode. Defaults to False. :param node_dumper: The ProcessNodeYamlDumper instance to use for dumping node metadata. 
If not provided, a new @@ -361,7 +344,7 @@ def workchain_dump( process_dumped = workchain_dump( process_node=child_node, output_path=output_path_child, - no_node_inputs=no_node_inputs, + include_inputs=include_inputs, use_presubmit=use_presubmit, node_dumper=node_dumper, ) @@ -371,7 +354,7 @@ def workchain_dump( process_dumped = calcjob_dump( calcjob_node=child_node, output_path=output_path_child, - no_node_inputs=no_node_inputs, + include_inputs=include_inputs, use_presubmit=use_presubmit, node_dumper=node_dumper, ) @@ -389,9 +372,7 @@ def calcjob_node_inputs_dump(calcjob_node: CalcJobNode, output_path: Path, input """Dump inputs of a `CalcJobNode` of type `SinglefileData` and `FolderData`. :param calcjob_node: The `CalcJobNode` whose inputs will be dumped. - :type calcjob_node: CalcJobNode :param output_path: The path where the inputs will be dumped. - :type output_path: Path """ dump_types = (SinglefileData, FolderData) @@ -405,7 +386,6 @@ def calcjob_node_inputs_dump(calcjob_node: CalcJobNode, output_path: Path, input if len(input_node_triple.node.base.repository.list_objects()) > 0 and isinstance( input_node_triple.node, dump_types ): - # input_node_path = output_path / Path('node_inputs') / Path(input_node_triple.link_label) input_node_path = output_path / inputs_relpath / Path(*input_node_triple.link_label.split('__')) input_node_triple.node.base.repository.copy_tree(input_node_path) @@ -415,9 +395,7 @@ def calcjob_presubmit_dump(calcjob_node: CalcJobNode, output_path: Path): Dump inputs of a `CalcJobNode` using the `presubmit` function. :param process: The `CalcJobNode` whose inputs need to be dumped. - :type process: CalcJobNode :param output_path: The path where the inputs will be dumped. - :type output_path: Path """ builder_restart = calcjob_node.get_builder_restart() diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index ed3f47c4f0..9f2233e2ee 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -97,7 +97,7 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): clean_tmp_path(tmp_path=tmp_path) # Don't dump the connected node inputs - result = calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path, no_node_inputs=True) + result = calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path, include_inputs=True) assert not singlefiledata_outputfile.is_file() assert result From e369359a53a562dfe720d211e4b346bf93c78f5b Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 18 Apr 2024 22:50:55 +0200 Subject: [PATCH 06/30] Big refactor to make code more concise - Remove `verdi calcjob dump` endpoint (not sure about this, might add again) - Only call `process_dump` -> Adapted the function for this - Removed conditional in `generate_node_input_label` --- src/aiida/cmdline/commands/cmd_calcjob.py | 69 ------- src/aiida/cmdline/commands/cmd_process.py | 57 +++--- src/aiida/tools/dumping/processes.py | 220 ++++++++++++---------- tests/tools/dumping/test_processes.py | 10 +- 4 files changed, 147 insertions(+), 209 deletions(-) diff --git a/src/aiida/cmdline/commands/cmd_calcjob.py b/src/aiida/cmdline/commands/cmd_calcjob.py index 1d122caa5a..df49ffc358 100644 --- a/src/aiida/cmdline/commands/cmd_calcjob.py +++ b/src/aiida/cmdline/commands/cmd_calcjob.py @@ -348,72 +348,3 @@ def get_remote_and_path(calcjob, path=None): f'nor does its associated process class `{calcjob.process_class.__class__.__name__}`\n' 'Please specify a path explicitly.' 
) - - -@verdi_calcjob.command('dump') -@arguments.CALCULATION('calcjob', type=CalculationParamType(sub_classes=('aiida.node:process.calculation.calcjob',))) -@options.PATH() -@options.INCLUDE_INPUTS() -@options.INCLUDE_ATTRIBUTES() -@options.INCLUDE_EXTRAS() -@options.USE_PRESUBMIT() -@options.OVERWRITE() -def dump( - calcjob, - path, - include_inputs, - include_attributes, - include_extras, - use_presubmit, - overwrite, -) -> None: - """Dump files involved in the execution of a `CalcJob`. - - By default, input and output files can be found in the corresponding "raw_inputs" and - "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution - settings). Additional input files (depending on the type of calculation) are placed in the "node_inputs". - - When using the `--use-presubmit` command line option, the folder created for each individual simulation should, in - principle, allow for direct resubmission, as it mirrors the (remote) folder that was created by AiiDA to execute the - job. However, this option requires the relevant AiiDA plugin to be installed, so it is disabled by default. Also - note that intermediate files might be missing, so for a multi-step workflow, each step would still have to be run - separately. - - Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant AiiDA - node data for further inspection. - """ - - from aiida.tools.dumping.processes import ( - ProcessNodeYamlDumper, - calcjob_dump, - generate_default_dump_path, - make_dump_readme, - ) - - if path is None: - output_path = generate_default_dump_path(process_node=calcjob) - else: - output_path = path.resolve() - - calcjobnode_dumper = ProcessNodeYamlDumper(include_attributes=include_attributes, include_extras=include_extras) - - calcjob_dumped = calcjob_dump( - calcjob_node=calcjob, - output_path=output_path, - include_inputs=include_inputs, - use_presubmit=use_presubmit, - node_dumper=calcjobnode_dumper, - overwrite=overwrite, - ) - - # Create README in parent directory - # Done after dumping, so that dumping directory is there. Dumping directory is created within the calcjob_dump and - # workchain_dump files such that they can also be used from within the Python API, not just via verdi - make_dump_readme(output_path=output_path, process_node=calcjob) - - if calcjob_dumped: - echo.echo_success( - f'Raw files for {calcjob.__class__.__name__} <{calcjob.pk}> dumped in `{output_path}`.' - ) - else: - echo.echo_report(f'Problem dumping {calcjob.__class__.__name__} <{calcjob.pk}>.') diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index f3f6794d91..eecb1815c1 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -502,10 +502,10 @@ def dump( ) -> None: """Dump files involved in the execution of a process. - Child simulations/workflows (also called `CalcJob`s and `WorkChain`s in AiiDA jargon) run by the parent workflow are - contained in the directory tree as sub-folders and are sorted by their creation time. The directory tree thus - mirrors the logical execution of the workflow, which can also be queried by running `verdi process status - ` on the command line. + Child calculations/workflows (also called `CalcJob`s and `WorkChain`s in AiiDA jargon) run by the parent workflow + are contained in the directory tree as sub-folders and are sorted by their creation time. 
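As an illustration of the layout described here, dumping a workchain that calls further calculations might produce a tree along the following lines (a hypothetical sketch: the top-level name follows the `dump-<process_label>-<pk>` default, and the child labels are generated by `generate_node_input_label`):

```
dump-MultiplyAddWorkChain-<pk>/
├── README
├── .aiida_node_metadata.yaml
├── 01-<first-child-label>/
│   └── ...
└── 02-<second-child-label>/
    ├── raw_inputs/       # e.g. _aiidasubmit.sh, aiida.in, .aiida/
    ├── raw_outputs/
    └── node_inputs/
```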
The directory tree thus + mirrors the logical execution of the workflow, which can also be queried by running `verdi process status ` on + the command line. By default, input and output files of each simulation can be found in the corresponding "raw_inputs" and "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution @@ -521,55 +521,46 @@ def dump( node data for further inspection. """ - from aiida.orm import CalcJobNode from aiida.tools.dumping.processes import ( ProcessNodeYamlDumper, - calcjob_dump, generate_default_dump_path, make_dump_readme, - workchain_dump, + process_dump, + validate_make_dump_path, ) # Generate default parent folder if path is None: output_path = generate_default_dump_path(process_node=process) - else: - output_path = path.resolve() - - processnode_dumper = ProcessNodeYamlDumper(include_attributes=include_attributes, include_extras=include_extras) - # Allow the command to be run with `CalcJob`s and `WorkChain`s - if isinstance(process, CalcJobNode): - echo.echo_warning('Command called on `CalcJob`. Will dump, but you can also use `verdi calcjob dump` instead.') + # Capture `FileExistsError` here already, not by trying to run the dumping + try: + validate_make_dump_path(path=output_path, overwrite=overwrite) + except FileExistsError: + echo.echo_critical(f'Path `{output_path}` already exists and overwrite set to False.') - process_dumped = calcjob_dump( - calcjob_node=process, - output_path=output_path, - no_node_inputs=include_inputs, - use_presubmit=use_presubmit, - node_dumper=processnode_dumper, - overwrite=overwrite, - ) + processnode_dumper = ProcessNodeYamlDumper(include_attributes=include_attributes, include_extras=include_extras) - else: - process_dumped = workchain_dump( + try: + process_dump( process_node=process, output_path=output_path, - no_node_inputs=include_inputs, + include_inputs=include_inputs, use_presubmit=use_presubmit, node_dumper=processnode_dumper, overwrite=overwrite, ) - # Create README in parent directory - # Done after dumping, so that dumping directory is there. Dumping directory is created within the calcjob_dump and - # workchain_dump files such that they can also be used from within the Python API, not just via verdi - make_dump_readme(output_path=output_path, process_node=process) - - # Communicate success/failure of dumping - if process_dumped: echo.echo_success( f'Raw files for {process.__class__.__name__} <{process.pk}> dumped successfully in `{output_path}`.' ) - else: + + # ? Which exceptions do I expect here? So far only FileExistsError + except Exception: + # raise echo.echo_critical(f'Problem dumping {process.__class__.__name__} <{process.pk}>.') + + # Create README in parent directory + # Done after dumping, so that dumping directory is there. 
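For reference, the same flow that the command wrapper above implements can also be driven directly from the Python API; a minimal sketch, assuming a loaded profile and a hypothetical pk:

```python
from pathlib import Path

from aiida import load_profile, orm
from aiida.tools.dumping.processes import ProcessNodeYamlDumper, process_dump

load_profile()
node = orm.load_node(1234)  # hypothetical pk of a terminated process

process_dump(
    process_node=node,
    output_path=Path('my-dump'),
    include_inputs=True,
    node_dumper=ProcessNodeYamlDumper(include_attributes=True, include_extras=True),
    overwrite=True,
)
```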
Dumping directory is created within the calcjob_dump and + # process_dump files such that they can also be used from within the Python API, not just via verdi + make_dump_readme(output_path=output_path, process_node=process) diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 828e6d7424..367d1534a2 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -10,6 +10,7 @@ from __future__ import annotations +import logging import textwrap from pathlib import Path @@ -26,15 +27,15 @@ from aiida.orm import ( CalcFunctionNode, CalcJobNode, - FolderData, ProcessNode, - SinglefileData, WorkChainNode, WorkFunctionNode, ) from aiida.orm.utils import LinkTriple from aiida.transports.plugins.local import LocalTransport +_LOGGER = logging.getLogger(__name__) + class ProcessNodeYamlDumper: """Utility class to dump selected `ProcessNode` properties and, optionally, attributes and extras to yaml.""" @@ -157,7 +158,7 @@ def make_dump_readme(process_node: ProcessNode, output_path: Path): def generate_default_dump_path(process_node: WorkChainNode | CalcJobNode) -> Path: """Simple helper function to generate the default parent-dumping directory if none given. - This function is not called for the sub-calls of `calcjob_dump` or during the recursive `workchain_dump` as it just + This function is not called for the sub-calls of `calcjob_dump` or during the recursive `process_dump` as it just creates the default parent folder for the dumping, if no name is given. :param process_node: The `ProcessNode` for which the directory is created. @@ -173,7 +174,7 @@ def generate_default_dump_path(process_node: WorkChainNode | CalcJobNode) -> Pat # ? Could move this to `cmdline/utils` def validate_make_dump_path( - path: Path = Path(), + path: Path, overwrite: bool = False, ) -> Path: """ @@ -184,28 +185,38 @@ def validate_make_dump_path( """ import shutil - output_path = path.resolve() - # ? Use of `echo.echo_` only when running via `verdi`? -> I only see it used in the `cmd_` files. - if path is None and overwrite: - echo.echo_critical('Path not set, defaults to CWD. Will not delete here for safety.') - return output_path - - if output_path.is_dir() and any(output_path.iterdir()): - if overwrite: - # ? This might be a bit dangerous -> Check for it not being CWD enough? - # ? Added check for README in folder to decrease chances of deleting some other path - if (output_path / 'README').is_file(): - echo.echo_report(f'Overwrite set to true, will overwrite directory `{output_path}`.') - shutil.rmtree(output_path) - else: - echo.echo_critical(f'Something went wrong. 
Manually remove existing `{output_path}` and dump again.') + if path is None: + raise ValueError('Path not set.') + + if path.is_dir(): + + # Existing, but empty directory => OK + if not any(path.iterdir()): + pass + + # Existing and non-empty directory and overwrite False => FileExistsError + elif not overwrite: + raise FileExistsError(f'Path `{path}` already exists and overwrite set to False.') + + # Existing and non-empty directory and overwrite True => Check for '.aiida_node_metadata.yaml' for safety + # '.aiida_node_metadata.yaml' present => Remove directory + elif (path / '.aiida_node_metadata.yaml').is_file(): + _LOGGER.info(f'Overwrite set to true, will overwrite directory `{path}`.') + shutil.rmtree(path) + path.mkdir(parents=True, exist_ok=False) + + # Existing and non-empty directory and overwrite True => Check for README for safety + # '.aiida_node_metadata.yaml' absent => Remove directory else: - echo.echo_critical(f'Path `{output_path}` already exists and overwrite set to False.') + _LOGGER.critical( + f"`{path}` Path exists but no `.aiida_node_metadata.yaml` found. Won't delete for security.\n" + # f'Manually remove existing `{path}` and dump again.' + ) - output_path.mkdir(parents=True, exist_ok=False) + path.mkdir(exist_ok=True, parents=True) - return output_path + return path.resolve() def generate_node_input_label(index: int, link_triple: LinkTriple) -> str: @@ -214,19 +225,16 @@ def generate_node_input_label(index: int, link_triple: LinkTriple) -> str: link_label = link_triple.link_label # Generate directories with naming scheme akin to `verdi process status` - if link_label != 'CALL' and not link_label.startswith('iteration_'): - node_label = f'{index:02d}-{link_label}' - else: - node_label = f'{index:02d}' + node_label = f'{index:02d}-{link_label}' try: process_label = node.process_label - if process_label is not None: + if process_label is not None and process_label != link_label: node_label += f'-{process_label}' except AttributeError: process_type = node.process_type - if process_type is not None: + if process_type is not None and process_type != link_label: node_label += f'-{process_type}' return node_label @@ -234,8 +242,8 @@ def generate_node_input_label(index: int, link_triple: LinkTriple) -> str: def calcjob_dump( calcjob_node: CalcJobNode, - output_path: Path = Path(), - include_inputs: bool = False, + output_path: Path, + include_inputs: bool = True, use_presubmit: bool = False, node_dumper: ProcessNodeYamlDumper | None = None, overwrite: bool = True, @@ -251,23 +259,26 @@ def calcjob_dump( :return: None """ - output_path = validate_make_dump_path(path=output_path, overwrite=overwrite) + if output_path is None: + output_path = generate_default_dump_path(process_node=calcjob_node) + + validate_make_dump_path(path=output_path, overwrite=overwrite) if not use_presubmit: - # ? Outputs obtained via retrieved and should not be present when using `prepare_for_submission` as it puts the - # ? calculation in a state to be submitted ?! 
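The safety checks in the rewritten `validate_make_dump_path` above boil down to the following behaviour; a hedged sketch with hypothetical directory names:

```python
from pathlib import Path

from aiida.tools.dumping.processes import validate_make_dump_path

# Missing or empty directory: created if needed, returned as an absolute path.
dump_path = validate_make_dump_path(path=Path('fresh-dump'))

# Non-empty directory with overwrite=False: raises FileExistsError.
# Non-empty directory with overwrite=True: only removed and recreated when the
# `.aiida_node_metadata.yaml` marker is present, guarding against deleting
# unrelated data.
dump_path = validate_make_dump_path(path=Path('fresh-dump'), overwrite=True)
```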
- calcjob_node.base.repository.copy_tree(output_path / Path('raw_inputs')) + calcjob_node.base.repository.copy_tree(output_path.resolve() / 'raw_inputs') try: - calcjob_node.outputs.retrieved.copy_tree(output_path / Path('raw_outputs')) + calcjob_node.outputs.retrieved.copy_tree(output_path.resolve() / 'raw_outputs') # Might not have an output with link label `retrieved` except NotExistentAttributeError: pass - if not include_inputs: + if include_inputs: calcjob_node_inputs_dump(calcjob_node=calcjob_node, output_path=output_path) else: + # ? Outputs obtained via retrieved and should not be present when using `prepare_for_submission` as it puts the + # ? calculation in a state to be submitted ?! try: calcjob_presubmit_dump(calcjob_node=calcjob_node, output_path=output_path) except ValueError: @@ -286,13 +297,11 @@ def calcjob_dump( node_dumper = ProcessNodeYamlDumper() node_dumper.dump_yaml(process_node=calcjob_node, output_path=output_path) - return True - -def workchain_dump( +def process_dump( process_node: WorkChainNode | CalcJobNode, - output_path: Path = Path(), - include_inputs: bool = False, + output_path: Path, + include_inputs: bool = True, use_presubmit: bool = False, node_dumper: ProcessNodeYamlDumper | None = None, overwrite: bool = True, @@ -312,59 +321,69 @@ def workchain_dump( :return: bool """ - # Keep track of dumping success to be able to communicate it outwards - all_processes_dumped = True - # ? Realized during testing: If no path provided, only for the sub-workchain an additional directory is created, but # ? should also create one here, if the function is imported normally and used in Python scripts - output_path = validate_make_dump_path(path=output_path, overwrite=overwrite) + if output_path is None: + output_path = generate_default_dump_path(process_node=process_node) + + validate_make_dump_path(path=output_path, overwrite=overwrite) # This will eventually be replaced once pydantic backend PR merged if node_dumper is None: node_dumper = ProcessNodeYamlDumper() - node_dumper.dump_yaml(process_node=process_node, output_path=output_path.resolve()) - - # Don't increment index for `ProcessNodes` that don't (always?) 
have file IO - # (`CalcFunctionNodes`/`WorkFunctionNodes`), such as `create_kpoints_from_distance` - called_links = process_node.base.links.get_outgoing(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all() - called_links = [ - called_link - for called_link in called_links - if not isinstance(called_link.node, (CalcFunctionNode, WorkFunctionNode)) - ] - - for index, link_triple in enumerate(sorted(called_links, key=lambda link_triple: link_triple.node.ctime), start=1): - child_node = link_triple.node - child_label = generate_node_input_label(index=index, link_triple=link_triple) - - output_path_child = output_path.resolve() / child_label - - # Recursive function call for `WorkChainNode`` - if isinstance(child_node, WorkChainNode): - process_dumped = workchain_dump( - process_node=child_node, - output_path=output_path_child, - include_inputs=include_inputs, - use_presubmit=use_presubmit, - node_dumper=node_dumper, - ) - # Dump for `CalcJobNode` - elif isinstance(child_node, CalcJobNode): - process_dumped = calcjob_dump( - calcjob_node=child_node, - output_path=output_path_child, - include_inputs=include_inputs, - use_presubmit=use_presubmit, - node_dumper=node_dumper, - ) - - else: - process_dumped = False - - all_processes_dumped = all_processes_dumped and process_dumped + # Need to dump for `WorkChainNode`s, as well, otherwise only `CalcJobNode`s + node_dumper.dump_yaml(process_node=process_node, output_path=output_path.resolve()) - return all_processes_dumped + # This seems a bit duplicated, but if the logic for checking the types should be contained in the recursive + # `process_dump` function which is the only one called from `verdi`, then I need to dump for a `CalcJob` here, as + # well. Also, if I want to be able to use `process_dump` from within the Python API + if isinstance(process_node, CalcJobNode): + calcjob_dump( + calcjob_node=process_node, + output_path=output_path, + include_inputs=include_inputs, + use_presubmit=use_presubmit, + node_dumper=node_dumper, + overwrite=overwrite, + ) + + # Recursive call for WorkChainNode + elif isinstance(process_node, WorkChainNode): + + # Don't increment index for `ProcessNodes` that don't (always?) 
have file IO + # (`CalcFunctionNodes`/`WorkFunctionNodes`), such as `create_kpoints_from_distance` + called_links = process_node.base.links.get_outgoing(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all() + + for index, link_triple in enumerate( + sorted(called_links, key=lambda link_triple: link_triple.node.ctime), start=1 + ): + child_node = link_triple.node + child_label = generate_node_input_label(index=index, link_triple=link_triple) + child_output_path = output_path.resolve() / child_label + + # Recursive function call for `WorkChainNode`` + # Not sure if the next two cases works for `WorkFunction` and `CalcFuncion``Node`s + if isinstance(child_node, (WorkChainNode, WorkFunctionNode)): + + process_dump( + process_node=child_node, + output_path=child_output_path, + include_inputs=include_inputs, + use_presubmit=use_presubmit, + node_dumper=node_dumper, + overwrite=overwrite + ) + + elif isinstance(child_node, (CalcJobNode, CalcFunctionNode)): + calcjob_dump( + calcjob_node=child_node, + output_path=child_output_path, + include_inputs=include_inputs, + use_presubmit=use_presubmit, + node_dumper=node_dumper, + overwrite=overwrite + ) # Separate functions for CalcJob dumping using pre_submit, as well as for the node_inputs @@ -374,7 +393,6 @@ def calcjob_node_inputs_dump(calcjob_node: CalcJobNode, output_path: Path, input :param calcjob_node: The `CalcJobNode` whose inputs will be dumped. :param output_path: The path where the inputs will be dumped. """ - dump_types = (SinglefileData, FolderData) # ? Not using the `node_class` argument of `get_incoming`, as it does not actually retrieve, e.g. a `UpfData` node # ? (due to planned deprecation?) @@ -382,12 +400,10 @@ def calcjob_node_inputs_dump(calcjob_node: CalcJobNode, output_path: Path, input input_node_triples = calcjob_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC) for input_node_triple in input_node_triples: - # Select only repositories that hold objects and are of the selected dump_types - if len(input_node_triple.node.base.repository.list_objects()) > 0 and isinstance( - input_node_triple.node, dump_types - ): + # Select only repositories that actually hold objects + if len(input_node_triple.node.base.repository.list_objects()) > 0: input_node_path = output_path / inputs_relpath / Path(*input_node_triple.link_label.split('__')) - input_node_triple.node.base.repository.copy_tree(input_node_path) + input_node_triple.node.base.repository.copy_tree(input_node_path.resolve()) def calcjob_presubmit_dump(calcjob_node: CalcJobNode, output_path: Path): @@ -418,13 +434,13 @@ def calcjob_presubmit_dump(calcjob_node: CalcJobNode, output_path: Path): # This happens if `local_copy_list` is empty pass - local_transport = LocalTransport().open() - new_calcjob_node: CalcJobNode = calcjob_process.node - upload_calculation( - node=new_calcjob_node, - transport=local_transport, - calc_info=calc_info, - folder=Folder(abspath=output_path), - inputs=calcjob_process.inputs, - dry_run=False, - ) + with LocalTransport() as transport: + new_calcjob_node: CalcJobNode = calcjob_process.node + upload_calculation( + node=new_calcjob_node, + transport=transport, + calc_info=calc_info, + folder=Folder(abspath=output_path), + inputs=calcjob_process.inputs, + dry_run=False, + ) diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index 9f2233e2ee..6fc27cfa19 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -22,7 +22,7 @@ calcjob_node_inputs_dump, 
generate_default_dump_path,
     generate_node_input_label,
-    workchain_dump,
+    process_dump,
 )

 filename = 'file.txt'
@@ -141,7 +141,7 @@ def test_calcjob_dump_arithmetic_add(tmp_path, aiida_localhost, generate_arithme
     assert not Path(dump_path / 'aiida.out').is_file()


-def test_workchain_dump_io(generate_work_chain_io, tmp_path):
+def test_process_dump_io(generate_work_chain_io, tmp_path):
     wc_node = generate_work_chain_io()
     dump_parent_path = tmp_path / 'wc-dump-test'

@@ -161,20 +161,20 @@
     # Here, when setting `output_path=tmp_path`, no parent directory for the parent workchain is created
     # Therefore, go into tmp-directory used for testing, without specifying output path -> Closer to how people might
     # actually use the function
-    result = workchain_dump(process_node=wc_node, output_path=dump_parent_path)
+    result = process_dump(process_node=wc_node, output_path=dump_parent_path)

     assert result
     assert all([expected_file.is_file() for expected_file in expected_files])


-def test_workchain_dump_multiply_add(tmp_path, generate_multiply_add_node, aiida_localhost):
+def test_process_dump_multiply_add(tmp_path, generate_multiply_add_node, aiida_localhost):
     # Still set directory fixed to make dump directory reproducible (it should be anyway, but contains e.g. the pk)
     dump_parent_path = tmp_path / 'multiply_add-dump-test'

     # Now test for output from running MultiplyAddWorkChain
     multiply_add_node = generate_multiply_add_node(computer=aiida_localhost)

-    result = workchain_dump(process_node=multiply_add_node, output_path=dump_parent_path)
+    result = process_dump(process_node=multiply_add_node, output_path=dump_parent_path)
     assert result

     raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json']

From f5adf175866d3c98cd9f26858f17deb19e144b3b Mon Sep 17 00:00:00 2001
From: Julian Geiger
Date: Mon, 22 Apr 2024 17:40:38 +0200
Subject: [PATCH 07/30] Add first version of `--flat` option.

This option allows dumping of a CalcJob (or of a simple WorkChain that only
calls a single CalcJob) in a flat directory, without creating the hierarchy of
`raw_inputs`, `raw_outputs`, and `node_inputs`. This might be useful for cases
where AiiDA is only used to run a calculation and dump the results in a
specific custom path, where the custom path is dictated by the other code that
calls AiiDA to submit calculations (e.g. the aiida-koopmans project we are
currently working on).

---
 docs/source/reference/command_line.rst    |  1 -
 src/aiida/cmdline/commands/cmd_process.py | 15 +++-
 src/aiida/tools/dumping/processes.py      | 89 ++++++++++++++++-------
 3 files changed, 73 insertions(+), 32 deletions(-)

diff --git a/docs/source/reference/command_line.rst b/docs/source/reference/command_line.rst
index 6dafac22c3..dc9187695c 100644
--- a/docs/source/reference/command_line.rst
+++ b/docs/source/reference/command_line.rst
@@ -48,7 +48,6 @@ Below is a list with all available subcommands.
     Commands:
       cleanworkdir  Clean all content of all output remote folders of calcjobs.
-      dump          Dump files involved in the execution of a `CalcJob`.
       gotocomputer  Open a shell in the remote folder on the calcjob.
       inputcat      Show the contents of one of the calcjob input files.
       inputls       Show the list of the generated calcjob input files.
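To make the effect of the new `--flat` option concrete, the same dump can also be triggered from the Python API; a minimal sketch, assuming a loaded profile and a hypothetical pk:

```python
from pathlib import Path

from aiida import load_profile, orm
from aiida.tools.dumping.processes import calcjob_dump

load_profile()
calcjob = orm.load_node(4321)  # hypothetical pk of a finished CalcJob

# With flat=True, the raw_inputs/raw_outputs/node_inputs hierarchy is skipped
# and all files are written directly into `output_path`.
calcjob_dump(calcjob_node=calcjob, output_path=Path('flat-dump'), flat=True)
```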
diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index eecb1815c1..80e25d8415 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -491,6 +491,7 @@ def process_repair(manager, broker, dry_run): @options.INCLUDE_EXTRAS() @options.USE_PRESUBMIT() @options.OVERWRITE() +@click.option('-f', '--flat', 'flat', is_flag=True, default=False, help='Dump all the files in one location.') def dump( process, path, @@ -499,6 +500,7 @@ def dump( include_extras, use_presubmit, overwrite, + flat, ) -> None: """Dump files involved in the execution of a process. @@ -549,16 +551,21 @@ def dump( use_presubmit=use_presubmit, node_dumper=processnode_dumper, overwrite=overwrite, + flat=flat, ) echo.echo_success( f'Raw files for {process.__class__.__name__} <{process.pk}> dumped successfully in `{output_path}`.' ) - # ? Which exceptions do I expect here? So far only FileExistsError - except Exception: - # raise - echo.echo_critical(f'Problem dumping {process.__class__.__name__} <{process.pk}>.') + # ? Which exceptions do I expect here? + # except FileExistsError: + # # raise + # echo.echo_critical('Some files present in the dumping directory. Delete manually and try again.') + except NotImplementedError: + echo.echo_critical('flat dumping not supported for `WorkChains` that call more than one `CalcJob`.') + except Exception as e: + echo.echo_critical(f'Unexpected error ({e!s}) while dumping {process.__class__.__name__} <{process.pk}>.') # Create README in parent directory # Done after dumping, so that dumping directory is there. Dumping directory is created within the calcjob_dump and diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 367d1534a2..418e0d0e61 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -190,7 +190,6 @@ def validate_make_dump_path( raise ValueError('Path not set.') if path.is_dir(): - # Existing, but empty directory => OK if not any(path.iterdir()): pass @@ -247,6 +246,7 @@ def calcjob_dump( use_presubmit: bool = False, node_dumper: ProcessNodeYamlDumper | None = None, overwrite: bool = True, + flat: bool = False, ) -> bool: """ Dump the contents of a CalcJobNode to a specified output path. @@ -264,19 +264,27 @@ def calcjob_dump( validate_make_dump_path(path=output_path, overwrite=overwrite) + if not flat: + default_dump_paths = ('raw_inputs', 'raw_outputs', 'node_inputs') + else: + default_dump_paths = ('', '', '') + if not use_presubmit: - calcjob_node.base.repository.copy_tree(output_path.resolve() / 'raw_inputs') + calcjob_node.base.repository.copy_tree(output_path.resolve() / default_dump_paths[0]) try: - calcjob_node.outputs.retrieved.copy_tree(output_path.resolve() / 'raw_outputs') + calcjob_node.outputs.retrieved.copy_tree(output_path.resolve() / default_dump_paths[1]) # Might not have an output with link label `retrieved` except NotExistentAttributeError: pass if include_inputs: - calcjob_node_inputs_dump(calcjob_node=calcjob_node, output_path=output_path) + calcjob_node_inputs_dump( + calcjob_node=calcjob_node, output_path=output_path / default_dump_paths[2], flat=flat + ) else: + pass # ? Outputs obtained via retrieved and should not be present when using `prepare_for_submission` as it puts the # ? calculation in a state to be submitted ?! 
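A note on the restriction enforced in `process_dump` further below: flat dumping is refused when a workchain called more than one `CalcJob`, since their files would collide in a single directory. The guard in the hunk below is equivalent to this sketch (using the `process_node` and `flat` arguments of that function):

```python
from aiida.orm import CalcJobNode

n_calcjobs = sum(isinstance(node, CalcJobNode) for node in process_node.called_descendants)
if flat and n_calcjobs > 1:
    raise NotImplementedError
```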
try: @@ -305,6 +313,7 @@ def process_dump( use_presubmit: bool = False, node_dumper: ProcessNodeYamlDumper | None = None, overwrite: bool = True, + flat: bool = False, ) -> bool: """Dumps all data involved in a `WorkChainNode`, including its outgoing links. @@ -339,41 +348,62 @@ def process_dump( # `process_dump` function which is the only one called from `verdi`, then I need to dump for a `CalcJob` here, as # well. Also, if I want to be able to use `process_dump` from within the Python API if isinstance(process_node, CalcJobNode): - calcjob_dump( - calcjob_node=process_node, - output_path=output_path, - include_inputs=include_inputs, - use_presubmit=use_presubmit, - node_dumper=node_dumper, - overwrite=overwrite, - ) + if not flat: + calcjob_dump( + calcjob_node=process_node, + output_path=output_path, + include_inputs=include_inputs, + use_presubmit=use_presubmit, + node_dumper=node_dumper, + overwrite=overwrite, + ) + else: + calcjob_dump( + calcjob_node=process_node, + output_path=output_path, + include_inputs=include_inputs, + use_presubmit=use_presubmit, + node_dumper=node_dumper, + overwrite=overwrite, + flat=flat, + ) # Recursive call for WorkChainNode elif isinstance(process_node, WorkChainNode): - # Don't increment index for `ProcessNodes` that don't (always?) have file IO # (`CalcFunctionNodes`/`WorkFunctionNodes`), such as `create_kpoints_from_distance` called_links = process_node.base.links.get_outgoing(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all() + # todo: Add check if flat is True, and multiple sub-workchains, that it raises exception. Otherwise, if + # todo: only one top-level workchain, dump everything from the single calcjob that was called in the + # todo: workchain in the main workchain directory + called_descendants = process_node.called_descendants + if flat and [isinstance(node, CalcJobNode) for node in called_descendants].count(True) > 1: + raise NotImplementedError + for index, link_triple in enumerate( sorted(called_links, key=lambda link_triple: link_triple.node.ctime), start=1 ): child_node = link_triple.node - child_label = generate_node_input_label(index=index, link_triple=link_triple) + if not flat: + child_label = generate_node_input_label(index=index, link_triple=link_triple) + else: + child_label = '' child_output_path = output_path.resolve() / child_label # Recursive function call for `WorkChainNode`` - # Not sure if the next two cases works for `WorkFunction` and `CalcFuncion``Node`s + # Not sure if the next two cases work for `WorkFunction` and `CalcFuncion``Node`s if isinstance(child_node, (WorkChainNode, WorkFunctionNode)): - - process_dump( - process_node=child_node, - output_path=child_output_path, - include_inputs=include_inputs, - use_presubmit=use_presubmit, - node_dumper=node_dumper, - overwrite=overwrite - ) + if flat: + process_dump( + process_node=child_node, + output_path=child_output_path, + include_inputs=include_inputs, + use_presubmit=use_presubmit, + node_dumper=node_dumper, + overwrite=overwrite, + flat=flat, + ) elif isinstance(child_node, (CalcJobNode, CalcFunctionNode)): calcjob_dump( @@ -382,12 +412,13 @@ def process_dump( include_inputs=include_inputs, use_presubmit=use_presubmit, node_dumper=node_dumper, - overwrite=overwrite + overwrite=overwrite, + flat=flat, ) # Separate functions for CalcJob dumping using pre_submit, as well as for the node_inputs -def calcjob_node_inputs_dump(calcjob_node: CalcJobNode, output_path: Path, inputs_relpath: Path = Path('node_inputs')): +def calcjob_node_inputs_dump(calcjob_node: 
CalcJobNode, output_path: Path, flat: bool = False):
     """Dump inputs of a `CalcJobNode` of type `SinglefileData` and `FolderData`.

     :param calcjob_node: The `CalcJobNode` whose inputs will be dumped.
@@ -402,7 +433,11 @@ def calcjob_node_inputs_dump(calcjob_node: CalcJobNode, output_path: Path, input
     for input_node_triple in input_node_triples:
         # Select only repositories that actually hold objects
         if len(input_node_triple.node.base.repository.list_objects()) > 0:
-            input_node_path = output_path / Path(*input_node_triple.link_label.split('__'))
+            if not flat:
+                input_node_path = output_path / Path(*input_node_triple.link_label.split('__'))
+            else:
+                input_node_path = output_path
+
             input_node_triple.node.base.repository.copy_tree(input_node_path.resolve())

From 10754169f4805e818380da0720b1a948a018c77a Mon Sep 17 00:00:00 2001
From: Julian Geiger
Date: Mon, 22 Apr 2024 17:51:47 +0200
Subject: [PATCH 08/30] Shelved `--use-presubmit` option

---
 src/aiida/cmdline/commands/cmd_process.py |  9 ----
 src/aiida/cmdline/params/options/main.py  | 11 -----
 src/aiida/tools/dumping/processes.py      | 54 +++++------------------
 tests/tools/dumping/test_processes.py     | 18 --------
 4 files changed, 10 insertions(+), 82 deletions(-)

diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py
index 80e25d8415..5a5af7426a 100644
--- a/src/aiida/cmdline/commands/cmd_process.py
+++ b/src/aiida/cmdline/commands/cmd_process.py
@@ -489,7 +489,6 @@ def process_repair(manager, broker, dry_run):
 @options.INCLUDE_INPUTS()
 @options.INCLUDE_ATTRIBUTES()
 @options.INCLUDE_EXTRAS()
-@options.USE_PRESUBMIT()
 @options.OVERWRITE()
 @click.option('-f', '--flat', 'flat', is_flag=True, default=False, help='Dump all the files in one location.')
 def dump(
@@ -498,7 +497,6 @@ def dump(
     process,
     path,
     include_inputs,
     include_attributes,
     include_extras,
-    use_presubmit,
     overwrite,
     flat,
 ) -> None:
     """Dump files involved in the execution of a process.
@@ -513,12 +511,6 @@ def dump(
     "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution
     settings). Additional input files (depending on the type of calculation) are placed in the "node_inputs".

-    When using the `--use-presubmit` command line option, the folder created for each individual simulation should, in
-    principle, allow for direct resubmission, as it mirrors the (remote) folder that was created by AiiDA to execute the
-    job. However, this option requires the relevant AiiDA plugin to be installed, so it is disabled by default. Also
-    note that intermediate files might be missing, so for a multi-step workflow, each step would still have to be run
-    separately.
-
     Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant AiiDA
     node data for further inspection.
""" @@ -548,7 +540,6 @@ def dump( process_node=process, output_path=output_path, include_inputs=include_inputs, - use_presubmit=use_presubmit, node_dumper=processnode_dumper, overwrite=overwrite, flat=flat, diff --git a/src/aiida/cmdline/params/options/main.py b/src/aiida/cmdline/params/options/main.py index f6069a7eeb..6b9fb4b516 100644 --- a/src/aiida/cmdline/params/options/main.py +++ b/src/aiida/cmdline/params/options/main.py @@ -109,7 +109,6 @@ 'USER_FIRST_NAME', 'USER_INSTITUTION', 'USER_LAST_NAME', - 'USE_PRESUBMIT', 'VERBOSITY', 'VISUALIZATION_FORMAT', 'WAIT', @@ -785,16 +784,6 @@ def set_log_level(_ctx, _param, value): help='Include extras in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', ) -USE_PRESUBMIT = OverridableOption( - '--use-presubmit', - '-u', - is_flag=True, - default=False, - show_default=True, - help="""Use the `presubmit` method for dumping the files of the`CalcJob`. Note: this requires the corresponding - aiida-plugin to be installed.""", -) - OVERWRITE = OverridableOption( '--overwrite', '-o', diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 418e0d0e61..51178eba64 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -140,12 +140,6 @@ def make_dump_readme(process_node: ProcessNode, output_path: Path): "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution settings). Additional input files (depending on the type of calculation) are placed in the "node_inputs". - When using the `--use-presubmit` command line option, the folder created for each individual simulation should, in - principle, allow for direct resubmission, as it mirrors the (remote) folder that was created by AiiDA to execute the - job. However, this option requires the relevant AiiDA plugin to be installed, so it is disabled by default. Also - note that intermediate files might be missing, so for a multi-step workflow, each step would still have to be run - separately. - Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant AiiDA node data for further inspection.""" # noqa: E501 ) @@ -243,7 +237,6 @@ def calcjob_dump( calcjob_node: CalcJobNode, output_path: Path, include_inputs: bool = True, - use_presubmit: bool = False, node_dumper: ProcessNodeYamlDumper | None = None, overwrite: bool = True, flat: bool = False, @@ -254,8 +247,6 @@ def calcjob_dump( :param calcjob_node: The CalcJobNode to be dumped. :param output_path: The path where the dumped contents will be stored. :param include_inputs: If True, do not dump the inputs of the CalcJobNode. - :param use_presubmit: If True, use the `prepare_for_submission` method to prepare the calculation for - submission. If False, use the retrieved outputs and raw inputs. 
:return: None """ @@ -269,36 +260,18 @@ def calcjob_dump( else: default_dump_paths = ('', '', '') - if not use_presubmit: - calcjob_node.base.repository.copy_tree(output_path.resolve() / default_dump_paths[0]) - try: - calcjob_node.outputs.retrieved.copy_tree(output_path.resolve() / default_dump_paths[1]) - - # Might not have an output with link label `retrieved` - except NotExistentAttributeError: - pass - - if include_inputs: - calcjob_node_inputs_dump( - calcjob_node=calcjob_node, output_path=output_path / default_dump_paths[2], flat=flat - ) + calcjob_node.base.repository.copy_tree(output_path.resolve() / default_dump_paths[0]) - else: + try: + calcjob_node.outputs.retrieved.copy_tree(output_path.resolve() / default_dump_paths[1]) + except NotExistentAttributeError: + # Might not have an output with link label `retrieved` pass - # ? Outputs obtained via retrieved and should not be present when using `prepare_for_submission` as it puts the - # ? calculation in a state to be submitted ?! - try: - calcjob_presubmit_dump(calcjob_node=calcjob_node, output_path=output_path) - except ValueError: - # raise - # missing_plugin = str(calcjob_node.process_class).split(' ')[1].split('.')[0][1:] -> .process_class leads - # to exception without plugin installed - missing_plugin = f'aiida-{calcjob_node.process_type.split(':')[1].split('.')[0]}' - echo.echo_error( - f'Error when trying to get a restart-builder. Do you have the relevant ' - f'plugin `{missing_plugin}` installed?' - ) - return False + + if include_inputs: + calcjob_node_inputs_dump( + calcjob_node=calcjob_node, output_path=output_path / default_dump_paths[2], flat=flat + ) # This will eventually be replaced once pydantic backend PR merged if node_dumper is None: @@ -310,7 +283,6 @@ def process_dump( process_node: WorkChainNode | CalcJobNode, output_path: Path, include_inputs: bool = True, - use_presubmit: bool = False, node_dumper: ProcessNodeYamlDumper | None = None, overwrite: bool = True, flat: bool = False, @@ -323,8 +295,6 @@ def process_dump( :param process_node: The parent process node to be dumped. It can be either a `WorkChainNode` or a `CalcJobNode`. :param output_path: The main output path where the directory tree will be created. :param include_inputs: If True, include file or folder inputs in the dump. Defaults to True. - :param use_presubmit: If True, use the `prepare_for_submission` method to get the inputs of the - CalcJobNode. Defaults to False. :param node_dumper: The ProcessNodeYamlDumper instance to use for dumping node metadata. If not provided, a new instance will be created. Defaults to None. 
:return: bool
@@ -353,7 +323,6 @@ def process_dump(
             calcjob_node=process_node,
             output_path=output_path,
             include_inputs=include_inputs,
-            use_presubmit=use_presubmit,
             node_dumper=node_dumper,
             overwrite=overwrite,
         )
@@ -362,7 +331,6 @@ def process_dump(
             calcjob_node=process_node,
             output_path=output_path,
             include_inputs=include_inputs,
-            use_presubmit=use_presubmit,
             node_dumper=node_dumper,
             overwrite=overwrite,
             flat=flat,
         )
@@ -399,7 +367,6 @@ def process_dump(
                 process_node=child_node,
                 output_path=child_output_path,
                 include_inputs=include_inputs,
-                use_presubmit=use_presubmit,
                 node_dumper=node_dumper,
                 overwrite=overwrite,
                 flat=flat,
             )
@@ -410,7 +377,6 @@ def process_dump(
                 calcjob_node=child_node,
                 output_path=child_output_path,
                 include_inputs=include_inputs,
-                use_presubmit=use_presubmit,
                 node_dumper=node_dumper,
                 overwrite=overwrite,
                 flat=flat,
diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py
index 6fc27cfa19..0e6823cdf0 100644
--- a/tests/tools/dumping/test_processes.py
+++ b/tests/tools/dumping/test_processes.py
@@ -101,16 +101,6 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path):
     assert not singlefiledata_outputfile.is_file()
     assert result
 
-    clean_tmp_path(tmp_path=tmp_path)
-
-    # use_presubmit -> Depends on implementation from aiida-plugin, so I cannot test specifically inside aiida-core
-    # Assert that `False` is returned if `use_presubmit` used, but no `process_type` has been set. This is the test case
-    # one gets from the fixture (f'no process type for Node<{self.pk}>: cannot recreate process class')
-    result = calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path, use_presubmit=True)
-    assert result is False
-
-    clean_tmp_path(tmp_path=tmp_path)
-
 
 def test_calcjob_dump_arithmetic_add(tmp_path, aiida_localhost, generate_arithmetic_add_node):
     dump_path = tmp_path / 'calcjob_dump_arithmetic_add'
@@ -132,14 +122,6 @@ def test_calcjob_dump_arithmetic_add(tmp_path, aiida_localhost, generate_arithme
 
     clean_tmp_path(tmp_path=tmp_path)
 
-    # Dumping with `use_presubmit` -> Directory structure is different and output file not dumped
-    result = calcjob_dump(calcjob_node=add_node, output_path=dump_path, use_presubmit=True)
-
-    assert result
-    assert Path(dump_path / '_aiidasubmit.sh').is_file()
-    assert Path(dump_path / 'aiida.in').is_file()
-    assert not Path(dump_path / 'aiida.out').is_file()
-
 
 def test_process_dump_io(generate_work_chain_io, tmp_path):
     wc_node = generate_work_chain_io()

From 671538c12dc245f4dbe81f6878466f02562acddc Mon Sep 17 00:00:00 2001
From: Julian Geiger
Date: Wed, 24 Apr 2024 02:15:39 +0200
Subject: [PATCH 09/30] Updated tests for calcjob_dump after code changes

The different calcjob_dump IO functions now run through. Also added tests for
the flat option for these functions. Still need to update tests for
arithmetic_add and the process_dump functions, both for io and for
multiply_add.

---
 src/aiida/cmdline/commands/cmd_process.py |   2 +-
 src/aiida/tools/dumping/processes.py      | 132 ++++++++--------------
 tests/tools/dumping/test_processes.py     | 129 +++++++++++++++------
 3 files changed, 140 insertions(+), 123 deletions(-)

diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py
index 5a5af7426a..45a0e773c5 100644
--- a/src/aiida/cmdline/commands/cmd_process.py
+++ b/src/aiida/cmdline/commands/cmd_process.py
@@ -561,4 +561,4 @@ def dump(
 
     # Create README in parent directory
     # Done after dumping, so that dumping directory is there. 
Dumping directory is created within the calcjob_dump and # process_dump files such that they can also be used from within the Python API, not just via verdi - make_dump_readme(output_path=output_path, process_node=process) + make_dump_readme(output_path=output_path, process_node=process) \ No newline at end of file diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 51178eba64..f49187bdb8 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -16,14 +16,8 @@ import yaml -from aiida.cmdline.utils import echo from aiida.common import LinkType from aiida.common.exceptions import NotExistentAttributeError -from aiida.common.folders import Folder -from aiida.engine.daemon.execmanager import upload_calculation -from aiida.engine.processes.calcjobs import CalcJob -from aiida.engine.utils import instantiate_process -from aiida.manage import get_manager from aiida.orm import ( CalcFunctionNode, CalcJobNode, @@ -32,7 +26,6 @@ WorkFunctionNode, ) from aiida.orm.utils import LinkTriple -from aiida.transports.plugins.local import LocalTransport _LOGGER = logging.getLogger(__name__) @@ -149,7 +142,7 @@ def make_dump_readme(process_node: ProcessNode, output_path: Path): (output_path / 'README').write_text(_readme_string) -def generate_default_dump_path(process_node: WorkChainNode | CalcJobNode) -> Path: +def generate_default_dump_path(process_node: WorkChainNode | CalcJobNode | CalcFunctionNode | WorkFunctionNode) -> Path: """Simple helper function to generate the default parent-dumping directory if none given. This function is not called for the sub-calls of `calcjob_dump` or during the recursive `process_dump` as it just @@ -234,13 +227,14 @@ def generate_node_input_label(index: int, link_triple: LinkTriple) -> str: def calcjob_dump( - calcjob_node: CalcJobNode, - output_path: Path, + calcjob_node: CalcJobNode | CalcFunctionNode, + output_path: Path | None, include_inputs: bool = True, node_dumper: ProcessNodeYamlDumper | None = None, overwrite: bool = True, flat: bool = False, -) -> bool: + io_dump_paths: list | None = None, +): """ Dump the contents of a CalcJobNode to a specified output path. @@ -253,25 +247,40 @@ def calcjob_dump( if output_path is None: output_path = generate_default_dump_path(process_node=calcjob_node) - validate_make_dump_path(path=output_path, overwrite=overwrite) - - if not flat: - default_dump_paths = ('raw_inputs', 'raw_outputs', 'node_inputs') - else: - default_dump_paths = ('', '', '') + try: + validate_make_dump_path(path=output_path, overwrite=overwrite) + except: + raise + + default_io_dump_paths = ['raw_inputs', 'raw_outputs', 'node_inputs'] + + if flat and io_dump_paths is None: + io_dump_paths = ['', '', ''] + _LOGGER.info('Flat set to True and no `io_dump_paths`. Dump in a flat directory, files might be overwritten.') + # raise ValueError('Flat set to False but no io_dump_paths provided.') + # -> Can still provide paths but use flat=True to not flatten the node_inputs -> Probably this is bad design... + elif flat and io_dump_paths is not None: + _LOGGER.info('Flat set to True but `io_dump_paths` provided. These will be used, but `node_inputs` not nested.') + elif not flat and io_dump_paths is None: + _LOGGER.info( + f'Flat set to False but no `io_dump_paths` provided. Will use the defaults: {default_io_dump_paths}.' + ) + io_dump_paths = default_io_dump_paths + elif not flat and io_dump_paths is not None: + _LOGGER.info( + 'Flat set to False but no `io_dump_paths` provided. 
These will be used, but `node_inputs` flattened.' + ) - calcjob_node.base.repository.copy_tree(output_path.resolve() / default_dump_paths[0]) + calcjob_node.base.repository.copy_tree(output_path.resolve() / io_dump_paths[0]) try: - calcjob_node.outputs.retrieved.copy_tree(output_path.resolve() / default_dump_paths[1]) + calcjob_node.outputs.retrieved.copy_tree(output_path.resolve() / io_dump_paths[1]) except NotExistentAttributeError: - # Might not have an output with link label `retrieved` + # Might not have an output with link label `retrieved` pass if include_inputs: - calcjob_node_inputs_dump( - calcjob_node=calcjob_node, output_path=output_path / default_dump_paths[2], flat=flat - ) + calcjob_node_inputs_dump(calcjob_node=calcjob_node, output_path=output_path / io_dump_paths[2], flat=flat) # This will eventually be replaced once pydantic backend PR merged if node_dumper is None: @@ -280,13 +289,13 @@ def calcjob_dump( def process_dump( - process_node: WorkChainNode | CalcJobNode, - output_path: Path, + process_node: WorkChainNode | CalcJobNode | WorkFunctionNode | CalcFunctionNode, + output_path: Path | None, include_inputs: bool = True, node_dumper: ProcessNodeYamlDumper | None = None, overwrite: bool = True, flat: bool = False, -) -> bool: +): """Dumps all data involved in a `WorkChainNode`, including its outgoing links. Note that if an outgoing link is again a `WorkChainNode`, the function recursively calls itself, while files are @@ -318,23 +327,14 @@ def process_dump( # `process_dump` function which is the only one called from `verdi`, then I need to dump for a `CalcJob` here, as # well. Also, if I want to be able to use `process_dump` from within the Python API if isinstance(process_node, CalcJobNode): - if not flat: - calcjob_dump( - calcjob_node=process_node, - output_path=output_path, - include_inputs=include_inputs, - node_dumper=node_dumper, - overwrite=overwrite, - ) - else: - calcjob_dump( - calcjob_node=process_node, - output_path=output_path, - include_inputs=include_inputs, - node_dumper=node_dumper, - overwrite=overwrite, - flat=flat, - ) + calcjob_dump( + calcjob_node=process_node, + output_path=output_path, + include_inputs=include_inputs, + node_dumper=node_dumper, + overwrite=overwrite, + flat=flat, + ) # Recursive call for WorkChainNode elif isinstance(process_node, WorkChainNode): @@ -384,16 +384,13 @@ def process_dump( # Separate functions for CalcJob dumping using pre_submit, as well as for the node_inputs -def calcjob_node_inputs_dump(calcjob_node: CalcJobNode, output_path: Path, flat: bool = False): +def calcjob_node_inputs_dump(calcjob_node: CalcJobNode | CalcFunctionNode, output_path: Path, flat: bool = False): """Dump inputs of a `CalcJobNode` of type `SinglefileData` and `FolderData`. :param calcjob_node: The `CalcJobNode` whose inputs will be dumped. :param output_path: The path where the inputs will be dumped. """ - # ? Not using the `node_class` argument of `get_incoming`, as it does not actually retrieve, e.g. a `UpfData` node - # ? (due to planned deprecation?) - # ? 
Instead, check for isinstance of `SinglefileData` input_node_triples = calcjob_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC) for input_node_triple in input_node_triples: @@ -402,46 +399,7 @@ def calcjob_node_inputs_dump(calcjob_node: CalcJobNode, output_path: Path, flat: if not flat: input_node_path = output_path / Path(*input_node_triple.link_label.split('__')) else: + # Don't use link_label at all input_node_path = output_path input_node_triple.node.base.repository.copy_tree(input_node_path.resolve()) - - -def calcjob_presubmit_dump(calcjob_node: CalcJobNode, output_path: Path): - """ - Dump inputs of a `CalcJobNode` using the `presubmit` function. - - :param process: The `CalcJobNode` whose inputs need to be dumped. - :param output_path: The path where the inputs will be dumped. - """ - - builder_restart = calcjob_node.get_builder_restart() - runner = get_manager().get_runner() - calcjob_process: CalcJob = instantiate_process(runner, builder_restart) # type: ignore[assignment] - - # `presubmit` calls `prepare_for_submission` internally - calc_info = calcjob_process.presubmit(folder=Folder(abspath=output_path)) - - try: - # Hackish way to modify local copy list so that the pseudos are actually dumped where I want them to. Otherwise - # they - # end up in home... - local_copy_list = calc_info['local_copy_list'].copy() - # print('LOCAL_COPY_LIST', local_copy_list) - new_local_copy_list = [tuple(list(local_copy_list[0][:2]) + [str(output_path / local_copy_list[0][-1])])] - calc_info['local_copy_list'] = new_local_copy_list - # print('NEW_LOCAL_COPY_LIST', new_local_copy_list) - except IndexError: - # This happens if `local_copy_list` is empty - pass - - with LocalTransport() as transport: - new_calcjob_node: CalcJobNode = calcjob_process.node - upload_calculation( - node=new_calcjob_node, - transport=transport, - calc_info=calc_info, - folder=Folder(abspath=output_path), - inputs=calcjob_process.inputs, - dry_run=False, - ) diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index 0e6823cdf0..1446d71e9b 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -17,6 +17,7 @@ import shutil from pathlib import Path +import pytest from aiida.tools.dumping.processes import ( calcjob_dump, calcjob_node_inputs_dump, @@ -25,14 +26,20 @@ process_dump, ) +# Define parameters for the dumping filename = 'file.txt' - -# Define some variables used for the dumping +filecontent = 'a' +raw_inputs_relpath = 'raw_inputs' +raw_outputs_relpath = 'raw_outputs' node_inputs_relpath = 'node_inputs' +default_dump_paths = [raw_inputs_relpath, raw_outputs_relpath, node_inputs_relpath] +custom_dump_paths = [f'{path}_' for path in default_dump_paths] + +# Define some variables used for constructing the nodes used to test the dumping singlefiledata_linklabel = 'singlefile_input' folderdata_linklabel = 'folderdata_input' -singlefiledata_path = pathlib.Path(f'{node_inputs_relpath}/{singlefiledata_linklabel}') -folderdata_path = pathlib.Path(f'{node_inputs_relpath}/{folderdata_linklabel}/relative_path') +folderdata_relative_path = 'relative_path' +folderdata_path = pathlib.Path(f'{folderdata_linklabel}/{folderdata_relative_path}') # ? 
Move this somewhere else @@ -53,53 +60,108 @@ def test_calcjob_node_inputs_dump(tmp_path, generate_calcjob_node_io): """Test that dumping of CalcJob node inputs works correctly.""" calcjob_node = generate_calcjob_node_io() + tmp_path_nested = tmp_path / node_inputs_relpath - # Run the dumping - calcjob_node_inputs_dump(calcjob_node=calcjob_node, output_path=tmp_path, inputs_relpath=node_inputs_relpath) + # Test the dumping results with flat=False + calcjob_node_inputs_dump(calcjob_node=calcjob_node, output_path=tmp_path_nested, flat=False) - # Test the dumping results - singlefiledata_outputpath = pathlib.Path(tmp_path / singlefiledata_path) + singlefiledata_outputpath = tmp_path_nested / singlefiledata_linklabel singlefiledata_outputfile = singlefiledata_outputpath / filename - - folderdata_outputpath = pathlib.Path(tmp_path / folderdata_path) + folderdata_outputpath = tmp_path_nested / folderdata_path folderdata_outputfile = folderdata_outputpath / filename assert singlefiledata_outputpath.is_dir() assert singlefiledata_outputfile.is_file() with open(singlefiledata_outputfile, 'r') as handle: - assert handle.read() == 'a' + assert handle.read() == filecontent assert folderdata_outputpath.is_dir() assert folderdata_outputfile.is_file() with open(folderdata_outputfile, 'r') as handle: - assert handle.read() == 'a' + assert handle.read() == filecontent + # Probably not actually necessary, as in the previous step they are dumped to `node_inputs` + clean_tmp_path(tmp_path=tmp_path) -def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): - dump_path = tmp_path / 'calcjob_dump_io' + # Test the dumping results with flat=True + calcjob_node_inputs_dump(calcjob_node=calcjob_node, output_path=tmp_path, flat=True) - singlefiledata_outputpath = pathlib.Path(dump_path / singlefiledata_path) - singlefiledata_outputfile = singlefiledata_outputpath / filename + singlefiledata_outputfile = tmp_path / filename - calcjob_node = generate_calcjob_node_io() - raw_input_file = dump_path / 'raw_inputs' / filename - raw_output_file = dump_path / 'raw_outputs' / filename + # Flat=True doesn't flatten nested directory structure of FolderData objects -> Leave relative path + folderdata_outputpath = tmp_path / folderdata_relative_path + folderdata_outputfile = folderdata_outputpath / filename - # Normal dumping - result = calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path) assert singlefiledata_outputfile.is_file() + with open(singlefiledata_outputfile, 'r') as handle: + assert handle.read() == filecontent + + assert folderdata_outputpath.is_dir() + assert folderdata_outputfile.is_file() + with open(folderdata_outputfile, 'r') as handle: + assert handle.read() == filecontent + +def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): + dump_path = tmp_path / 'calcjob_dump_io' + + calcjob_node = generate_calcjob_node_io() + + # todo: Test for _LOGGER.info outputs + # todo: Replace repititions for raw_input/output and node_inputs with loops # Checking the actual content should be handled by `test_copy_tree` + # Not testing for the folderdata-input here, as this should be covered by `test_calcjob_node_inputs_dump` + # It is dumped to 'relative_path/file.txt' in all cases, though, but just ignore + + # Normal dumping -> node_inputs and not flat; no paths provided + calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path) + + raw_input_file = dump_path / default_dump_paths[0] / filename + raw_output_file = dump_path / default_dump_paths[1] / filename + node_inputs_file = dump_path / 
default_dump_paths[2] / singlefiledata_linklabel / filename assert raw_input_file.is_file() assert raw_output_file.is_file() - assert result + assert node_inputs_file.is_file() + + clean_tmp_path(tmp_path=tmp_path) + + # Normal dumping -> node_inputs and not flat; custom paths provided + calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path, io_dump_paths=custom_dump_paths) + assert (dump_path / custom_dump_paths[0] / filename).is_file() # raw_inputs + assert (dump_path / custom_dump_paths[1] / filename).is_file() # raw_outputs + assert (dump_path / custom_dump_paths[2] / singlefiledata_linklabel / filename).is_file() # node_inputs, singlefile + + # Flat dumping -> no node_inputs and no paths provided -> Default paths should not be existent + calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path, flat=True) + assert not (dump_path / default_dump_paths[0] / filename).is_file() # raw_inputs + assert not (dump_path / default_dump_paths[1] / filename).is_file() # raw_outputs + assert not (dump_path / default_dump_paths[2] / filename).is_file() # node_inputs, singlefile + # Here, the same file will be written by raw_inputs and raw_outputs and node_inputs + # So it should only be present in the parent dump directory + assert (dump_path / filename).is_file() + + clean_tmp_path(tmp_path=tmp_path) + + # Flat dumping -> node_inputs and custom paths provided -> Test in custom paths + # todo: Test case of splitting the nested node_inputs based on double-underscore splitting not covered with the test + # todo: setup. This might be again too specific for QE? Like this, it's basically the same as the non-flat custom + # todo: path test above + calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path, io_dump_paths=custom_dump_paths, flat=True) + assert (dump_path / custom_dump_paths[0] / filename).is_file() # raw_inputs + assert (dump_path / custom_dump_paths[1] / filename).is_file() # raw_outputs + assert (dump_path / custom_dump_paths[2] / filename).is_file() # node_inputs, singlefile + # Could be shorter in that case, but not all of them, and listed might be more clear + # assert all((dump_path / path / filename).is_file() for path in custom_dump_paths) clean_tmp_path(tmp_path=tmp_path) # Don't dump the connected node inputs - result = calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path, include_inputs=True) - assert not singlefiledata_outputfile.is_file() - assert result + calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path, include_inputs=False) + assert not (dump_path / custom_dump_paths[2] / singlefiledata_linklabel / filename).is_file() + + # Test that it fails when it tries to overwrite without overwrite=True + with pytest.raises(FileExistsError): + calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path, overwrite=False) def test_calcjob_dump_arithmetic_add(tmp_path, aiida_localhost, generate_arithmetic_add_node): @@ -109,13 +171,12 @@ def test_calcjob_dump_arithmetic_add(tmp_path, aiida_localhost, generate_arithme add_node = generate_arithmetic_add_node(computer=aiida_localhost) # Normal dumping of ArithmeticAddCalculation node - result = calcjob_dump(calcjob_node=add_node, output_path=dump_path) - assert result + calcjob_dump(calcjob_node=add_node, output_path=dump_path) raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] - raw_input_files = [dump_path / 'raw_inputs' / raw_input_file for raw_input_file in 
raw_input_files]
-    raw_output_files = [dump_path / 'raw_outputs' / raw_output_file for raw_output_file in raw_output_files]
+    raw_input_files = [dump_path / default_dump_paths[0] / raw_input_file for raw_input_file in raw_input_files]
+    raw_output_files = [dump_path / default_dump_paths[1] / raw_output_file for raw_output_file in raw_output_files]
 
     assert all([raw_input_file.is_file() for raw_input_file in raw_input_files])
     assert all([raw_output_file.is_file() for raw_output_file in raw_output_files])
@@ -143,9 +204,8 @@ def test_process_dump_io(generate_work_chain_io, tmp_path):
     # Here, when setting `output_path=tmp_path`, no parent directory for the parent workchain is created
     # Therefore, go into tmp-directory used for testing, without specifying output path -> Closer to how people might
     # actually use the function
-    result = process_dump(process_node=wc_node, output_path=dump_parent_path)
+    process_dump(process_node=wc_node, output_path=dump_parent_path)
 
-    assert result
     assert all([expected_file.is_file() for expected_file in expected_files])
@@ -156,17 +216,16 @@ def test_process_dump_multiply_add(tmp_path, generate_multiply_add_node, aiida_l
 
     # Now test for output from running MultiplyAddWorkChain
     multiply_add_node = generate_multiply_add_node(computer=aiida_localhost)
-    result = process_dump(process_node=multiply_add_node, output_path=dump_parent_path)
-    assert result
+    process_dump(process_node=multiply_add_node, output_path=dump_parent_path)
 
     raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json']
     raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out']
     raw_input_files = [
-        dump_parent_path / '01-ArithmeticAddCalculation' / 'raw_inputs' / raw_input_file
+        dump_parent_path / '01-ArithmeticAddCalculation' / default_dump_paths[0] / raw_input_file
         for raw_input_file in raw_input_files
     ]
     raw_output_files = [
-        dump_parent_path / '01-ArithmeticAddCalculation' / 'raw_outputs' / raw_output_file
+        dump_parent_path / '01-ArithmeticAddCalculation' / default_dump_paths[1] / raw_output_file
         for raw_output_file in raw_output_files
     ]

From 009b2aafafc4f22b553f895f01975739d27ba726 Mon Sep 17 00:00:00 2001
From: Julian Geiger
Date: Thu, 25 Apr 2024 10:02:24 +0200
Subject: [PATCH 10/30] Finalized tests apart from YAML dumping

- Moved both `generate_..._io` functions to the end of the file
- Extended `generate_workchain_io` fixture to allow adding multiple
  `calcjob_nodes` to test that the flat dumping breaks in that case
- Currently, when dumping the MultiplyAddWorkChain flat, the `source_file`
  of the `multiply` step is missing -> Still need to figure that one out

---
 src/aiida/tools/dumping/processes.py  | 123 +++++----
 tests/conftest.py                     | 106 ++++---
 tests/tools/dumping/test_processes.py | 383 ++++++++++++++++++++------
 3 files changed, 420 insertions(+), 192 deletions(-)

diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py
index f49187bdb8..97127fb639 100644
--- a/src/aiida/tools/dumping/processes.py
+++ b/src/aiida/tools/dumping/processes.py
@@ -26,6 +26,7 @@
     WorkFunctionNode,
 )
 from aiida.orm.utils import LinkTriple
+from aiida.repository import File
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -97,12 +98,12 @@ def dump_yaml(
             pass
 
         # Add node attributes
-        if self.include_attributes is True:
+        if self.include_attributes:
             node_attributes = process_node.base.attributes.all
             node_dict['Node attributes'] = node_attributes
 
         # Add node extras
-        if self.include_extras is True:
+        if self.include_extras: 
node_extras = process_node.base.extras.all
             if node_extras:
                 node_dict['Node extras'] = node_extras
@@ -163,6 +164,7 @@ def generate_default_dump_path(process_node: WorkChainNode | CalcF
 def validate_make_dump_path(
     path: Path,
     overwrite: bool = False,
+    safeguard_file: str = '.aiida_node_metadata.yaml'
 ) -> Path:
     """
     Create default dumping directory for a given process node and return it as absolute path.
@@ -172,10 +174,6 @@ def validate_make_dump_path(
     """
     import shutil
 
-    # ? Use of `echo.echo_` only when running via `verdi`? -> I only see it used in the `cmd_` files.
-    if path is None:
-        raise ValueError('Path not set.')
-
     if path.is_dir():
         # Existing, but empty directory => OK
         if not any(path.iterdir()):
@@ -187,41 +185,59 @@ def validate_make_dump_path(
 
         # Existing and non-empty directory and overwrite True => Check for '.aiida_node_metadata.yaml' for safety
         # '.aiida_node_metadata.yaml' present => Remove directory
-        elif (path / '.aiida_node_metadata.yaml').is_file():
+        elif (path / safeguard_file).is_file():
             _LOGGER.info(f'Overwrite set to true, will overwrite directory `{path}`.')
             shutil.rmtree(path)
             path.mkdir(parents=True, exist_ok=False)
 
-        # Existing and non-empty directory and overwrite True => Check for README for safety
-        # '.aiida_node_metadata.yaml' absent => Remove directory
+        # Existing and non-empty directory and overwrite True => Check for safeguard_file (e.g.
+        # '.aiida_node_metadata.yaml') for safety reasons (don't want to recursively delete the wrong directory...)
         else:
-            _LOGGER.critical(
-                f"`{path}` Path exists but no `.aiida_node_metadata.yaml` found. Won't delete for security.\n"
-                # f'Manually remove existing `{path}` and dump again.'
-            )
+            # _LOGGER.critical(
+            #     f"`{path}` Path exists but no `.aiida_node_metadata.yaml` found. Won't delete for security.\n"
+            #     # f'Manually remove existing `{path}` and dump again.'
+            # )
+            raise FileExistsError(
+                f"Path `{path}` already exists and doesn't contain `.aiida_node_metadata.yaml`. Not removing for safety reasons."
+            )
 
+    # Not included in else as to avoid having to repeat the `mkdir` call. 
`exist_ok=True` as checks implemented above path.mkdir(exist_ok=True, parents=True) return path.resolve() -def generate_node_input_label(index: int, link_triple: LinkTriple) -> str: +def generate_node_input_label(index: int, link_triple: LinkTriple, flat: bool = False) -> str: """Small helper function to generate the directory label for node inputs.""" node = link_triple.node link_label = link_triple.link_label # Generate directories with naming scheme akin to `verdi process status` - node_label = f'{index:02d}-{link_label}' + if not flat: + # node_label = f'{index:02d}-{link_label}' + label_list = [f'{index:02d}', link_label] - try: - process_label = node.process_label - if process_label is not None and process_label != link_label: - node_label += f'-{process_label}' + try: + process_label = node.process_label + if process_label is not None and process_label != link_label: + label_list += [process_label] + # node_label += f'-{process_label}' - except AttributeError: - process_type = node.process_type - if process_type is not None and process_type != link_label: - node_label += f'-{process_type}' + except AttributeError: + process_type = node.process_type + if process_type is not None and process_type != link_label: + label_list += [process_type] + # node_label += f'-{process_type}' + + else: + label_list = [] + + if isinstance(node, File): + label_list += [node.name] + + node_label = '-'.join(label_list) + # `CALL-` as part of the link labels also for MultiplyAddWorkChain, so remove for generality + node_label = node_label.replace('CALL-', '') return node_label @@ -250,6 +266,7 @@ def calcjob_dump( try: validate_make_dump_path(path=output_path, overwrite=overwrite) except: + # raise same exception here to communicate it outwards raise default_io_dump_paths = ['raw_inputs', 'raw_outputs', 'node_inputs'] @@ -306,11 +323,10 @@ def process_dump( :param include_inputs: If True, include file or folder inputs in the dump. Defaults to True. :param node_dumper: The ProcessNodeYamlDumper instance to use for dumping node metadata. If not provided, a new instance will be created. Defaults to None. - :return: bool """ - # ? Realized during testing: If no path provided, only for the sub-workchain an additional directory is created, but - # ? should also create one here, if the function is imported normally and used in Python scripts + # Realized during testing: If no path provided, only for the sub-workchain an additional directory is created, but + # should also create one here, if the function is imported normally and used in Python scripts if output_path is None: output_path = generate_default_dump_path(process_node=process_node) @@ -320,13 +336,13 @@ def process_dump( if node_dumper is None: node_dumper = ProcessNodeYamlDumper() - # Need to dump for `WorkChainNode`s, as well, otherwise only `CalcJobNode`s + # Need to dump for parent ProcessNode, as well, otherwise no metadata file in parent ProcessNode directory node_dumper.dump_yaml(process_node=process_node, output_path=output_path.resolve()) # This seems a bit duplicated, but if the logic for checking the types should be contained in the recursive - # `process_dump` function which is the only one called from `verdi`, then I need to dump for a `CalcJob` here, as - # well. Also, if I want to be able to use `process_dump` from within the Python API - if isinstance(process_node, CalcJobNode): + # `process_dump` function called by `verdi`, then I need to dump for a `CalcJob` here, as + # well. 
Also, if I want to be able to use `process_dump` via the Python API + if isinstance(process_node, (CalcFunctionNode, CalcJobNode)): calcjob_dump( calcjob_node=process_node, output_path=output_path, @@ -337,40 +353,42 @@ def process_dump( ) # Recursive call for WorkChainNode - elif isinstance(process_node, WorkChainNode): - # Don't increment index for `ProcessNodes` that don't (always?) have file IO - # (`CalcFunctionNodes`/`WorkFunctionNodes`), such as `create_kpoints_from_distance` + # todo: Rather than checking for both, I could check for subclass of WorkFlowNode + elif isinstance(process_node, (WorkChainNode, WorkFunctionNode)): + called_links = process_node.base.links.get_outgoing(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all() - # todo: Add check if flat is True, and multiple sub-workchains, that it raises exception. Otherwise, if - # todo: only one top-level workchain, dump everything from the single calcjob that was called in the - # todo: workchain in the main workchain directory - called_descendants = process_node.called_descendants - if flat and [isinstance(node, CalcJobNode) for node in called_descendants].count(True) > 1: + # If multiple CalcJobs contained in Workchain flat=True doesn't make sense as files would be overwritten + # -> Well, if different CalcJobs run, it could still make sense, but would one really want all these files in + # one flat directory? + called_calcjobs = [isinstance(node, CalcJobNode) for node in process_node.called_descendants] + if flat and called_calcjobs.count(True) > 1: + # Add error message here or when capturing `NotImplementedError` raise NotImplementedError - for index, link_triple in enumerate( - sorted(called_links, key=lambda link_triple: link_triple.node.ctime), start=1 - ): + sorted_called_links = sorted(called_links, key=lambda link_triple: link_triple.node.ctime) + + for index, link_triple in enumerate(sorted_called_links, start=1): child_node = link_triple.node if not flat: - child_label = generate_node_input_label(index=index, link_triple=link_triple) + child_label = generate_node_input_label(index=index, link_triple=link_triple, flat=flat) else: - child_label = '' + #test + child_label = generate_node_input_label(index=index, link_triple=link_triple, flat=flat) + # child_label = '' child_output_path = output_path.resolve() / child_label # Recursive function call for `WorkChainNode`` # Not sure if the next two cases work for `WorkFunction` and `CalcFuncion``Node`s if isinstance(child_node, (WorkChainNode, WorkFunctionNode)): - if flat: - process_dump( - process_node=child_node, - output_path=child_output_path, - include_inputs=include_inputs, - node_dumper=node_dumper, - overwrite=overwrite, - flat=flat, - ) + process_dump( + process_node=child_node, + output_path=child_output_path, + include_inputs=include_inputs, + node_dumper=node_dumper, + overwrite=overwrite, + flat=flat, + ) elif isinstance(child_node, (CalcJobNode, CalcFunctionNode)): calcjob_dump( @@ -399,7 +417,8 @@ def calcjob_node_inputs_dump(calcjob_node: CalcJobNode | CalcFunctionNode, outpu if not flat: input_node_path = output_path / Path(*input_node_triple.link_label.split('__')) else: - # Don't use link_label at all + # Don't use link_label at all -> But, relative path inside FolderData is retained + # ! 
This is not the issue why `source_file` is not dumped for MultiplyAddWorkChain when flat=True input_node_path = output_path input_node_triple.node.base.repository.copy_tree(input_node_path.resolve()) diff --git a/tests/conftest.py b/tests/conftest.py index c2fa03dd01..4efa836f6e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -676,6 +676,51 @@ def reset_log_level(): log.configure_logging(with_orm=True) +@pytest.fixture +def generate_arithmetic_add_node(): + def _generate_arithmetic_add_node(computer): + from aiida.engine import run_get_node + from aiida.orm import InstalledCode, Int + from aiida.plugins import CalculationFactory + + arithmetic_add = CalculationFactory('core.arithmetic.add') + + add_inputs = { + 'x': Int(1), + 'y': Int(2), + 'code': InstalledCode(computer=computer, filepath_executable='/bin/bash'), + } + + _, add_node = run_get_node(arithmetic_add, **add_inputs) + + return add_node + + return _generate_arithmetic_add_node + + +@pytest.fixture +def generate_multiply_add_node(): + def _generate_multiply_add_node(computer): + from aiida.engine import run_get_node + from aiida.orm import InstalledCode, Int + from aiida.plugins import WorkflowFactory + + multiplyaddworkchain = WorkflowFactory('core.arithmetic.multiply_add') + + multiply_add_inputs = { + 'x': Int(1), + 'y': Int(2), + 'z': Int(3), + 'code': InstalledCode(computer=computer, filepath_executable='/bin/bash'), + } + + _, multiply_add_node = run_get_node(multiplyaddworkchain, **multiply_add_inputs) + + return multiply_add_node + + return _generate_multiply_add_node + + @pytest.fixture def generate_calcjob_node_io(): def _generate_calcjob_node_io( @@ -737,74 +782,25 @@ def _generate_calcjob_node_io( @pytest.fixture -def generate_arithmetic_add_node(): - def _generate_arithmetic_add_node(computer): - from aiida.engine import run_get_node - from aiida.orm import InstalledCode, Int - from aiida.plugins import CalculationFactory - - arithmetic_add = CalculationFactory('core.arithmetic.add') - - add_inputs = { - 'x': Int(1), - 'y': Int(2), - 'code': InstalledCode(computer=computer, filepath_executable='/bin/bash'), - } - - _, add_node = run_get_node(arithmetic_add, **add_inputs) - - return add_node - - return _generate_arithmetic_add_node - - -@pytest.fixture -def generate_multiply_add_node(): - def _generate_multiply_add_node(computer): - from aiida.engine import run_get_node - from aiida.orm import InstalledCode, Int - from aiida.plugins import WorkflowFactory - - multiplyaddworkchain = WorkflowFactory('core.arithmetic.multiply_add') - - multiply_add_inputs = { - 'x': Int(1), - 'y': Int(2), - 'z': Int(3), - 'code': InstalledCode(computer=computer, filepath_executable='/bin/bash'), - } - - _, multiply_add_node = run_get_node(multiplyaddworkchain, **multiply_add_inputs) - - return multiply_add_node - - return _generate_multiply_add_node - - -@pytest.fixture -def generate_work_chain_io(generate_calcjob_node_io): - def _generate_work_chain_io(): +def generate_workchain_node_io(): + def _generate_workchain_node_io(cj_nodes): """Generate an instance of a `WorkChain` that contains a sub-`WorkChain` and a `CalcJob` with file io.""" from aiida.orm import WorkChainNode wc_node = WorkChainNode() wc_node_sub = WorkChainNode() - cj_node = generate_calcjob_node_io(attach_outputs=False) - - # wc_node.process_label = 'main' - # wc_node_sub.process_label = 'sub' - # cj_node.process_label = 'calc' # Add sub-workchain that calls a calcjob wc_node_sub.base.links.add_incoming(wc_node, link_type=LinkType.CALL_WORK, 
link_label='sub_workchain') - cj_node.base.links.add_incoming(wc_node_sub, link_type=LinkType.CALL_CALC, link_label='calcjob') + for i, cj_node in enumerate(cj_nodes): + cj_node.base.links.add_incoming(wc_node_sub, link_type=LinkType.CALL_CALC, link_label=f'calcjob_{i}') # Need to store nodes so that the relationships are being picked up in the `get_outgoing` call (for # `get_incoming` they work without being stored) wc_node.store() wc_node_sub.store() - cj_node.store() + [cj_node.store() for cj_node in cj_nodes] return wc_node - return _generate_work_chain_io + return _generate_workchain_node_io diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index 1446d71e9b..151327083f 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -24,6 +24,7 @@ generate_default_dump_path, generate_node_input_label, process_dump, + validate_make_dump_path, ) # Define parameters for the dumping @@ -38,11 +39,11 @@ # Define some variables used for constructing the nodes used to test the dumping singlefiledata_linklabel = 'singlefile_input' folderdata_linklabel = 'folderdata_input' -folderdata_relative_path = 'relative_path' -folderdata_path = pathlib.Path(f'{folderdata_linklabel}/{folderdata_relative_path}') +folderdata_internal_path = 'relative_path' +folderdata_path = pathlib.Path(f'{folderdata_linklabel}/{folderdata_internal_path}') -# ? Move this somewhere else +# ? Move this somewhere else? def clean_tmp_path(tmp_path: Path): """ Recursively delete files and directories in a path, e.g. a temporary path used by pytest. @@ -60,108 +61,169 @@ def test_calcjob_node_inputs_dump(tmp_path, generate_calcjob_node_io): """Test that dumping of CalcJob node inputs works correctly.""" calcjob_node = generate_calcjob_node_io() - tmp_path_nested = tmp_path / node_inputs_relpath + dump_parent_path = tmp_path / node_inputs_relpath - # Test the dumping results with flat=False - calcjob_node_inputs_dump(calcjob_node=calcjob_node, output_path=tmp_path_nested, flat=False) + # Check the dumping results with flat=False - singlefiledata_outputpath = tmp_path_nested / singlefiledata_linklabel - singlefiledata_outputfile = singlefiledata_outputpath / filename - folderdata_outputpath = tmp_path_nested / folderdata_path - folderdata_outputfile = folderdata_outputpath / filename + # Expected tree: + # node_inputs + # ├── folderdata_input + # │ └── relative_path + # │ └── file.txt + # └── singlefile_input + # └── file.txt - assert singlefiledata_outputpath.is_dir() - assert singlefiledata_outputfile.is_file() - with open(singlefiledata_outputfile, 'r') as handle: + calcjob_node_inputs_dump(calcjob_node=calcjob_node, output_path=dump_parent_path) + assert (dump_parent_path / singlefiledata_linklabel).is_dir() + assert (dump_parent_path / singlefiledata_linklabel / filename).is_file() + assert (dump_parent_path / folderdata_path).is_dir() + assert (dump_parent_path / folderdata_path / filename).is_file() + + with open(dump_parent_path / singlefiledata_linklabel / filename, 'r') as handle: assert handle.read() == filecontent - assert folderdata_outputpath.is_dir() - assert folderdata_outputfile.is_file() - with open(folderdata_outputfile, 'r') as handle: + with open(dump_parent_path / folderdata_path / filename, 'r') as handle: assert handle.read() == filecontent # Probably not actually necessary, as in the previous step they are dumped to `node_inputs` clean_tmp_path(tmp_path=tmp_path) - # Test the dumping results with flat=True - 
calcjob_node_inputs_dump(calcjob_node=calcjob_node, output_path=tmp_path, flat=True) + # Check the dumping results with flat=True + + # Expected tree: + # node_inputs + # ├── file.txt + # └── relative_path + # └── file.txt - singlefiledata_outputfile = tmp_path / filename + calcjob_node_inputs_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, flat=True) # Flat=True doesn't flatten nested directory structure of FolderData objects -> Leave relative path - folderdata_outputpath = tmp_path / folderdata_relative_path - folderdata_outputfile = folderdata_outputpath / filename + assert (dump_parent_path / folderdata_internal_path).is_dir() + assert (dump_parent_path / folderdata_internal_path / filename).is_file() - assert singlefiledata_outputfile.is_file() - with open(singlefiledata_outputfile, 'r') as handle: + assert (dump_parent_path / filename).is_file() + with open(dump_parent_path / filename, 'r') as handle: assert handle.read() == filecontent - assert folderdata_outputpath.is_dir() - assert folderdata_outputfile.is_file() - with open(folderdata_outputfile, 'r') as handle: + with open(dump_parent_path / folderdata_internal_path / filename, 'r') as handle: assert handle.read() == filecontent + # todo: test here with ArithmeticAdd as well + def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): - dump_path = tmp_path / 'calcjob_dump_io' + dump_parent_path = tmp_path / 'cj-dump-test-io' + + # Here, check for attached `retrieved` outputs, as well calcjob_node = generate_calcjob_node_io() # todo: Test for _LOGGER.info outputs - # todo: Replace repititions for raw_input/output and node_inputs with loops # Checking the actual content should be handled by `test_copy_tree` # Not testing for the folderdata-input here, as this should be covered by `test_calcjob_node_inputs_dump` # It is dumped to 'relative_path/file.txt' in all cases, though, but just ignore # Normal dumping -> node_inputs and not flat; no paths provided - calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path) - raw_input_file = dump_path / default_dump_paths[0] / filename - raw_output_file = dump_path / default_dump_paths[1] / filename - node_inputs_file = dump_path / default_dump_paths[2] / singlefiledata_linklabel / filename - assert raw_input_file.is_file() - assert raw_output_file.is_file() - assert node_inputs_file.is_file() + # Expected tree: + # cj-dump-test-io + # ├── node_inputs + # │ ├── folderdata_input + # │ │ └── relative_path + # │ │ └── file.txt + # │ └── singlefile_input + # │ └── file.txt + # ├── raw_inputs + # │ └── file.txt + # └── raw_outputs + # └── file.txt + + calcjob_dump(calcjob_node=calcjob_node, output_path=dump_parent_path) + + assert (dump_parent_path / default_dump_paths[0] / filename).is_file() + assert (dump_parent_path / default_dump_paths[1] / filename).is_file() + assert (dump_parent_path / default_dump_paths[2] / singlefiledata_linklabel / filename).is_file() clean_tmp_path(tmp_path=tmp_path) # Normal dumping -> node_inputs and not flat; custom paths provided - calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path, io_dump_paths=custom_dump_paths) - assert (dump_path / custom_dump_paths[0] / filename).is_file() # raw_inputs - assert (dump_path / custom_dump_paths[1] / filename).is_file() # raw_outputs - assert (dump_path / custom_dump_paths[2] / singlefiledata_linklabel / filename).is_file() # node_inputs, singlefile - - # Flat dumping -> no node_inputs and no paths provided -> Default paths should not be existent - calcjob_dump(calcjob_node=calcjob_node, 
output_path=dump_path, flat=True) - assert not (dump_path / default_dump_paths[0] / filename).is_file() # raw_inputs - assert not (dump_path / default_dump_paths[1] / filename).is_file() # raw_outputs - assert not (dump_path / default_dump_paths[2] / filename).is_file() # node_inputs, singlefile + + # Expected tree: + # cj-dump-test-io + # ├── node_inputs_ + # │ ├── folderdata_input + # │ │ └── relative_path + # │ │ └── file.txt + # │ └── singlefile_input + # │ └── file.txt + # ├── raw_inputs_ + # │ └── file.txt + # └── raw_outputs_ + # └── file.txt + + calcjob_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths) + assert (dump_parent_path / custom_dump_paths[0] / filename).is_file() # raw_inputs + assert (dump_parent_path / custom_dump_paths[1] / filename).is_file() # raw_outputs + # node_inputs, singlefile + assert (dump_parent_path / custom_dump_paths[2] / singlefiledata_linklabel / filename).is_file() + + clean_tmp_path(tmp_path=tmp_path) + + # Flat dumping -> no paths provided -> Default paths should not be existent. Internal FolderData structure retained. + # Expected tree: + # cj-dump-test-io + # ├── file.txt + # └── relative_path + # └── file.txt + + calcjob_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, flat=True) + assert not (dump_parent_path / default_dump_paths[0] / filename).is_file() # raw_inputs + assert not (dump_parent_path / default_dump_paths[1] / filename).is_file() # raw_outputs + assert not (dump_parent_path / default_dump_paths[2] / filename).is_file() # node_inputs, singlefile # Here, the same file will be written by raw_inputs and raw_outputs and node_inputs # So it should only be present in the parent dump directory - assert (dump_path / filename).is_file() + assert (dump_parent_path / filename).is_file() clean_tmp_path(tmp_path=tmp_path) - # Flat dumping -> node_inputs and custom paths provided -> Test in custom paths + # Flat dumping -> node_inputs and custom paths provided -> Test in custom paths, + # But no subdirectories named after the link-labels under `node_inputs_` + # Expected path: + # cj-dump-test-io + # ├── node_inputs_ + # │ ├── file.txt + # │ └── relative_path + # │ └── file.txt + # ├── raw_inputs_ + # │ └── file.txt + # └── raw_outputs_ + # └── file.txt + # todo: Test case of splitting the nested node_inputs based on double-underscore splitting not covered with the test - # todo: setup. This might be again too specific for QE? Like this, it's basically the same as the non-flat custom - # todo: path test above - calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path, io_dump_paths=custom_dump_paths, flat=True) - assert (dump_path / custom_dump_paths[0] / filename).is_file() # raw_inputs - assert (dump_path / custom_dump_paths[1] / filename).is_file() # raw_outputs - assert (dump_path / custom_dump_paths[2] / filename).is_file() # node_inputs, singlefile - # Could be shorter in that case, but not all of them, and listed might be more clear - # assert all((dump_path / path / filename).is_file() for path in custom_dump_paths) + # todo: setup. This might be again too specific for QE? 
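    # Illustrative aside, not an assertion on the fixtures above: `calcjob_node_inputs_dump` splits link labels
    # on double underscores via `Path(*link_label.split('__'))`, so a hypothetical nested label 'pseudos__Si'
    # would land in `node_inputs/pseudos/Si/` with flat=False, while flat=True drops the link label entirely.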
+ calcjob_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths, flat=True) + assert (dump_parent_path / custom_dump_paths[0] / filename).is_file() # raw_inputs + assert (dump_parent_path / custom_dump_paths[1] / filename).is_file() # raw_outputs + assert (dump_parent_path / custom_dump_paths[2] / filename).is_file() # node_inputs, singlefile + + clean_tmp_path(tmp_path=tmp_path) + + # Don't dump the connected node inputs for both, flat is True/False + calcjob_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, include_inputs=False) + assert not (dump_parent_path / custom_dump_paths[2]).is_dir() clean_tmp_path(tmp_path=tmp_path) - # Don't dump the connected node inputs - calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path, include_inputs=False) - assert not (dump_path / custom_dump_paths[2] / singlefiledata_linklabel / filename).is_file() + calcjob_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, include_inputs=False, flat=True) + assert not (dump_parent_path / custom_dump_paths[2]).is_dir() - # Test that it fails when it tries to overwrite without overwrite=True + clean_tmp_path(tmp_path=tmp_path) + + # Check that it fails when it tries to create the same directory without overwrite=True + calcjob_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, overwrite=False) with pytest.raises(FileExistsError): - calcjob_dump(calcjob_node=calcjob_node, output_path=dump_path, overwrite=False) + calcjob_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, overwrite=False) def test_calcjob_dump_arithmetic_add(tmp_path, aiida_localhost, generate_arithmetic_add_node): @@ -184,56 +246,153 @@ def test_calcjob_dump_arithmetic_add(tmp_path, aiida_localhost, generate_arithme clean_tmp_path(tmp_path=tmp_path) -def test_process_dump_io(generate_work_chain_io, tmp_path): - wc_node = generate_work_chain_io() +def test_process_dump_io(generate_calcjob_node_io, generate_workchain_node_io, tmp_path): + + # Expected tree: + # wc-dump-test-io + # └── 01-sub_workchain + # └── 01-calcjob + # ├── node_inputs + # │ ├── folderdata_input + # │ │ └── relative_path + # │ │ └── file.txt + # │ └── singlefile_input + # │ └── file.txt + # └── raw_inputs + # └── file.txt - dump_parent_path = tmp_path / 'wc-dump-test' + # Don't attach outputs, as this would require storing the calcjob_node, and it cannot be added. 
Dumping of outputs + # should be taken care of by `test_calcjob_dump` + cj_node = generate_calcjob_node_io(attach_outputs=False) + wc_node = generate_workchain_node_io(cj_nodes=[cj_node]) - raw_input_path = '01-sub_workchain/01-calcjob/raw_inputs/file.txt' - singlefiledata_path = '01-sub_workchain/01-calcjob/node_inputs/singlefile_input/file.txt' - folderdata_path = '01-sub_workchain/01-calcjob/node_inputs/folderdata_input/relative_path/file.txt' + # Need to generate parent path for dumping, as I don't want the sub-workchains to be dumped directly into `tmp_path` + # Other option would be to cd into `tmp_path` and then letting the default label be created + dump_parent_path = tmp_path / 'wc-dump-test-io' + + # Don't test for `README` here, as this is only created when dumping is done via `verdi` + raw_input_path = '01-sub_workchain/01-calcjob_0/raw_inputs/file.txt' + singlefiledata_path = '01-sub_workchain/01-calcjob_0/node_inputs/singlefile_input/file.txt' + folderdata_path = '01-sub_workchain/01-calcjob_0/node_inputs/folderdata_input/relative_path/file.txt' node_metadata_paths = [ '.aiida_node_metadata.yaml', '01-sub_workchain/.aiida_node_metadata.yaml', - '01-sub_workchain/01-calcjob/.aiida_node_metadata.yaml', + '01-sub_workchain/01-calcjob_0/.aiida_node_metadata.yaml', ] - # Don't test for `README` here, as this is only created when dumping is done via `verdi` + expected_files = [raw_input_path, singlefiledata_path, folderdata_path, *node_metadata_paths] expected_files = [dump_parent_path / expected_file for expected_file in expected_files] - # Here, when setting `output_path=tmp_path`, no parent directory for the parent workchain is created - # Therefore, go into tmp-directory used for testing, without specifying output path -> Closer to how people might - # actually use the function process_dump(process_node=wc_node, output_path=dump_parent_path) assert all([expected_file.is_file() for expected_file in expected_files]) + clean_tmp_path(tmp_path=dump_parent_path) + + # Check directory tree when flat=True + + # Expected tree: + # wc-dump-test-io + # ├── file.txt + # └── relative_path + # └── file.txt + + process_dump(process_node=wc_node, output_path=dump_parent_path, flat=True) + assert (dump_parent_path / filename).is_file() + # Internal hierarchy of the FolderData is retained + assert (dump_parent_path / folderdata_internal_path / filename).is_file() + + clean_tmp_path(tmp_path=dump_parent_path) + + # Check that dumping fails if multiple CalcJobs run by the workchain if flat=True + cj_nodes = [generate_calcjob_node_io(attach_outputs=False), generate_calcjob_node_io(attach_outputs=False)] + wc_node = generate_workchain_node_io(cj_nodes=cj_nodes) + with pytest.raises(NotImplementedError): + process_dump(process_node=wc_node, output_path=dump_parent_path, flat=True) + def test_process_dump_multiply_add(tmp_path, generate_multiply_add_node, aiida_localhost): - # Still set directory fixed to make dump directory reproducible (it should be anyway, but contains e.g. 
+
+    # Testing for files in hidden .aiida folder here, but not in more complex io functions
     dump_parent_path = tmp_path / 'multiply_add-dump-test'
-    # Now test for output from running MultiplyAddWorkChain
     multiply_add_node = generate_multiply_add_node(computer=aiida_localhost)

+    # Dump with flat=True
+    # Expected tree:
+    # multiply_add-dump-test
+    # ├── .aiida
+    # │   ├── calcinfo.json
+    # │   └── job_tmpl.json
+    # ├── .aiida_node_metadata.yaml
+    # ├── _aiidasubmit.sh
+    # ├── _scheduler-stderr.txt
+    # ├── _scheduler-stdout.txt
+    # ├── aiida.in
+    # ├── aiida.out
+    #!└── source_file missing
+
+    process_dump(process_node=multiply_add_node, output_path=dump_parent_path, flat=True)
+
+    raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json']
+    # raw_input_files += ['source_file']
+    raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out']
+    raw_input_files = [dump_parent_path / raw_input_file for raw_input_file in raw_input_files]
+    raw_output_files = [dump_parent_path / raw_output_file for raw_output_file in raw_output_files]
+
+    # ! source_file is missing -> Why?
+    assert all([raw_input_file.is_file() for raw_input_file in raw_input_files])
+    assert all([raw_output_file.is_file() for raw_output_file in raw_output_files])
+
+    clean_tmp_path(tmp_path=tmp_path)
+
+    # Dump with flat=False
+    # Expected tree:
+    # multiply_add-dump-test
+    # ├── 01-multiply
+    # │   └── raw_inputs
+    # │       └── source_file
+    # └── 02-ArithmeticAddCalculation
+    #     ├── raw_inputs
+    #     │   ├── .aiida
+    #     │   │   ├── calcinfo.json
+    #     │   │   └── job_tmpl.json
+    #     │   ├── _aiidasubmit.sh
+    #     │   └── aiida.in
+    #     └── raw_outputs
+    #         ├── _scheduler-stderr.txt
+    #         ├── _scheduler-stdout.txt
+    #         └── aiida.out
+
     process_dump(process_node=multiply_add_node, output_path=dump_parent_path)

     raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json']
     raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out']
     raw_input_files = [
-        dump_parent_path / '01-ArithmeticAddCalculation' / default_dump_paths[0] / raw_input_file
+        dump_parent_path / '02-ArithmeticAddCalculation' / default_dump_paths[0] / raw_input_file
         for raw_input_file in raw_input_files
     ]
+    raw_input_files += [dump_parent_path / '01-multiply' / default_dump_paths[0] / 'source_file']
     raw_output_files = [
-        dump_parent_path / '01-ArithmeticAddCalculation' / default_dump_paths[1] / raw_output_file
+        dump_parent_path / '02-ArithmeticAddCalculation' / default_dump_paths[1] / raw_output_file
         for raw_output_file in raw_output_files
     ]

+    # No node_inputs contained in MultiplyAddWorkChain
     assert all([raw_input_file.is_file() for raw_input_file in raw_input_files])
     assert all([raw_output_file.is_file() for raw_output_file in raw_output_files])


+
 def test_generate_default_dump_path(generate_arithmetic_add_node, generate_multiply_add_node, aiida_localhost):
+
     add_node = generate_arithmetic_add_node(computer=aiida_localhost)
     multiply_add_node = generate_multiply_add_node(computer=aiida_localhost)
     add_path = generate_default_dump_path(process_node=add_node)
@@ -243,26 +402,80 @@ def test_generate_default_dump_path(generate_arithmetic_add_node, generate_multi
     assert
str(add_path) == f'dump-ArithmeticAddCalculation-{add_node.pk}' assert str(multiply_add_path) == f'dump-MultiplyAddWorkChain-{multiply_add_node.pk}' + # todo: test for io_function? -def test_generate_node_input_label(generate_multiply_add_node, generate_work_chain_io, aiida_localhost): - # Test with MultiplyAddWorkChain inputs and outputs - # Test with manually constructed, more complex workchain - wc_node = generate_work_chain_io() +def test_generate_node_input_label( + generate_multiply_add_node, generate_calcjob_node_io, generate_workchain_node_io, aiida_localhost +): + # Check with manually constructed, more complex workchain + cj_node = generate_calcjob_node_io(attach_outputs=False) + wc_node = generate_workchain_node_io(cj_nodes=[cj_node]) wc_output_triples = wc_node.base.links.get_outgoing().all() sub_wc_node = wc_output_triples[0].node output_triples = wc_output_triples + sub_wc_node.base.links.get_outgoing().all() - output_labels = [generate_node_input_label(_, output_node) for _, output_node in enumerate(output_triples)] - assert output_labels == ['00-sub_workchain', '01-calcjob'] + output_labels = sorted([generate_node_input_label(_, output_node) for _, output_node in enumerate(output_triples)]) + assert output_labels == ['00-sub_workchain', '01-calcjob_0'] + + # Check with multiply_add workchain node + multiply_add_node = generate_multiply_add_node(computer=aiida_localhost) + output_triples = multiply_add_node.base.links.get_outgoing().all() + output_labels = sorted([generate_node_input_label(_, output_node) for _, output_node in enumerate(output_triples)]) + assert output_labels == ['00-multiply', '01-ArithmeticAddCalculation', '02-result'] + + +def test_validate_make_dump_path(chdir_tmp_path, tmp_path): + + chdir_tmp_path - # ? Not really testing for more complex cases that actually contain 'CALL' or 'iteration_' here + test_dir = Path('test-dir') + test_dir_abs = tmp_path / test_dir + safeguard_file = '.aiida_node_metadata.yaml' + # Path must be provided + with pytest.raises(TypeError): + validate_make_dump_path() + + # Check if path created if non-existent + output_path = validate_make_dump_path(path=test_dir) + assert output_path == test_dir_abs + + clean_tmp_path(tmp_path=tmp_path) + + # Empty path is fine -> No error and full path returned + test_dir_abs.mkdir() + output_path = validate_make_dump_path(path=test_dir) + assert output_path == test_dir_abs + + clean_tmp_path(tmp_path=tmp_path) + + # Fails if directory not empty and overwrite set to False + test_dir_abs.mkdir() + (test_dir_abs / filename).touch() + with pytest.raises(FileExistsError): + output_path = validate_make_dump_path(path=test_dir) + assert (test_dir_abs / filename).is_file() + + clean_tmp_path(tmp_path=tmp_path) + + # Fails if directory not empty and overwrite set to True, but safeguard_file not found (for safety reasons) + test_dir_abs.mkdir() + (test_dir_abs / filename).touch() + with pytest.raises(FileExistsError): + output_path = validate_make_dump_path(path=test_dir, overwrite=True) + assert (test_dir_abs / filename).is_file() + + clean_tmp_path(tmp_path=tmp_path) -def test_validate_make_dump_path(chdir_tmp_path): - pass + # Works if directory not empty, but overwrite=True and safeguard_file (e.g. 
`.aiida_node_metadata.yaml`) contained + test_dir_abs.mkdir() + (test_dir_abs / safeguard_file).touch() + output_path = validate_make_dump_path(path=test_dir, overwrite=True, safeguard_file=safeguard_file) + assert output_path == test_dir_abs + assert not (test_dir_abs / safeguard_file).is_file() def test_dump_yaml(): - pass + assert False From 5b0e97f5cdfb7f7c98ff4b0cfeb6ceabb61b5b1a Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 25 Apr 2024 10:20:42 +0200 Subject: [PATCH 11/30] Output of `process status/report/show/` to README --- src/aiida/cmdline/commands/cmd_process.py | 2 +- src/aiida/tools/dumping/processes.py | 37 +++++++++++++++++++---- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index 45a0e773c5..5a5af7426a 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -561,4 +561,4 @@ def dump( # Create README in parent directory # Done after dumping, so that dumping directory is there. Dumping directory is created within the calcjob_dump and # process_dump files such that they can also be used from within the Python API, not just via verdi - make_dump_readme(output_path=output_path, process_node=process) \ No newline at end of file + make_dump_readme(output_path=output_path, process_node=process) diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 97127fb639..1a8bbbf971 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -121,6 +121,15 @@ def make_dump_readme(process_node: ProcessNode, output_path: Path): :param output_path: Output path for dumping. """ + + from aiida.cmdline.utils.ascii_vis import format_call_graph + from aiida.cmdline.utils.common import ( + get_calcjob_report, + get_node_info, + get_process_function_report, + get_workchain_report, + ) + _readme_string = textwrap.dedent( f"""\ This directory contains the files involved in the simulation/workflow `{process_node.process_label} <{process_node.pk}>` run with AiiDA. @@ -138,7 +147,26 @@ def make_dump_readme(process_node: ProcessNode, output_path: Path): node data for further inspection.""" # noqa: E501 ) - # TODO: Add outputs of `verdi process (status|report|show?)` + # `verdi process status` + process_status = format_call_graph(calc_node=process_node, max_depth=None, call_link_label=True) + _readme_string += f'\n\nOutput of `verdi process status`\n\n{process_status}' + + # `verdi process report` + # Copied over from `cmd_process` + if isinstance(process_node, CalcJobNode): + process_report = get_calcjob_report(process_node) + elif isinstance(process_node, WorkChainNode): + process_report = get_workchain_report(process_node, levelname='REPORT', indent_size=2, max_depth=None) + elif isinstance(process_node, (CalcFunctionNode, WorkFunctionNode)): + process_report = get_process_function_report(process_node) + else: + process_report = f'Nothing to show for node type {process_node.__class__}' + + _readme_string += f'\n\nOutput of `verdi process report`\n\n{process_report}' + + # `verdi process show`? + process_show = get_node_info(node=process_node) + _readme_string += f'\n\nOutput of `verdi process show`\n\n{process_show}' (output_path / 'README').write_text(_readme_string) @@ -162,9 +190,7 @@ def generate_default_dump_path(process_node: WorkChainNode | CalcJobNode | CalcF # ? 
Could move this to `cmdline/utils`
 def validate_make_dump_path(
-    path: Path,
-    overwrite: bool = False,
-    safeguard_file: str = '.aiida_node_metadata.yaml'
+    path: Path, overwrite: bool = False, safeguard_file: str = '.aiida_node_metadata.yaml'
 ) -> Path:
     """
     Create default dumping directory for a given process node and return it as absolute path.
@@ -355,7 +381,6 @@ def process_dump(
     # Recursive call for WorkChainNode
     # todo: Rather than checking for both, I could check for subclass of WorkFlowNode
     elif isinstance(process_node, (WorkChainNode, WorkFunctionNode)):
-
         called_links = process_node.base.links.get_outgoing(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all()

         # If multiple CalcJobs contained in Workchain flat=True doesn't make sense as files would be overwritten
@@ -373,7 +398,7 @@ def process_dump(
             if not flat:
                 child_label = generate_node_input_label(index=index, link_triple=link_triple, flat=flat)
             else:
-                #test
+                # test
                 child_label = generate_node_input_label(index=index, link_triple=link_triple, flat=flat)
                 # child_label = ''
             child_output_path = output_path.resolve() / child_label

From 5dac992a2a19129a81a10db325fbf6876a03fd15 Mon Sep 17 00:00:00 2001
From: Julian Geiger
Date: Thu, 25 Apr 2024 10:26:06 +0200
Subject: [PATCH 12/30] Naming: `dump` -> `process_dump` in `cmd_process`

To be consistent with other commands in `cmd_process`. Accordingly, rename
`process_dump` and `calcjob_dump` in `processes.py` to `process_node_dump`
and `calcjob_node_dump`.
---
 src/aiida/cmdline/commands/cmd_process.py |  8 +++---
 src/aiida/tools/dumping/processes.py      | 12 ++++-----
 tests/tools/dumping/test_processes.py     | 32 +++++++++++------------
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py
index 5a5af7426a..1b1a58a41c 100644
--- a/src/aiida/cmdline/commands/cmd_process.py
+++ b/src/aiida/cmdline/commands/cmd_process.py
@@ -491,7 +491,7 @@ def process_repair(manager, broker, dry_run):
 @options.INCLUDE_EXTRAS()
 @options.OVERWRITE()
 @click.option('-f', '--flat', 'flat', is_flag=True, default=False, help='Dump all the files in one location.')
-def dump(
+def process_dump(
     process,
     path,
     include_inputs,
@@ -519,7 +519,7 @@ def dump(
         ProcessNodeYamlDumper,
         generate_default_dump_path,
         make_dump_readme,
-        process_dump,
+        process_node_dump,
         validate_make_dump_path,
     )

@@ -536,7 +536,7 @@ def dump(
     processnode_dumper = ProcessNodeYamlDumper(include_attributes=include_attributes, include_extras=include_extras)

     try:
-        process_dump(
+        process_node_dump(
             process_node=process,
             output_path=output_path,
             include_inputs=include_inputs,
@@ -554,7 +554,7 @@ def dump(
     #     # raise
     #     # echo.echo_critical('Some files present in the dumping directory.
Delete manually and try again.') except NotImplementedError: - echo.echo_critical('flat dumping not supported for `WorkChains` that call more than one `CalcJob`.') + echo.echo_critical('flat dumping not supported for `WorkChain`s that call more than one `CalcJob`.') except Exception as e: echo.echo_critical(f'Unexpected error ({e!s}) while dumping {process.__class__.__name__} <{process.pk}>.') diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 1a8bbbf971..94eb185385 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -268,7 +268,7 @@ def generate_node_input_label(index: int, link_triple: LinkTriple, flat: bool = return node_label -def calcjob_dump( +def calcjob_node_dump( calcjob_node: CalcJobNode | CalcFunctionNode, output_path: Path | None, include_inputs: bool = True, @@ -299,7 +299,7 @@ def calcjob_dump( if flat and io_dump_paths is None: io_dump_paths = ['', '', ''] - _LOGGER.info('Flat set to True and no `io_dump_paths`. Dump in a flat directory, files might be overwritten.') + _LOGGER.info('Flat set to True and no `io_dump_paths`. Dumping in a flat directory, files might be overwritten.') # raise ValueError('Flat set to False but no io_dump_paths provided.') # -> Can still provide paths but use flat=True to not flatten the node_inputs -> Probably this is bad design... elif flat and io_dump_paths is not None: @@ -331,7 +331,7 @@ def calcjob_dump( node_dumper.dump_yaml(process_node=calcjob_node, output_path=output_path) -def process_dump( +def process_node_dump( process_node: WorkChainNode | CalcJobNode | WorkFunctionNode | CalcFunctionNode, output_path: Path | None, include_inputs: bool = True, @@ -369,7 +369,7 @@ def process_dump( # `process_dump` function called by `verdi`, then I need to dump for a `CalcJob` here, as # well. 
Also, if I want to be able to use `process_dump` via the Python API if isinstance(process_node, (CalcFunctionNode, CalcJobNode)): - calcjob_dump( + calcjob_node_dump( calcjob_node=process_node, output_path=output_path, include_inputs=include_inputs, @@ -406,7 +406,7 @@ def process_dump( # Recursive function call for `WorkChainNode`` # Not sure if the next two cases work for `WorkFunction` and `CalcFuncion``Node`s if isinstance(child_node, (WorkChainNode, WorkFunctionNode)): - process_dump( + process_node_dump( process_node=child_node, output_path=child_output_path, include_inputs=include_inputs, @@ -416,7 +416,7 @@ def process_dump( ) elif isinstance(child_node, (CalcJobNode, CalcFunctionNode)): - calcjob_dump( + calcjob_node_dump( calcjob_node=child_node, output_path=child_output_path, include_inputs=include_inputs, diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index 151327083f..e481e8d081 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -19,11 +19,11 @@ import pytest from aiida.tools.dumping.processes import ( - calcjob_dump, + calcjob_node_dump, calcjob_node_inputs_dump, generate_default_dump_path, generate_node_input_label, - process_dump, + process_node_dump, validate_make_dump_path, ) @@ -139,7 +139,7 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): # └── raw_outputs # └── file.txt - calcjob_dump(calcjob_node=calcjob_node, output_path=dump_parent_path) + calcjob_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path) assert (dump_parent_path / default_dump_paths[0] / filename).is_file() assert (dump_parent_path / default_dump_paths[1] / filename).is_file() @@ -162,7 +162,7 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): # └── raw_outputs_ # └── file.txt - calcjob_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths) + calcjob_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths) assert (dump_parent_path / custom_dump_paths[0] / filename).is_file() # raw_inputs assert (dump_parent_path / custom_dump_paths[1] / filename).is_file() # raw_outputs # node_inputs, singlefile @@ -177,7 +177,7 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): # └── relative_path # └── file.txt - calcjob_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, flat=True) + calcjob_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, flat=True) assert not (dump_parent_path / default_dump_paths[0] / filename).is_file() # raw_inputs assert not (dump_parent_path / default_dump_paths[1] / filename).is_file() # raw_outputs assert not (dump_parent_path / default_dump_paths[2] / filename).is_file() # node_inputs, singlefile @@ -202,7 +202,7 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): # todo: Test case of splitting the nested node_inputs based on double-underscore splitting not covered with the test # todo: setup. This might be again too specific for QE? 
- calcjob_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths, flat=True) + calcjob_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths, flat=True) assert (dump_parent_path / custom_dump_paths[0] / filename).is_file() # raw_inputs assert (dump_parent_path / custom_dump_paths[1] / filename).is_file() # raw_outputs assert (dump_parent_path / custom_dump_paths[2] / filename).is_file() # node_inputs, singlefile @@ -210,20 +210,20 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): clean_tmp_path(tmp_path=tmp_path) # Don't dump the connected node inputs for both, flat is True/False - calcjob_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, include_inputs=False) + calcjob_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, include_inputs=False) assert not (dump_parent_path / custom_dump_paths[2]).is_dir() clean_tmp_path(tmp_path=tmp_path) - calcjob_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, include_inputs=False, flat=True) + calcjob_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, include_inputs=False, flat=True) assert not (dump_parent_path / custom_dump_paths[2]).is_dir() clean_tmp_path(tmp_path=tmp_path) # Check that it fails when it tries to create the same directory without overwrite=True - calcjob_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, overwrite=False) + calcjob_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, overwrite=False) with pytest.raises(FileExistsError): - calcjob_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, overwrite=False) + calcjob_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, overwrite=False) def test_calcjob_dump_arithmetic_add(tmp_path, aiida_localhost, generate_arithmetic_add_node): @@ -233,7 +233,7 @@ def test_calcjob_dump_arithmetic_add(tmp_path, aiida_localhost, generate_arithme add_node = generate_arithmetic_add_node(computer=aiida_localhost) # Normal dumping of ArithmeticAddCalculation node - calcjob_dump(calcjob_node=add_node, output_path=dump_path) + calcjob_node_dump(calcjob_node=add_node, output_path=dump_path) raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] @@ -283,7 +283,7 @@ def test_process_dump_io(generate_calcjob_node_io, generate_workchain_node_io, t expected_files = [raw_input_path, singlefiledata_path, folderdata_path, *node_metadata_paths] expected_files = [dump_parent_path / expected_file for expected_file in expected_files] - process_dump(process_node=wc_node, output_path=dump_parent_path) + process_node_dump(process_node=wc_node, output_path=dump_parent_path) assert all([expected_file.is_file() for expected_file in expected_files]) @@ -297,7 +297,7 @@ def test_process_dump_io(generate_calcjob_node_io, generate_workchain_node_io, t # └── relative_path # └── file.txt - process_dump(process_node=wc_node, output_path=dump_parent_path, flat=True) + process_node_dump(process_node=wc_node, output_path=dump_parent_path, flat=True) assert (dump_parent_path / filename).is_file() # Internal hierarchy of the FolderData is retained assert (dump_parent_path / folderdata_internal_path / filename).is_file() @@ -308,7 +308,7 @@ def test_process_dump_io(generate_calcjob_node_io, generate_workchain_node_io, t cj_nodes = [generate_calcjob_node_io(attach_outputs=False), 
generate_calcjob_node_io(attach_outputs=False)] wc_node = generate_workchain_node_io(cj_nodes=cj_nodes) with pytest.raises(NotImplementedError): - process_dump(process_node=wc_node, output_path=dump_parent_path, flat=True) + process_node_dump(process_node=wc_node, output_path=dump_parent_path, flat=True) def test_process_dump_multiply_add(tmp_path, generate_multiply_add_node, aiida_localhost): @@ -332,7 +332,7 @@ def test_process_dump_multiply_add(tmp_path, generate_multiply_add_node, aiida_l # ├── aiida.out #!└── source_file missing - process_dump(process_node=multiply_add_node, output_path=dump_parent_path, flat=True) + process_node_dump(process_node=multiply_add_node, output_path=dump_parent_path, flat=True) raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] # raw_input_files += ['source_file'] @@ -370,7 +370,7 @@ def test_process_dump_multiply_add(tmp_path, generate_multiply_add_node, aiida_l # ├── _scheduler-stdout.txt # └── aiida.out - process_dump(process_node=multiply_add_node, output_path=dump_parent_path) + process_node_dump(process_node=multiply_add_node, output_path=dump_parent_path) raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] From 9b0877d8a48e7f55a6d917692c35d558cb5d35db Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 25 Apr 2024 11:03:20 +0200 Subject: [PATCH 13/30] Moved logic for calcjob_io_paths to own function Now in function `generate_calcjob_io_dump_paths`. Takes care of handling the `flat` argument and the naming of the `raw_inputs`, `raw_outputs`, and `node_inputs` subdirectories. --- src/aiida/cmdline/commands/cmd_process.py | 4 +- src/aiida/tools/dumping/processes.py | 60 +++++++++++++---------- tests/cmdline/commands/test_process.py | 22 +++++++++ 3 files changed, 59 insertions(+), 27 deletions(-) diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index 1b1a58a41c..0f7cfb1998 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -518,7 +518,7 @@ def process_dump( from aiida.tools.dumping.processes import ( ProcessNodeYamlDumper, generate_default_dump_path, - make_dump_readme, + generate_dump_readme, process_node_dump, validate_make_dump_path, ) @@ -561,4 +561,4 @@ def process_dump( # Create README in parent directory # Done after dumping, so that dumping directory is there. Dumping directory is created within the calcjob_dump and # process_dump files such that they can also be used from within the Python API, not just via verdi - make_dump_readme(output_path=output_path, process_node=process) + generate_dump_readme(output_path=output_path, process_node=process) diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 94eb185385..beed798726 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -114,7 +114,7 @@ def dump_yaml( # Utility functions -def make_dump_readme(process_node: ProcessNode, output_path: Path): +def generate_dump_readme(process_node: ProcessNode, output_path: Path): """Generate README file in main dumping directory. :param process_node: CalcJob or WorkChain Node. 
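
As an illustration (a sketch, not part of the diff itself), the helper added in the next hunk is
expected to resolve the three dump subdirectory names from its `flat` and `calcjob_io_dump_paths`
arguments as follows:

    from aiida.tools.dumping.processes import generate_calcjob_io_dump_paths

    # Default, nested layout
    assert generate_calcjob_io_dump_paths() == ['raw_inputs', 'raw_outputs', 'node_inputs']

    # Flat layout: all three categories share the top-level dumping directory
    assert generate_calcjob_io_dump_paths(flat=True) == ['', '', '']

    # Custom names are passed through as given
    assert generate_calcjob_io_dump_paths(['inputs', 'outputs', 'nodes']) == ['inputs', 'outputs', 'nodes']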
@@ -268,6 +268,30 @@ def generate_node_input_label(index: int, link_triple: LinkTriple, flat: bool =
     return node_label


+def generate_calcjob_io_dump_paths(calcjob_io_dump_paths: list | None = None, flat: bool = False):
+    default_calcjob_io_dump_paths = ['raw_inputs', 'raw_outputs', 'node_inputs']
+
+    if flat and calcjob_io_dump_paths is None:
+        calcjob_io_dump_paths = ['', '', '']
+        _LOGGER.info(
+            'Flat set to True and no `io_dump_paths`. Dumping in a flat directory, files might be overwritten.'
+        )
+    elif flat and calcjob_io_dump_paths is not None:
+        _LOGGER.info('Flat set to True but `io_dump_paths` provided. These will be used, but `node_inputs` not nested.')
+    elif not flat and calcjob_io_dump_paths is None:
+        _LOGGER.info(
+            f'Flat set to False but no `io_dump_paths` provided. Will use the defaults {default_calcjob_io_dump_paths}.'
+        )
+        calcjob_io_dump_paths = default_calcjob_io_dump_paths
+    # elif not flat and calcjob_io_dump_paths is not None:
+    else:
+        _LOGGER.info(
+            'Flat set to False and `io_dump_paths` provided. These will be used, with `node_inputs` nested.'
+        )
+
+    return calcjob_io_dump_paths
+
+
 def calcjob_node_dump(
     calcjob_node: CalcJobNode | CalcFunctionNode,
     output_path: Path | None,
@@ -295,24 +319,7 @@ def calcjob_node_dump(
         # raise same exception here to communicate it outwards
         raise

-    default_io_dump_paths = ['raw_inputs', 'raw_outputs', 'node_inputs']
-
-    if flat and io_dump_paths is None:
-        io_dump_paths = ['', '', '']
-        _LOGGER.info('Flat set to True and no `io_dump_paths`. Dumping in a flat directory, files might be overwritten.')
-        # raise ValueError('Flat set to False but no io_dump_paths provided.')
-        # -> Can still provide paths but use flat=True to not flatten the node_inputs -> Probably this is bad design...
-    elif flat and io_dump_paths is not None:
-        _LOGGER.info('Flat set to True but `io_dump_paths` provided. These will be used, but `node_inputs` not nested.')
-    elif not flat and io_dump_paths is None:
-        _LOGGER.info(
-            f'Flat set to False but no `io_dump_paths` provided. Will use the defaults: {default_io_dump_paths}.'
-        )
-        io_dump_paths = default_io_dump_paths
-    elif not flat and io_dump_paths is not None:
-        _LOGGER.info(
-            'Flat set to False but no `io_dump_paths` provided. These will be used, but `node_inputs` flattened.'
- ) + io_dump_paths = generate_calcjob_io_dump_paths(calcjob_io_dump_paths=io_dump_paths, flat=flat) calcjob_node.base.repository.copy_tree(output_path.resolve() / io_dump_paths[0]) @@ -356,7 +363,10 @@ def process_node_dump( if output_path is None: output_path = generate_default_dump_path(process_node=process_node) - validate_make_dump_path(path=output_path, overwrite=overwrite) + try: + validate_make_dump_path(path=output_path, overwrite=overwrite) + except: + raise # This will eventually be replaced once pydantic backend PR merged if node_dumper is None: @@ -378,8 +388,6 @@ def process_node_dump( flat=flat, ) - # Recursive call for WorkChainNode - # todo: Rather than checking for both, I could check for subclass of WorkFlowNode elif isinstance(process_node, (WorkChainNode, WorkFunctionNode)): called_links = process_node.base.links.get_outgoing(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all() @@ -398,9 +406,8 @@ def process_node_dump( if not flat: child_label = generate_node_input_label(index=index, link_triple=link_triple, flat=flat) else: - # test - child_label = generate_node_input_label(index=index, link_triple=link_triple, flat=flat) - # child_label = '' + child_label = '' + child_output_path = output_path.resolve() / child_label # Recursive function call for `WorkChainNode`` @@ -425,6 +432,9 @@ def process_node_dump( flat=flat, ) + # todo: Add checks for `CalcFunctionNode` and `WorkFunctionNode` here specifically and implement the + # respective duming functions + # Separate functions for CalcJob dumping using pre_submit, as well as for the node_inputs def calcjob_node_inputs_dump(calcjob_node: CalcJobNode | CalcFunctionNode, output_path: Path, flat: bool = False): diff --git a/tests/cmdline/commands/test_process.py b/tests/cmdline/commands/test_process.py index 2b37750c67..0a7bd65f1d 100644 --- a/tests/cmdline/commands/test_process.py +++ b/tests/cmdline/commands/test_process.py @@ -336,6 +336,28 @@ def test_report(self, run_cli_command): assert len(result.output_lines) == 1, result.output_lines assert result.output_lines[0] == 'No log messages recorded for this entry' + def test_process_dump(self, run_cli_command): + """Test verdi process dump""" + node = WorkflowNode().store() + # node.set_process_state(ProcessState.RUNNING) + + # # Running without identifiers should not except and not print anything + # options = [] + # result = run_cli_command(cmd_process.process_status, options) + # assert result.exception is None, result.output + # assert len(result.output_lines) == 0 + + # # Giving a single identifier should print a non empty string message + # options = [str(node.pk)] + # result = run_cli_command(cmd_process.process_status, options) + # assert result.exception is None, result.output + # assert len(result.output_lines) > 0 + + # # With max depth 0, the output should be empty + # options = ['--max-depth', 0, str(node.pk)] + # result = run_cli_command(cmd_process.process_status, options) + # assert result.exception is None, result.output + # assert len(result.output_lines) == 0 @pytest.mark.usefixtures('aiida_profile_clean') @pytest.mark.parametrize('numprocesses, percentage', ((0, 100), (1, 90))) From af4a03e2f8bf4e5802bbeeeddc8a74701cde99dd Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Fri, 26 Apr 2024 14:54:06 +0200 Subject: [PATCH 14/30] test Changed `--flat` option to still create subdirectories for the individual steps of the WorkChain. Instead, just the subdirectories per CalcJob are removed. 
Generalized the dumping of outputs that it doesn't only dump `retrieved` -> With this, it's dumping a whole range of aiida nodes, basically all the parsed outputs, which are mainly numpy arrays dumped as `.npy` files. Add an option to enable this, as it might not be necessary to dump all of those. Currently, I just defined a global variable in the file, but this will eventually become a class attribute of the ProcessDumper class. --- src/aiida/cmdline/commands/cmd_process.py | 76 +++++------ src/aiida/tools/dumping/processes.py | 149 ++++++++++++++-------- tests/conftest.py | 2 +- tests/tools/dumping/test_processes.py | 28 ++-- 4 files changed, 147 insertions(+), 108 deletions(-) diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index 0f7cfb1998..940ecd4ffb 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -484,7 +484,7 @@ def process_repair(manager, broker, dry_run): @verdi_process.command('dump') -@arguments.PROCESS() +@arguments.PROCESSES() @options.PATH() @options.INCLUDE_INPUTS() @options.INCLUDE_ATTRIBUTES() @@ -492,7 +492,7 @@ def process_repair(manager, broker, dry_run): @options.OVERWRITE() @click.option('-f', '--flat', 'flat', is_flag=True, default=False, help='Dump all the files in one location.') def process_dump( - process, + processes, path, include_inputs, include_attributes, @@ -515,6 +515,7 @@ def process_dump( node data for further inspection. """ + from aiida.tools.dumping.processes import ( ProcessNodeYamlDumper, generate_default_dump_path, @@ -523,42 +524,43 @@ def process_dump( validate_make_dump_path, ) - # Generate default parent folder - if path is None: - output_path = generate_default_dump_path(process_node=process) - - # Capture `FileExistsError` here already, not by trying to run the dumping - try: - validate_make_dump_path(path=output_path, overwrite=overwrite) - except FileExistsError: - echo.echo_critical(f'Path `{output_path}` already exists and overwrite set to False.') + for process in processes: - processnode_dumper = ProcessNodeYamlDumper(include_attributes=include_attributes, include_extras=include_extras) + # Generate default parent folder + if path is None: + output_path = generate_default_dump_path(process_node=process) - try: - process_node_dump( - process_node=process, - output_path=output_path, - include_inputs=include_inputs, - node_dumper=processnode_dumper, - overwrite=overwrite, - flat=flat, - ) + # Capture `FileExistsError` here already, not by trying to run the dumping + try: + validate_make_dump_path(path=output_path, overwrite=overwrite) + except FileExistsError: + echo.echo_critical(f'Path `{output_path}` already exists and overwrite set to False.') - echo.echo_success( - f'Raw files for {process.__class__.__name__} <{process.pk}> dumped successfully in `{output_path}`.' - ) + processnode_dumper = ProcessNodeYamlDumper(include_attributes=include_attributes, include_extras=include_extras) - # ? Which exceptions do I expect here? - # except FileExistsError: - # # raise - # echo.echo_critical('Some files present in the dumping directory. Delete manually and try again.') - except NotImplementedError: - echo.echo_critical('flat dumping not supported for `WorkChain`s that call more than one `CalcJob`.') - except Exception as e: - echo.echo_critical(f'Unexpected error ({e!s}) while dumping {process.__class__.__name__} <{process.pk}>.') - - # Create README in parent directory - # Done after dumping, so that dumping directory is there. 
Dumping directory is created within the calcjob_dump and - # process_dump files such that they can also be used from within the Python API, not just via verdi - generate_dump_readme(output_path=output_path, process_node=process) + try: + process_node_dump( + process_node=process, + output_path=output_path, + include_inputs=include_inputs, + node_dumper=processnode_dumper, + overwrite=overwrite, + flat=flat, + ) + + echo.echo_success( + f'Raw files for {process.__class__.__name__} <{process.pk}> dumped successfully in `{output_path}`.' + ) + + # ? Which exceptions do I expect here? + except FileExistsError: + echo.echo_critical('Some files present in the dumping directory. Delete manually and try again.') + except NotImplementedError: + echo.echo_critical('flat dumping not supported for `WorkChain`s that call more than one `CalcJob`.') + except Exception as e: + echo.echo_critical(f'Unexpected error ({e!s}) while dumping {process.__class__.__name__} <{process.pk}>.') + + # Create README in parent directory + # Done after dumping, so that dumping directory is there. Dumping directory is created within the calcjob_dump and + # process_dump files such that they can also be used from within the Python API, not just via verdi + generate_dump_readme(output_path=output_path, process_node=process) diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index beed798726..86e0f7996a 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -17,19 +17,28 @@ import yaml from aiida.common import LinkType -from aiida.common.exceptions import NotExistentAttributeError from aiida.orm import ( CalcFunctionNode, CalcJobNode, + CalculationNode, + FolderData, ProcessNode, + RemoteData, + SinglefileData, + UpfData, WorkChainNode, + WorkflowNode, WorkFunctionNode, ) from aiida.orm.utils import LinkTriple from aiida.repository import File _LOGGER = logging.getLogger(__name__) +FILE_NODES = (SinglefileData, FolderData, RemoteData, UpfData) +ALL_AIIDA_NODES = True +# todo: Add aiida_nodes as an optional input to cmd_process, and as a class attribute of the dumper that is passed +# todo: through, just like flat class ProcessNodeYamlDumper: """Utility class to dump selected `ProcessNode` properties and, optionally, attributes and extras to yaml.""" @@ -171,11 +180,11 @@ def generate_dump_readme(process_node: ProcessNode, output_path: Path): (output_path / 'README').write_text(_readme_string) -def generate_default_dump_path(process_node: WorkChainNode | CalcJobNode | CalcFunctionNode | WorkFunctionNode) -> Path: +def generate_default_dump_path(process_node: ProcessNode) -> Path: """Simple helper function to generate the default parent-dumping directory if none given. - This function is not called for the sub-calls of `calcjob_dump` or during the recursive `process_dump` as it just - creates the default parent folder for the dumping, if no name is given. + This function is not called for the sub-calls of `calcjob_node_dump` or during the recursive `process_dump` as it + just creates the default parent folder for the dumping, if no name is given. :param process_node: The `ProcessNode` for which the directory is created. :return: The created parent dump path. 
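
For illustration, a sketch mirroring the assertions in the tests (`add_node` is assumed to be a
terminated `ArithmeticAddCalculation` node):

    from aiida.tools.dumping.processes import generate_default_dump_path

    path = generate_default_dump_path(process_node=add_node)
    assert str(path) == f'dump-ArithmeticAddCalculation-{add_node.pk}'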
@@ -239,36 +248,33 @@ def generate_node_input_label(index: int, link_triple: LinkTriple, flat: bool =
     link_label = link_triple.link_label

     # Generate directories with naming scheme akin to `verdi process status`
-    if not flat:
-        # node_label = f'{index:02d}-{link_label}'
-        label_list = [f'{index:02d}', link_label]
+    # node_label = f'{index:02d}-{link_label}'
+    label_list = [f'{index:02d}', link_label]

-        try:
-            process_label = node.process_label
-            if process_label is not None and process_label != link_label:
-                label_list += [process_label]
-                # node_label += f'-{process_label}'
-
-        except AttributeError:
-            process_type = node.process_type
-            if process_type is not None and process_type != link_label:
-                label_list += [process_type]
-                # node_label += f'-{process_type}'
+    try:
+        process_label = node.process_label
+        if process_label is not None and process_label != link_label:
+            label_list += [process_label]
+            # node_label += f'-{process_label}'

-    else:
-        label_list = []
+    except AttributeError:
+        process_type = node.process_type
+        if process_type is not None and process_type != link_label:
+            label_list += [process_type]
+            # node_label += f'-{process_type}'

     if isinstance(node, File):
         label_list += [node.name]

     node_label = '-'.join(label_list)
-    # `CALL-` as part of the link labels also for MultiplyAddWorkChain, so remove for generality
+    # `CALL-` as part of the link labels also for MultiplyAddWorkChain -> Seems general enough, so remove
     node_label = node_label.replace('CALL-', '')

     return node_label


 def generate_calcjob_io_dump_paths(calcjob_io_dump_paths: list | None = None, flat: bool = False):
+
     default_calcjob_io_dump_paths = ['raw_inputs', 'raw_outputs', 'node_inputs']

     if flat and calcjob_io_dump_paths is None:
@@ -278,6 +284,7 @@ def generate_calcjob_io_dump_paths(calcjob_io_dump_paths: list | None = None, fl
         )
     elif flat and calcjob_io_dump_paths is not None:
         _LOGGER.info('Flat set to True but `io_dump_paths` provided. These will be used, but `node_inputs` not nested.')
+        calcjob_io_dump_paths = default_calcjob_io_dump_paths
     elif not flat and calcjob_io_dump_paths is None:
         _LOGGER.info(
             f'Flat set to False but no `io_dump_paths` provided. Will use the defaults {default_calcjob_io_dump_paths}.'
         )
         calcjob_io_dump_paths = default_calcjob_io_dump_paths
     # elif not flat and calcjob_io_dump_paths is not None:
     else:
         _LOGGER.info(
             'Flat set to False and `io_dump_paths` provided. These will be used, with `node_inputs` nested.'
) + calcjob_io_dump_paths = ['', '', ''] return calcjob_io_dump_paths -def calcjob_node_dump( - calcjob_node: CalcJobNode | CalcFunctionNode, +def calculation_node_dump( + calcjob_node: CalculationNode, output_path: Path | None, include_inputs: bool = True, node_dumper: ProcessNodeYamlDumper | None = None, @@ -321,16 +329,19 @@ def calcjob_node_dump( io_dump_paths = generate_calcjob_io_dump_paths(calcjob_io_dump_paths=io_dump_paths, flat=flat) + # These are the raw_inputs calcjob_node.base.repository.copy_tree(output_path.resolve() / io_dump_paths[0]) - try: - calcjob_node.outputs.retrieved.copy_tree(output_path.resolve() / io_dump_paths[1]) - except NotExistentAttributeError: - # Might not have an output with link label `retrieved` - pass - + # These are the node_inputs if include_inputs: - calcjob_node_inputs_dump(calcjob_node=calcjob_node, output_path=output_path / io_dump_paths[2], flat=flat) + calculation_node_inputs_dump(calculation_node=calcjob_node, output_path=output_path / io_dump_paths[2], flat=flat) + + output_nodes = [calcjob_node.outputs[output] for output in calcjob_node.outputs] + for output_node in output_nodes: + if isinstance(output_node, FILE_NODES): + output_node.base.repository.copy_tree(output_path.resolve() / io_dump_paths[1]) + elif ALL_AIIDA_NODES: + output_node.base.repository.copy_tree(output_path.resolve() / io_dump_paths[1] / '.aiida_nodes') # This will eventually be replaced once pydantic backend PR merged if node_dumper is None: @@ -339,7 +350,7 @@ def calcjob_node_dump( def process_node_dump( - process_node: WorkChainNode | CalcJobNode | WorkFunctionNode | CalcFunctionNode, + process_node: ProcessNode, output_path: Path | None, include_inputs: bool = True, node_dumper: ProcessNodeYamlDumper | None = None, @@ -379,7 +390,7 @@ def process_node_dump( # `process_dump` function called by `verdi`, then I need to dump for a `CalcJob` here, as # well. 
Also, if I want to be able to use `process_dump` via the Python API if isinstance(process_node, (CalcFunctionNode, CalcJobNode)): - calcjob_node_dump( + calculation_node_dump( calcjob_node=process_node, output_path=output_path, include_inputs=include_inputs, @@ -388,7 +399,7 @@ def process_node_dump( flat=flat, ) - elif isinstance(process_node, (WorkChainNode, WorkFunctionNode)): + elif isinstance(process_node, WorkflowNode): called_links = process_node.base.links.get_outgoing(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all() # If multiple CalcJobs contained in Workchain flat=True doesn't make sense as files would be overwritten @@ -403,16 +414,16 @@ def process_node_dump( for index, link_triple in enumerate(sorted_called_links, start=1): child_node = link_triple.node - if not flat: - child_label = generate_node_input_label(index=index, link_triple=link_triple, flat=flat) - else: - child_label = '' + # if not flat: + child_label = generate_node_input_label(index=index, link_triple=link_triple, flat=flat) + # else: + # child_label = '' child_output_path = output_path.resolve() / child_label # Recursive function call for `WorkChainNode`` # Not sure if the next two cases work for `WorkFunction` and `CalcFuncion``Node`s - if isinstance(child_node, (WorkChainNode, WorkFunctionNode)): + if isinstance(child_node, WorkflowNode): process_node_dump( process_node=child_node, output_path=child_output_path, @@ -422,8 +433,8 @@ def process_node_dump( flat=flat, ) - elif isinstance(child_node, (CalcJobNode, CalcFunctionNode)): - calcjob_node_dump( + elif isinstance(child_node, CalculationNode): + calculation_node_dump( calcjob_node=child_node, output_path=child_output_path, include_inputs=include_inputs, @@ -432,28 +443,54 @@ def process_node_dump( flat=flat, ) - # todo: Add checks for `CalcFunctionNode` and `WorkFunctionNode` here specifically and implement the - # respective duming functions - -# Separate functions for CalcJob dumping using pre_submit, as well as for the node_inputs -def calcjob_node_inputs_dump(calcjob_node: CalcJobNode | CalcFunctionNode, output_path: Path, flat: bool = False): - """Dump inputs of a `CalcJobNode` of type `SinglefileData` and `FolderData`. +def calculation_node_inputs_dump(calculation_node: CalculationNode, output_path: Path, flat: bool = False): + """Dump inputs of a `CalcJobNode`. :param calcjob_node: The `CalcJobNode` whose inputs will be dumped. :param output_path: The path where the inputs will be dumped. + :param flat: Dump node inputs in a flat directory structure. """ - input_node_triples = calcjob_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC) + input_node_triples = calculation_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC) for input_node_triple in input_node_triples: # Select only repositories that actually hold objects - if len(input_node_triple.node.base.repository.list_objects()) > 0: - if not flat: - input_node_path = output_path / Path(*input_node_triple.link_label.split('__')) - else: - # Don't use link_label at all -> But, relative path inside FolderData is retained - # ! 
This is not the issue why `source_file` is not dumped for MultiplyAddWorkChain when flat=True
-                input_node_path = output_path
-
-            input_node_triple.node.base.repository.copy_tree(input_node_path.resolve())
+        # todo: Could make this a separate function
+        # Here, the check for repository could also serve a
+
+        input_node_path = generate_input_node_path(input_node_triple=input_node_triple, parent_path=output_path, flat=flat)
+
+        input_node_triple.node.base.repository.copy_tree(input_node_path.resolve())
+
+
+def generate_input_node_path(input_node_triple, parent_path, flat, exception_labels: list | None = None):
+
+    input_node = input_node_triple.node
+    link_label = input_node_triple.link_label
+
+    if exception_labels is None:
+        exception_labels = ['pseudos']
+
+    if len(input_node.base.repository.list_objects()) > 0:
+        aiida_nodes_subdir = ''
+    else:
+        # Empty repository, so it should be standard AiiDA data types, like Int, Float, etc.
+        aiida_nodes_subdir = '.aiida_nodes'
+
+    # ? The check if the link_label starts with pseudo is again very specific for the atomistic community/QE,
+    # ? however, I don't know how to otherwise avoid that it's put in `.aiida_nodes`, as the Node is defined as
+    # ? Data, not UpfData, so I cannot just check against FILE_NODES
+    if isinstance(input_node, FILE_NODES) or any(link_label.startswith(label) for label in exception_labels):
+        if not flat:
+            input_node_path = parent_path / Path(*link_label.split('__'))
+        else:
+            # Don't use link_label at all -> But, relative path inside FolderData is retained
+            input_node_path = parent_path
+    elif not flat:
+        input_node_path = parent_path / aiida_nodes_subdir / Path(*link_label.split('__'))
+    else:
+        # Don't use link_label at all -> But, relative path inside FolderData is retained
+        input_node_path = parent_path / aiida_nodes_subdir
+
+    return input_node_path.resolve()
diff --git a/tests/conftest.py b/tests/conftest.py
index 4efa836f6e..fc5dc16429 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -161,7 +161,7 @@ def generate_calculation_node():
     """Generate an instance of a `CalculationNode`."""
     from aiida.engine import ProcessState

-    def _generate_calculation_node(process_state=ProcessState.FINISHED, exit_status=None, entry_point=None):
+    def _generate_calculation_node(process_state=ProcessState.FINISHED, exit_status=None, entry_point=None, inputs: dict | None = None, outputs: dict | None = None, repository: str | Path | None = None):
         """Generate an instance of a `CalculationNode`..
:param process_state: state to set diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index e481e8d081..beccb7078d 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -19,8 +19,8 @@ import pytest from aiida.tools.dumping.processes import ( - calcjob_node_dump, - calcjob_node_inputs_dump, + calculation_node_dump, + calculation_node_inputs_dump, generate_default_dump_path, generate_node_input_label, process_node_dump, @@ -73,7 +73,7 @@ def test_calcjob_node_inputs_dump(tmp_path, generate_calcjob_node_io): # └── singlefile_input # └── file.txt - calcjob_node_inputs_dump(calcjob_node=calcjob_node, output_path=dump_parent_path) + calculation_node_inputs_dump(calculation_node=calcjob_node, output_path=dump_parent_path) assert (dump_parent_path / singlefiledata_linklabel).is_dir() assert (dump_parent_path / singlefiledata_linklabel / filename).is_file() assert (dump_parent_path / folderdata_path).is_dir() @@ -96,7 +96,7 @@ def test_calcjob_node_inputs_dump(tmp_path, generate_calcjob_node_io): # └── relative_path # └── file.txt - calcjob_node_inputs_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, flat=True) + calculation_node_inputs_dump(calculation_node=calcjob_node, output_path=dump_parent_path, flat=True) # Flat=True doesn't flatten nested directory structure of FolderData objects -> Leave relative path assert (dump_parent_path / folderdata_internal_path).is_dir() @@ -139,7 +139,7 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): # └── raw_outputs # └── file.txt - calcjob_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path) + calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path) assert (dump_parent_path / default_dump_paths[0] / filename).is_file() assert (dump_parent_path / default_dump_paths[1] / filename).is_file() @@ -162,7 +162,7 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): # └── raw_outputs_ # └── file.txt - calcjob_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths) + calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths) assert (dump_parent_path / custom_dump_paths[0] / filename).is_file() # raw_inputs assert (dump_parent_path / custom_dump_paths[1] / filename).is_file() # raw_outputs # node_inputs, singlefile @@ -177,7 +177,7 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): # └── relative_path # └── file.txt - calcjob_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, flat=True) + calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, flat=True) assert not (dump_parent_path / default_dump_paths[0] / filename).is_file() # raw_inputs assert not (dump_parent_path / default_dump_paths[1] / filename).is_file() # raw_outputs assert not (dump_parent_path / default_dump_paths[2] / filename).is_file() # node_inputs, singlefile @@ -202,7 +202,7 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): # todo: Test case of splitting the nested node_inputs based on double-underscore splitting not covered with the test # todo: setup. This might be again too specific for QE? 
- calcjob_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths, flat=True) + calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths, flat=True) assert (dump_parent_path / custom_dump_paths[0] / filename).is_file() # raw_inputs assert (dump_parent_path / custom_dump_paths[1] / filename).is_file() # raw_outputs assert (dump_parent_path / custom_dump_paths[2] / filename).is_file() # node_inputs, singlefile @@ -210,20 +210,20 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): clean_tmp_path(tmp_path=tmp_path) # Don't dump the connected node inputs for both, flat is True/False - calcjob_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, include_inputs=False) + calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, include_inputs=False) assert not (dump_parent_path / custom_dump_paths[2]).is_dir() clean_tmp_path(tmp_path=tmp_path) - calcjob_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, include_inputs=False, flat=True) + calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, include_inputs=False, flat=True) assert not (dump_parent_path / custom_dump_paths[2]).is_dir() clean_tmp_path(tmp_path=tmp_path) # Check that it fails when it tries to create the same directory without overwrite=True - calcjob_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, overwrite=False) + calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, overwrite=False) with pytest.raises(FileExistsError): - calcjob_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, overwrite=False) + calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, overwrite=False) def test_calcjob_dump_arithmetic_add(tmp_path, aiida_localhost, generate_arithmetic_add_node): @@ -233,7 +233,7 @@ def test_calcjob_dump_arithmetic_add(tmp_path, aiida_localhost, generate_arithme add_node = generate_arithmetic_add_node(computer=aiida_localhost) # Normal dumping of ArithmeticAddCalculation node - calcjob_node_dump(calcjob_node=add_node, output_path=dump_path) + calculation_node_dump(calcjob_node=add_node, output_path=dump_path) raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] @@ -335,7 +335,7 @@ def test_process_dump_multiply_add(tmp_path, generate_multiply_add_node, aiida_l process_node_dump(process_node=multiply_add_node, output_path=dump_parent_path, flat=True) raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] - # raw_input_files += ['source_file'] + raw_input_files += ['source_file'] raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] raw_input_files = [dump_parent_path / raw_input_file for raw_input_file in raw_input_files] raw_output_files = [dump_parent_path / raw_output_file for raw_output_file in raw_output_files] From a1930cb94e3b5f0a0c5c9b73a08c41e091703472 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Fri, 26 Apr 2024 20:55:56 +0200 Subject: [PATCH 15/30] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20First=20working=20OO?= =?UTF-8?q?P-version=20using=20`ProcessDumper`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To avoid having to pass all the arguments `include_node_inputs`, `include_attributes/extras`, `overwrite`, `flat`, 
`all_aiida_nodes` through the different functions, everything related to the dumping is now compiled in the `ProcessDumper` class, which defines the main entry-point method `dump`. For nested workflows, this is recursively called (as before). Once `CalculationFunction` nodes are reached, their content is dumped via `dump_calculation_node`. The helper functions to create and validate labels and paths of nested subdirectories are also methods of the `ProcessDumper`. Introduced the `parent_process` class attribute which is dynamically generated from the parent_node, and which is used to generate the main README, which is only created when the dumping is done via the `verdi` CLI. For the other functions, this concept does not make sense, due to the recursion, so the respective `process_node`s (which are changing during the recursion) are always passed as arguments. Next steps: - Update tests to actually test the new implementations - Update docstrings - Add section to `How to work with data` section of the docs - If the `OverridableOptions` are only used here, they can also just be defined as normal `click` options (however, we can also start thinking about the `verdi archive dump` functionality that we should start implementing soon) --- src/aiida/cmdline/commands/cmd_process.py | 52 +- src/aiida/cmdline/params/options/main.py | 4 +- src/aiida/tools/dumping/processes.py | 692 ++++++++++------------ tests/conftest.py | 9 +- tests/tools/dumping/test_processes.py | 20 +- 5 files changed, 370 insertions(+), 407 deletions(-) diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index 940ecd4ffb..2d2979dc5b 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -491,12 +491,14 @@ def process_repair(manager, broker, dry_run): @options.INCLUDE_EXTRAS() @options.OVERWRITE() @click.option('-f', '--flat', 'flat', is_flag=True, default=False, help='Dump all the files in one location.') +@click.option('-a', '--all-aiida-nodes', is_flag=True, default=False, help='Dump all non file-based AiiDA nodes.') def process_dump( processes, path, include_inputs, include_attributes, include_extras, + all_aiida_nodes, overwrite, flat, ) -> None: @@ -516,41 +518,34 @@ def process_dump( """ - from aiida.tools.dumping.processes import ( - ProcessNodeYamlDumper, - generate_default_dump_path, - generate_dump_readme, - process_node_dump, - validate_make_dump_path, - ) + from aiida.tools.dumping.processes import ProcessDumper for process in processes: # Generate default parent folder + process_dumper = ProcessDumper( + parent_process=process, + include_node_inputs=include_inputs, + include_attributes=include_attributes, + include_extras=include_extras, + all_aiida_nodes=all_aiida_nodes, + overwrite=overwrite, + flat=flat, + ) + if path is None: - output_path = generate_default_dump_path(process_node=process) + output_path = process_dumper.generate_default_dump_path(process_node=process) + else: + output_path = path.resolve() # Capture `FileExistsError` here already, not by trying to run the dumping try: - validate_make_dump_path(path=output_path, overwrite=overwrite) + process_dumper.dump_path_validate_make(validate_path=output_path) except FileExistsError: echo.echo_critical(f'Path `{output_path}` already exists and overwrite set to False.') - processnode_dumper = ProcessNodeYamlDumper(include_attributes=include_attributes, include_extras=include_extras) - try: - process_node_dump( - process_node=process, - 
output_path=output_path, - include_inputs=include_inputs, - node_dumper=processnode_dumper, - overwrite=overwrite, - flat=flat, - ) - - echo.echo_success( - f'Raw files for {process.__class__.__name__} <{process.pk}> dumped successfully in `{output_path}`.' - ) + process_dumper.dump(process_node=process, output_path=output_path) # ? Which exceptions do I expect here? except FileExistsError: @@ -560,7 +555,10 @@ def process_dump( except Exception as e: echo.echo_critical(f'Unexpected error ({e!s}) while dumping {process.__class__.__name__} <{process.pk}>.') - # Create README in parent directory - # Done after dumping, so that dumping directory is there. Dumping directory is created within the calcjob_dump and - # process_dump files such that they can also be used from within the Python API, not just via verdi - generate_dump_readme(output_path=output_path, process_node=process) + echo.echo_success( + f'Raw files for {process.__class__.__name__} <{process.pk}> dumped successfully in `{output_path}`.' + ) + + # Create README in parent directory. Do this at the end as to not cause exceptions for the path creation, and + # only do it when everything ran through fine before + process_dumper.generate_parent_readme() diff --git a/src/aiida/cmdline/params/options/main.py b/src/aiida/cmdline/params/options/main.py index 6b9fb4b516..14c27d0471 100644 --- a/src/aiida/cmdline/params/options/main.py +++ b/src/aiida/cmdline/params/options/main.py @@ -767,7 +767,7 @@ def set_log_level(_ctx, _param, value): ) INCLUDE_ATTRIBUTES = OverridableOption( - '-a', + # '-a', '--include-attributes', is_flag=True, default=False, @@ -776,7 +776,7 @@ def set_log_level(_ctx, _param, value): ) INCLUDE_EXTRAS = OverridableOption( - '-e', + # '-e', '--include-extras', is_flag=True, default=False, diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 86e0f7996a..444b2124c9 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -11,7 +11,6 @@ from __future__ import annotations import logging -import textwrap from pathlib import Path import yaml @@ -25,7 +24,6 @@ ProcessNode, RemoteData, SinglefileData, - UpfData, WorkChainNode, WorkflowNode, WorkFunctionNode, @@ -33,37 +31,151 @@ from aiida.orm.utils import LinkTriple from aiida.repository import File -_LOGGER = logging.getLogger(__name__) -FILE_NODES = (SinglefileData, FolderData, RemoteData, UpfData) -ALL_AIIDA_NODES = True +FILE_NODES = (SinglefileData, FolderData, RemoteData) -# todo: Add aiida_nodes as an optional input to cmd_process, and as a class attribute of the dumper that is passed -# todo: through, just like flat -class ProcessNodeYamlDumper: - """Utility class to dump selected `ProcessNode` properties and, optionally, attributes and extras to yaml.""" +class ProcessDumper: + def __init__( + self, + parent_process: ProcessNode, + include_node_inputs: bool = True, + include_attributes: bool = True, + include_extras: bool = True, + overwrite: bool = True, + flat: bool = False, + all_aiida_nodes: bool = False, + ) -> None: + self.parent_process = parent_process + self.include_node_inputs = include_node_inputs + self.include_attributes = include_attributes + self.include_extras = include_extras + self.overwrite = overwrite + self.flat = flat + self.all_aiida_nodes = all_aiida_nodes + + self.logger = logging.getLogger(__name__) - NODE_PROPERTIES = [ - 'label', - 'description', - 'pk', - 'uuid', - 'ctime', - 'mtime', - 'node_type', - 'process_type', - 'is_finished_ok', - ] + 
self.parent_path = self.generate_default_dump_path(process_node=self.parent_process)

-    USER_PROPERTIES = ('first_name', 'last_name', 'email', 'institution')
+    def dump(
+        self,
+        process_node,  # ? This changes, so it should not be defined as a class variable
+        output_path: Path | None,
+        io_dump_paths: list | None = None,
+    ):
+        """Dumps all data involved in a `WorkChainNode`, including its outgoing links.
+
+        Note that if an outgoing link is again a `WorkChainNode`, the function recursively calls itself, while files are
+        only actually created when a `CalcJobNode` is reached.
+
+        :param process_node: The parent process node to be dumped. It can be either a `WorkChainNode` or a `CalcJobNode`.
+        :param output_path: The main output path where the directory tree will be created.
+        :param include_inputs: If True, include file or folder inputs in the dump. Defaults to True.
+        :param node_dumper: The ProcessNodeYamlDumper instance to use for dumping node metadata. If not provided, a new
+            instance will be created. Defaults to None.
+        """

-    COMPUTER_PROPERTIES = ('label', 'hostname', 'scheduler_type', 'transport_type')
+        if output_path is None:
+            output_path = self.generate_default_dump_path(process_node=process_node)

-    def __init__(self, include_attributes: bool = True, include_extras: bool = True):
-        self.include_attributes = include_attributes
-        self.include_extras = include_extras
+        try:
+            self.dump_path_validate_make(validate_path=output_path)
+        except:
+            raise
+
+        # This seems a bit duplicated, but if the logic for checking the types should be contained in the recursive
+        # `dump` function called by `verdi`, then I need to dump for the `CalcFunction` here already, as well.
+        self.dump_node_yaml(process_node=process_node, output_path=output_path)
+        if isinstance(process_node, CalcFunctionNode):
+            self.dump_calculation_node(
+                calculation_node=process_node,
+                output_path=output_path,
+                include_inputs=self.include_node_inputs,
+                io_dump_paths=io_dump_paths,
+            )
+
+        elif isinstance(process_node, WorkflowNode):
+            called_links = process_node.base.links.get_outgoing(
+                link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)
+            ).all()
+
+            sorted_called_links = sorted(called_links, key=lambda link_triple: link_triple.node.ctime)
+
+            for index, link_triple in enumerate(sorted_called_links, start=1):
+                child_node = link_triple.node
+                child_label = self.generate_calcjob_input_node_label(index=index, link_triple=link_triple)
+                child_output_path = output_path.resolve() / child_label
+
+                # Recursive function call for `WorkflowNode`
+                if isinstance(child_node, WorkflowNode):
+                    self.dump(
+                        process_node=child_node,
+                        output_path=child_output_path,
+                    )
+
+                # Once a `CalculationNode` is reached as a child, dump it
+                elif isinstance(child_node, CalculationNode):
+                    self.dump_calculation_node(
+                        calculation_node=child_node,
+                        output_path=child_output_path,
+                    )
+
+    def dump_calculation_node(
+        self,
+        calculation_node: CalculationNode,
+        output_path: Path | None,
+        io_dump_paths: list | None = None,
+    ):
+        """
+        Dump the contents of a `CalculationNode` to a specified output path.
+
+        :param calculation_node: The `CalculationNode` to be dumped.
+        :param output_path: The path where the dumped contents will be stored.
+        :param io_dump_paths: Custom subdirectory names for the raw inputs, raw outputs, and node inputs.
+ :return: None + """ + + if output_path is None: + output_path = self.generate_default_dump_path(process_node=calculation_node) + + try: + self.dump_path_validate_make(validate_path=output_path) + except: + # raise same exception here to communicate it outwards + raise + + if io_dump_paths is None: + io_dump_paths = self.generate_calculation_io_dump_paths(calculation_io_dump_paths=io_dump_paths) + + # Dump the raw_inputs + # ? Rename this to node_repository or something -> Introduces AiiDA terminology.But as we provide the option to + # ? dump *all* the outputs, we should also provide the option to dump *all* the inputs, not just `node_inputs` + calculation_node.base.repository.copy_tree(output_path.resolve() / io_dump_paths[0]) + + # Dump the raw_outputs + output_nodes = [calculation_node.outputs[output] for output in calculation_node.outputs] + for output_node in output_nodes: + if isinstance(output_node, FILE_NODES): + output_node.base.repository.copy_tree(output_path.resolve() / io_dump_paths[1]) + elif self.all_aiida_nodes: + output_node.base.repository.copy_tree(output_path.resolve() / io_dump_paths[1] / '.aiida_nodes') + + # Dump the node_inputs + if self.include_node_inputs: + input_node_triples = calculation_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC) + + for input_node_triple in input_node_triples: + input_node_path = self.generate_calculation_input_node_path( + input_node_triple=input_node_triple, + parent_path=output_path / io_dump_paths[2], + ) + + # No .resolve() required as that done in `generate_calcjob_input_node_path` + input_node_triple.node.base.repository.copy_tree(input_node_path) - def dump_yaml( + self.dump_node_yaml(process_node=calculation_node, output_path=output_path) + + def dump_node_yaml( self, process_node: ProcessNode, output_path: Path, @@ -77,11 +189,27 @@ def dump_yaml( :return: None """ + _node_properties = [ + 'label', + 'description', + 'pk', + 'uuid', + 'ctime', + 'mtime', + 'node_type', + 'process_type', + 'is_finished_ok', + ] + + _user_properties = ('first_name', 'last_name', 'email', 'institution') + + _computer_properties = ('label', 'hostname', 'scheduler_type', 'transport_type') + node_dict = {} metadata_dict = {} # Add actual node `@property`s to dictionary - for metadata_property in self.NODE_PROPERTIES: + for metadata_property in _node_properties: metadata_dict[metadata_property] = getattr(process_node, metadata_property) node_dict['Node data'] = metadata_dict @@ -90,7 +218,7 @@ def dump_yaml( try: node_dbuser = process_node.user user_dict = {} - for user_property in self.USER_PROPERTIES: + for user_property in _user_properties: user_dict[user_property] = getattr(node_dbuser, user_property) node_dict['User data'] = user_dict except AttributeError: @@ -100,7 +228,7 @@ def dump_yaml( try: node_dbcomputer = process_node.computer computer_dict = {} - for computer_property in self.COMPUTER_PROPERTIES: + for computer_property in _computer_properties: computer_dict[computer_property] = getattr(node_dbcomputer, computer_property) node_dict['Computer data'] = computer_dict except AttributeError: @@ -121,376 +249,206 @@ def dump_yaml( with open(output_file, 'w') as handle: yaml.dump(node_dict, handle, sort_keys=False) + def dump_path_validate_make(self, validate_path: Path, safeguard_file: str = '.aiida_node_metadata.yaml') -> Path: + """ + Create default dumping directory for a given process node and return it as absolute path. 
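
As a usage sketch of the `ProcessDumper` interface introduced in this commit (a minimal example assuming a loaded
AiiDA profile; the node pk is hypothetical):

    from aiida import load_profile, orm
    from aiida.tools.dumping.processes import ProcessDumper

    load_profile()

    node = orm.load_node(1234)  # hypothetical pk of a WorkChainNode
    dumper = ProcessDumper(parent_process=node, include_node_inputs=True, flat=False)
    # `dump` recurses through called workflows; files are written once CalculationNodes are reached
    dumper.dump(process_node=node, output_path=dumper.parent_path)
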
-# Utility functions -def generate_dump_readme(process_node: ProcessNode, output_path: Path): - """Generate README file in main dumping directory. - - :param process_node: CalcJob or WorkChain Node. - :param output_path: Output path for dumping. - - """ - - from aiida.cmdline.utils.ascii_vis import format_call_graph - from aiida.cmdline.utils.common import ( - get_calcjob_report, - get_node_info, - get_process_function_report, - get_workchain_report, - ) - - _readme_string = textwrap.dedent( - f"""\ - This directory contains the files involved in the simulation/workflow `{process_node.process_label} <{process_node.pk}>` run with AiiDA. - - Child simulations/workflows (also called `CalcJob`s and `WorkChain`s in AiiDA jargon) run by the parent workflow are - contained in the directory tree as sub-folders and are sorted by their creation time. The directory tree thus - mirrors the logical execution of the workflow, which can also be queried by running `verdi process status - {process_node.pk}` on the command line. - - By default, input and output files of each simulation can be found in the corresponding "raw_inputs" and - "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution - settings). Additional input files (depending on the type of calculation) are placed in the "node_inputs". - - Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant AiiDA - node data for further inspection.""" # noqa: E501 - ) - - # `verdi process status` - process_status = format_call_graph(calc_node=process_node, max_depth=None, call_link_label=True) - _readme_string += f'\n\nOutput of `verdi process status`\n\n{process_status}' - - # `verdi process report` - # Copied over from `cmd_process` - if isinstance(process_node, CalcJobNode): - process_report = get_calcjob_report(process_node) - elif isinstance(process_node, WorkChainNode): - process_report = get_workchain_report(process_node, levelname='REPORT', indent_size=2, max_depth=None) - elif isinstance(process_node, (CalcFunctionNode, WorkFunctionNode)): - process_report = get_process_function_report(process_node) - else: - process_report = f'Nothing to show for node type {process_node.__class__}' - - _readme_string += f'\n\nOutput of `verdi process report`\n\n{process_report}' - - # `verdi process show`? - process_show = get_node_info(node=process_node) - _readme_string += f'\n\nOutput of `verdi process show`\n\n{process_show}' - - (output_path / 'README').write_text(_readme_string) - - -def generate_default_dump_path(process_node: ProcessNode) -> Path: - """Simple helper function to generate the default parent-dumping directory if none given. - - This function is not called for the sub-calls of `calcjob_node_dump` or during the recursive `process_dump` as it - just creates the default parent folder for the dumping, if no name is given. - - :param process_node: The `ProcessNode` for which the directory is created. - :return: The created parent dump path. - """ - - try: - return Path(f'dump-{process_node.process_label}-{process_node.pk}') - except AttributeError: - # ? This case came up during testing, not sure how relevant it actually is - return Path(f'dump-{process_node.process_type}-{process_node.pk}') - - -# ? 
Could move this to `cmdline/utils` -def validate_make_dump_path( - path: Path, overwrite: bool = False, safeguard_file: str = '.aiida_node_metadata.yaml' -) -> Path: - """ - Create default dumping directory for a given process node and return it as absolute path. - - :param path: The base path for the dump. Defaults to the current directory. - :return: The created dump path. - """ - import shutil + :param path: The base path for the dump. Defaults to the current directory. + :return: The created dump path. + """ + import shutil + + if validate_path.is_dir(): + # Existing, empty directory -> OK + if not any(validate_path.iterdir()): + pass + + # Existing, non-empty directory and overwrite False -> FileExistsError + elif not self.overwrite: + raise FileExistsError(f'Path `{validate_path}` already exists and overwrite set to False.') + + # Existing, non-empty directory and overwrite True + # Check for safeguard file ('.aiida_node_metadata.yaml') for safety + # If present -> Remove directory + elif (validate_path / safeguard_file).is_file(): + self.logger.info(f'Overwrite set to true, will overwrite directory `{validate_path}`.') + shutil.rmtree(validate_path) + + # Existing and non-empty directory and overwrite True + # Check for safeguard file ('.aiida_node_metadata.yaml') for safety + # If absent -> Don't remove directory as to not accidentally remove a wrong one + else: + raise FileExistsError( + f"Path `{validate_path}` already exists and doesn't contain safeguard file {safeguard_file}." + f'Not removing for safety reasons.' + ) - if path.is_dir(): - # Existing, but empty directory => OK - if not any(path.iterdir()): - pass + # Not included in if-else as to avoid having to repeat the `mkdir` call. + # `exist_ok=True` as checks implemented above + validate_path.mkdir(exist_ok=True, parents=True) - # Existing and non-empty directory and overwrite False => FileExistsError - elif not overwrite: - raise FileExistsError(f'Path `{path}` already exists and overwrite set to False.') + return validate_path.resolve() - # Existing and non-empty directory and overwrite True => Check for '.aiida_node_metadata.yaml' for safety - # '.aiida_node_metadata.yaml' present => Remove directory - elif (path / safeguard_file).is_file(): - _LOGGER.info(f'Overwrite set to true, will overwrite directory `{path}`.') - shutil.rmtree(path) - path.mkdir(parents=True, exist_ok=False) + def generate_calculation_io_dump_paths(self, calculation_io_dump_paths: list | None = None): + default_calculation_io_dump_paths = ['raw_inputs', 'raw_outputs', 'node_inputs'] + empty_calculation_io_dump_paths = ['', '', ''] - # Existing and non-empty directory and overwrite True => Check for safeguard_file (e.g. - # '.aiida_node_metadata.yaml') for safety reasons (don't wont to recursively delete wrong directory...) + if self.flat and calculation_io_dump_paths is None: + self.logger.info( + 'Flat set to True and no `io_dump_paths`. Dumping in a flat directory, files might be overwritten.' + ) + return empty_calculation_io_dump_paths + elif self.flat and calculation_io_dump_paths is not None: + self.logger.info( + 'Flat set to True but `io_dump_paths` provided. These will be used, but `node_inputs` not nested.' + ) + return default_calculation_io_dump_paths + elif not self.flat and calculation_io_dump_paths is None: + self.logger.info( + f'Flat set to False but no `io_dump_paths` provided. Will use the defaults {default_calculation_io_dump_paths}.' 
+ ) + return default_calculation_io_dump_paths else: - # _LOGGER.critical( - # f"`{path}` Path exists but no `.aiida_node_metadata.yaml` found. Won't delete for security.\n" - # # f'Manually remove existing `{path}` and dump again.' - # ) - raise FileExistsError( - f"Path `{path}` already exists and doesn't contain `.aiida_node_metadata.yaml`. Not removing for safety reasons." + self.logger.info( + 'Flat set to False but no `io_dump_paths` provided. These will be used, but `node_inputs` flattened.' ) + return empty_calculation_io_dump_paths - # Not included in else as to avoid having to repeat the `mkdir` call. `exist_ok=True` as checks implemented above - path.mkdir(exist_ok=True, parents=True) - - return path.resolve() - + def generate_calculation_input_node_path( + self, input_node_triple, parent_path, exception_labels: list | None = None + ) -> Path: + input_node = input_node_triple.node + link_label = input_node_triple.link_label -def generate_node_input_label(index: int, link_triple: LinkTriple, flat: bool = False) -> str: - """Small helper function to generate the directory label for node inputs.""" - node = link_triple.node - link_label = link_triple.link_label + if exception_labels is None: + exception_labels = ['pseudos'] - # Generate directories with naming scheme akin to `verdi process status` - # node_label = f'{index:02d}-{link_label}' - label_list = [f'{index:02d}', link_label] + if len(input_node.base.repository.list_objects()) > 0: + # Empty repository, so it should be standard AiiDA data types, like Int, Float, etc. + aiida_nodes_subdir = '' + else: + aiida_nodes_subdir = '.aiida_nodes' + + # ? The check if the link_label starts with pseudo is again very specific for the atomistic community/QE, + # ? however, I don't know how to otherwise avoid that it's put in `.aiida_nodes`, as the Node is defined as + # ? 
Data, not UpfData, so I cannot just check against FILE_NODES + if isinstance(input_node, FILE_NODES) or any(link_label.startswith(label) for label in exception_labels): + if not self.flat: + input_node_path = parent_path / Path(*link_label.split('__')) + else: + # Don't use link_label at all -> But, relative path inside FolderData is retained + input_node_path = parent_path + elif not self.flat: + input_node_path = parent_path / aiida_nodes_subdir / Path(*link_label.split('__')) + else: + # Don't use link_label at all -> But, relative path inside FolderData is retained + input_node_path = parent_path / aiida_nodes_subdir - try: - process_label = node.process_label - if process_label is not None and process_label != link_label: - label_list += [process_label] - # node_label += f'-{process_label}' + return input_node_path.resolve() - except AttributeError: - process_type = node.process_type - if process_type is not None and process_type != link_label: - label_list += [process_type] - # node_label += f'-{process_type}' + def generate_calcjob_input_node_label(self, index: int, link_triple: LinkTriple) -> str: + """Small helper function to generate the directory label for node inputs.""" + node = link_triple.node + link_label = link_triple.link_label - if isinstance(node, File): - label_list += [node.name] + # Generate directories with naming scheme akin to `verdi process status` + # node_label = f'{index:02d}-{link_label}' + label_list = [f'{index:02d}', link_label] - node_label = '-'.join(label_list) - # `CALL-` as part of the link labels also for MultiplyAddWorkChain -> Seems general enough, so remove - node_label = node_label.replace('CALL-', '') + try: + process_label = node.process_label + if process_label is not None and process_label != link_label: + label_list += [process_label] + # node_label += f'-{process_label}' - return node_label + except AttributeError: + process_type = node.process_type + if process_type is not None and process_type != link_label: + label_list += [process_type] + # node_label += f'-{process_type}' + if isinstance(node, File): + label_list += [node.name] -def generate_calcjob_io_dump_paths(calcjob_io_dump_paths: list | None = None, flat: bool = False): + node_label = '-'.join(label_list) + # `CALL-` as part of the link labels also for MultiplyAddWorkChain -> Seems general enough, so remove + node_label = node_label.replace('CALL-', '') - default_calcjob_io_dump_paths = ['raw_inputs', 'raw_outputs', 'node_inputs'] + return node_label - if flat and calcjob_io_dump_paths is None: - calcjob_io_dump_paths = ['', '', ''] - _LOGGER.info( - 'Flat set to True and no `io_dump_paths`. Dumping in a flat directory, files might be overwritten.' - ) - elif flat and calcjob_io_dump_paths is not None: - _LOGGER.info('Flat set to True but `io_dump_paths` provided. These will be used, but `node_inputs` not nested.') - calcjob_io_dump_paths = default_calcjob_io_dump_paths - elif not flat and calcjob_io_dump_paths is None: - _LOGGER.info( - f'Flat set to False but no `io_dump_paths` provided. Will use the defaults {default_calcjob_io_dump_paths}.' - ) - calcjob_io_dump_paths = default_calcjob_io_dump_paths - # elif not flat and calcjob_io_dump_paths is not None: - else: - _LOGGER.info( - 'Flat set to False but no `io_dump_paths` provided. These will be used, but `node_inputs` flattened.' 
- ) - calcjob_io_dump_paths = ['', '', ''] - - return calcjob_io_dump_paths - - -def calculation_node_dump( - calcjob_node: CalculationNode, - output_path: Path | None, - include_inputs: bool = True, - node_dumper: ProcessNodeYamlDumper | None = None, - overwrite: bool = True, - flat: bool = False, - io_dump_paths: list | None = None, -): - """ - Dump the contents of a CalcJobNode to a specified output path. - - :param calcjob_node: The CalcJobNode to be dumped. - :param output_path: The path where the dumped contents will be stored. - :param include_inputs: If True, do not dump the inputs of the CalcJobNode. - :return: None - """ - - if output_path is None: - output_path = generate_default_dump_path(process_node=calcjob_node) - - try: - validate_make_dump_path(path=output_path, overwrite=overwrite) - except: - # raise same exception here to communicate it outwards - raise - - io_dump_paths = generate_calcjob_io_dump_paths(calcjob_io_dump_paths=io_dump_paths, flat=flat) - - # These are the raw_inputs - calcjob_node.base.repository.copy_tree(output_path.resolve() / io_dump_paths[0]) - - # These are the node_inputs - if include_inputs: - calculation_node_inputs_dump(calculation_node=calcjob_node, output_path=output_path / io_dump_paths[2], flat=flat) - - output_nodes = [calcjob_node.outputs[output] for output in calcjob_node.outputs] - for output_node in output_nodes: - if isinstance(output_node, FILE_NODES): - output_node.base.repository.copy_tree(output_path.resolve() / io_dump_paths[1]) - elif ALL_AIIDA_NODES: - output_node.base.repository.copy_tree(output_path.resolve() / io_dump_paths[1] / '.aiida_nodes') + def generate_default_dump_path(self, process_node: ProcessNode) -> Path: + """Simple helper function to generate the default parent-dumping directory if none given. - # This will eventually be replaced once pydantic backend PR merged - if node_dumper is None: - node_dumper = ProcessNodeYamlDumper() - node_dumper.dump_yaml(process_node=calcjob_node, output_path=output_path) - - -def process_node_dump( - process_node: ProcessNode, - output_path: Path | None, - include_inputs: bool = True, - node_dumper: ProcessNodeYamlDumper | None = None, - overwrite: bool = True, - flat: bool = False, -): - """Dumps all data involved in a `WorkChainNode`, including its outgoing links. - - Note that if an outgoing link is again a `WorkChainNode`, the function recursively calls itself, while files are - only actually created when a `CalcJobNode` is reached. - - :param process_node: The parent process node to be dumped. It can be either a `WorkChainNode` or a `CalcJobNode`. - :param output_path: The main output path where the directory tree will be created. - :param include_inputs: If True, include file or folder inputs in the dump. Defaults to True. - :param node_dumper: The ProcessNodeYamlDumper instance to use for dumping node metadata. If not provided, a new - instance will be created. Defaults to None. 
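
Both the removed functions and their `ProcessDumper` replacements rely on the same repository primitive,
`node.base.repository.copy_tree`, which copies a node's file repository to a directory on disk. A self-contained
sketch, with an illustrative node and target directory:

    import io
    from pathlib import Path

    from aiida import load_profile, orm

    load_profile()

    # any stored node's file repository can be copied to a directory on disk
    node = orm.SinglefileData(io.BytesIO(b'file content'), filename='file.txt').store()
    node.base.repository.copy_tree(Path('copy-tree-example'))  # creates copy-tree-example/file.txt
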
- """ - - # Realized during testing: If no path provided, only for the sub-workchain an additional directory is created, but - # should also create one here, if the function is imported normally and used in Python scripts - if output_path is None: - output_path = generate_default_dump_path(process_node=process_node) - - try: - validate_make_dump_path(path=output_path, overwrite=overwrite) - except: - raise - - # This will eventually be replaced once pydantic backend PR merged - if node_dumper is None: - node_dumper = ProcessNodeYamlDumper() - - # Need to dump for parent ProcessNode, as well, otherwise no metadata file in parent ProcessNode directory - node_dumper.dump_yaml(process_node=process_node, output_path=output_path.resolve()) - - # This seems a bit duplicated, but if the logic for checking the types should be contained in the recursive - # `process_dump` function called by `verdi`, then I need to dump for a `CalcJob` here, as - # well. Also, if I want to be able to use `process_dump` via the Python API - if isinstance(process_node, (CalcFunctionNode, CalcJobNode)): - calculation_node_dump( - calcjob_node=process_node, - output_path=output_path, - include_inputs=include_inputs, - node_dumper=node_dumper, - overwrite=overwrite, - flat=flat, - ) + This function is not called for the sub-calls of `calcjob_node_dump` or during the recursive `process_dump` as it + just creates the default parent folder for the dumping, if no name is given. - elif isinstance(process_node, WorkflowNode): - called_links = process_node.base.links.get_outgoing(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all() - - # If multiple CalcJobs contained in Workchain flat=True doesn't make sense as files would be overwritten - # -> Well, if different CalcJobs run, it could still make sense, but would one really want all these files in - # one flat directory? - called_calcjobs = [isinstance(node, CalcJobNode) for node in process_node.called_descendants] - if flat and called_calcjobs.count(True) > 1: - # Add error message here or when capturing `NotImplementedError` - raise NotImplementedError - - sorted_called_links = sorted(called_links, key=lambda link_triple: link_triple.node.ctime) - - for index, link_triple in enumerate(sorted_called_links, start=1): - child_node = link_triple.node - # if not flat: - child_label = generate_node_input_label(index=index, link_triple=link_triple, flat=flat) - # else: - # child_label = '' - - child_output_path = output_path.resolve() / child_label - - # Recursive function call for `WorkChainNode`` - # Not sure if the next two cases work for `WorkFunction` and `CalcFuncion``Node`s - if isinstance(child_node, WorkflowNode): - process_node_dump( - process_node=child_node, - output_path=child_output_path, - include_inputs=include_inputs, - node_dumper=node_dumper, - overwrite=overwrite, - flat=flat, - ) + :param process_node: The `ProcessNode` for which the directory is created. + :return: The created parent dump path. + """ - elif isinstance(child_node, CalculationNode): - calculation_node_dump( - calcjob_node=child_node, - output_path=child_output_path, - include_inputs=include_inputs, - node_dumper=node_dumper, - overwrite=overwrite, - flat=flat, - ) + try: + return Path(f'dump-{process_node.process_label}-{process_node.pk}').resolve() + except AttributeError: + # ? This case came up during testing, not sure how relevant it actually is + return Path(f'dump-{process_node.process_type}-{process_node.pk}').resolve() + # ? Add type hints here? 
Would require loading from ORM in header of `cmd_` file -> Might fail CLI time validation + def generate_parent_readme(self): + """Generate README file in main dumping directory. -def calculation_node_inputs_dump(calculation_node: CalculationNode, output_path: Path, flat: bool = False): - """Dump inputs of a `CalcJobNode`. + :param process_node: CalcJob or WorkChain Node. + :param output_path: Output path for dumping. - :param calcjob_node: The `CalcJobNode` whose inputs will be dumped. - :param output_path: The path where the inputs will be dumped. - :param flat: Dump node inputs in a flat directory structure. - """ + """ - input_node_triples = calculation_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC) + import textwrap - for input_node_triple in input_node_triples: - # Select only repositories that actually hold objects - # todo: Could make this a separate function - # Here, the check for repository could also serve a + from aiida.cmdline.utils.ascii_vis import format_call_graph + from aiida.cmdline.utils.common import ( + get_calcjob_report, + get_node_info, + get_process_function_report, + get_workchain_report, + ) - input_node_path = generate_input_node_path(input_node_triple=input_node_triple, parent_path=output_path, flat=flat) + _readme_string = textwrap.dedent( + f"""\ + This directory contains the files involved in the simulation/workflow `{self.parent_process.process_label} <{self.parent_process.pk}>` run with AiiDA. - input_node_triple.node.base.repository.copy_tree(input_node_path.resolve()) + Child simulations/workflows (also called `CalcJob`s and `WorkChain`s in AiiDA jargon) run by the parent workflow are + contained in the directory tree as sub-folders and are sorted by their creation time. The directory tree thus + mirrors the logical execution of the workflow, which can also be queried by running `verdi process status + {self.parent_process.pk}` on the command line. + By default, input and output files of each simulation can be found in the corresponding "raw_inputs" and + "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution + settings). Additional input files (depending on the type of calculation) are placed in the "node_inputs". 
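
For orientation, the default directory tree for a two-step workflow could look as follows (all names are
illustrative; child directories follow the `{index:02d}-{link_label}-{process_label}` labelling scheme):

    dump-SomeWorkChain-1234/
    ├── README
    ├── .aiida_node_metadata.yaml
    ├── 01-first_step-SomeCalculation/
    │   ├── .aiida_node_metadata.yaml
    │   ├── raw_inputs/
    │   ├── raw_outputs/
    │   └── node_inputs/
    └── 02-second_step-AnotherCalculation/
        └── ...
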
-def generate_input_node_path(input_node_triple, parent_path, flat, exception_labels: list | None = None): + Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant AiiDA + node data for further inspection.""" # noqa: E501 + ) - input_node = input_node_triple.node - link_label = input_node_triple.link_label + # `verdi process status` + process_status = format_call_graph(calc_node=self.parent_process, max_depth=None, call_link_label=True) + _readme_string += f'\n\nOutput of `verdi process status`\n\n{process_status}' + + # `verdi process report` + # Copied over from `cmd_process` + if isinstance(self.parent_process, CalcJobNode): + process_report = get_calcjob_report(self.parent_process) + elif isinstance(self.parent_process, WorkChainNode): + process_report = get_workchain_report( + self.parent_process, levelname='REPORT', indent_size=2, max_depth=None + ) + elif isinstance(self.parent_process, (CalcFunctionNode, WorkFunctionNode)): + process_report = get_process_function_report(self.parent_process) + else: + process_report = f'Nothing to show for node type {self.parent_process.__class__}' - if exception_labels is None: - exception_labels = ['pseudos'] + _readme_string += f'\n\nOutput of `verdi process report`\n\n{process_report}' - if len(input_node.base.repository.list_objects()) > 0: - # Empty repository, so it should be standard AiiDA data types, like Int, Float, etc. - aiida_nodes_subdir = '' - else: - aiida_nodes_subdir = '.aiida_nodes' + # `verdi process show`? + process_show = get_node_info(node=self.parent_process) + _readme_string += f'\n\nOutput of `verdi process show`\n\n{process_show}' - # ? The check if the link_label starts with pseudo is again very specific for the atomistic community/QE, - # ? however, I don't know how to otherwise avoid that it's put in `.aiida_nodes`, as the Node is defined as - # ? Data, not UpfData, so I cannot just check against FILE_NODES - if isinstance(input_node, FILE_NODES) or any(link_label.startswith(label) for label in exception_labels): - if not flat: - input_node_path = parent_path / Path(*link_label.split('__')) - else: - # Don't use link_label at all -> But, relative path inside FolderData is retained - input_node_path = parent_path - elif not flat: - input_node_path = parent_path / aiida_nodes_subdir / Path(*link_label.split('__')) - else: - # Don't use link_label at all -> But, relative path inside FolderData is retained - input_node_path = parent_path / aiida_nodes_subdir - - return input_node_path.resolve() + (self.parent_path / 'README').write_text(_readme_string) diff --git a/tests/conftest.py b/tests/conftest.py index fc5dc16429..c06a8e540b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -161,7 +161,14 @@ def generate_calculation_node(): """Generate an instance of a `CalculationNode`.""" from aiida.engine import ProcessState - def _generate_calculation_node(process_state=ProcessState.FINISHED, exit_status=None, entry_point=None, inputs: dict | None = None, outputs: dict | None = None, repository: str | Path = None): + def _generate_calculation_node( + process_state=ProcessState.FINISHED, + exit_status=None, + entry_point=None, + inputs: dict | None = None, + outputs: dict | None = None, + repository: str | pathlib.Path = None, + ): """Generate an instance of a `CalculationNode`.. 
:param process_state: state to set diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index beccb7078d..c4a46e5ef3 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -19,7 +19,7 @@ import pytest from aiida.tools.dumping.processes import ( - calculation_node_dump, + _calculation_node_dump, calculation_node_inputs_dump, generate_default_dump_path, generate_node_input_label, @@ -139,7 +139,7 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): # └── raw_outputs # └── file.txt - calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path) + _calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path) assert (dump_parent_path / default_dump_paths[0] / filename).is_file() assert (dump_parent_path / default_dump_paths[1] / filename).is_file() @@ -162,7 +162,7 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): # └── raw_outputs_ # └── file.txt - calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths) + _calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths) assert (dump_parent_path / custom_dump_paths[0] / filename).is_file() # raw_inputs assert (dump_parent_path / custom_dump_paths[1] / filename).is_file() # raw_outputs # node_inputs, singlefile @@ -177,7 +177,7 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): # └── relative_path # └── file.txt - calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, flat=True) + _calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, flat=True) assert not (dump_parent_path / default_dump_paths[0] / filename).is_file() # raw_inputs assert not (dump_parent_path / default_dump_paths[1] / filename).is_file() # raw_outputs assert not (dump_parent_path / default_dump_paths[2] / filename).is_file() # node_inputs, singlefile @@ -202,7 +202,7 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): # todo: Test case of splitting the nested node_inputs based on double-underscore splitting not covered with the test # todo: setup. This might be again too specific for QE? 
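
Schematically, the mapping between `flat`, a custom `io_dump_paths`, and the directories used for the dump that these
tests exercise reduces to the following (a simplified sketch of the intended behaviour, not the actual
implementation):

    def resolve_io_dump_paths(flat: bool, io_dump_paths: list | None) -> list:
        """Schematic mapping from (flat, io_dump_paths) to dump subdirectories."""
        defaults = ['raw_inputs', 'raw_outputs', 'node_inputs']
        if io_dump_paths is not None:
            return io_dump_paths  # explicitly provided paths take precedence
        return ['', '', ''] if flat else defaults  # flat dumps everything into one directory
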
- calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths, flat=True) + _calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths, flat=True) assert (dump_parent_path / custom_dump_paths[0] / filename).is_file() # raw_inputs assert (dump_parent_path / custom_dump_paths[1] / filename).is_file() # raw_outputs assert (dump_parent_path / custom_dump_paths[2] / filename).is_file() # node_inputs, singlefile @@ -210,20 +210,20 @@ def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): clean_tmp_path(tmp_path=tmp_path) # Don't dump the connected node inputs for both, flat is True/False - calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, include_inputs=False) + _calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, include_inputs=False) assert not (dump_parent_path / custom_dump_paths[2]).is_dir() clean_tmp_path(tmp_path=tmp_path) - calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, include_inputs=False, flat=True) + _calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, include_inputs=False, flat=True) assert not (dump_parent_path / custom_dump_paths[2]).is_dir() clean_tmp_path(tmp_path=tmp_path) # Check that it fails when it tries to create the same directory without overwrite=True - calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, overwrite=False) + _calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, overwrite=False) with pytest.raises(FileExistsError): - calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, overwrite=False) + _calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, overwrite=False) def test_calcjob_dump_arithmetic_add(tmp_path, aiida_localhost, generate_arithmetic_add_node): @@ -233,7 +233,7 @@ def test_calcjob_dump_arithmetic_add(tmp_path, aiida_localhost, generate_arithme add_node = generate_arithmetic_add_node(computer=aiida_localhost) # Normal dumping of ArithmeticAddCalculation node - calculation_node_dump(calcjob_node=add_node, output_path=dump_path) + _calculation_node_dump(calcjob_node=add_node, output_path=dump_path) raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] From f7a3f00ea91fe4221d910abe5f983d89b0a96fcb Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Mon, 29 Apr 2024 11:41:08 +0200 Subject: [PATCH 16/30] All functionality of the Python API fully tested --- src/aiida/cmdline/commands/cmd_process.py | 41 +- src/aiida/cmdline/params/options/main.py | 12 +- src/aiida/tools/dumping/processes.py | 382 ++++++----- tests/cmdline/commands/test_process.py | 58 +- tests/conftest.py | 150 ++--- tests/test_conftest.py | 4 + tests/tools/dumping/test_processes.py | 774 +++++++++++++--------- 7 files changed, 802 insertions(+), 619 deletions(-) diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index 2d2979dc5b..0e0eaf287e 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -486,12 +486,25 @@ def process_repair(manager, broker, dry_run): @verdi_process.command('dump') @arguments.PROCESSES() @options.PATH() +@click.option( + '-f', + '--flat', + 'flat', + is_flag=True, + default=False, + help='Dump files in a flat directory 
for every step of the workflow.', +) +@click.option( + '-a', + '--all-aiida-nodes', + is_flag=True, + default=False, + help='Dump also non file-based AiiDA nodes. This can generate quite a lot of files, so use with caution.', +) +@options.OVERWRITE() @options.INCLUDE_INPUTS() @options.INCLUDE_ATTRIBUTES() @options.INCLUDE_EXTRAS() -@options.OVERWRITE() -@click.option('-f', '--flat', 'flat', is_flag=True, default=False, help='Dump all the files in one location.') -@click.option('-a', '--all-aiida-nodes', is_flag=True, default=False, help='Dump all non file-based AiiDA nodes.') def process_dump( processes, path, @@ -509,19 +522,17 @@ def process_dump( mirrors the logical execution of the workflow, which can also be queried by running `verdi process status ` on the command line. - By default, input and output files of each simulation can be found in the corresponding "raw_inputs" and + By default, input and output files of each calculation can be found in the corresponding "raw_inputs" and "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution - settings). Additional input files (depending on the type of calculation) are placed in the "node_inputs". + settings). Additional input files (depending on the type of calculation) are placed in the "extra_inputs". Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant AiiDA node data for further inspection. """ - from aiida.tools.dumping.processes import ProcessDumper for process in processes: - # Generate default parent folder process_dumper = ProcessDumper( parent_process=process, @@ -538,9 +549,11 @@ def process_dump( else: output_path = path.resolve() + process_dumper.parent_path = output_path + # Capture `FileExistsError` here already, not by trying to run the dumping try: - process_dumper.dump_path_validate_make(validate_path=output_path) + process_dumper.validate_make_dump_path(validate_path=output_path) except FileExistsError: echo.echo_critical(f'Path `{output_path}` already exists and overwrite set to False.') @@ -550,15 +563,15 @@ def process_dump( # ? Which exceptions do I expect here? except FileExistsError: echo.echo_critical('Some files present in the dumping directory. Delete manually and try again.') - except NotImplementedError: - echo.echo_critical('flat dumping not supported for `WorkChain`s that call more than one `CalcJob`.') + # except NotImplementedError: + # echo.echo_critical('flat dumping not supported for `WorkChain`s that call more than one `CalcJob`.') except Exception as e: echo.echo_critical(f'Unexpected error ({e!s}) while dumping {process.__class__.__name__} <{process.pk}>.') - echo.echo_success( - f'Raw files for {process.__class__.__name__} <{process.pk}> dumped successfully in `{output_path}`.' - ) - # Create README in parent directory. Do this at the end as to not cause exceptions for the path creation, and # only do it when everything ran through fine before process_dumper.generate_parent_readme() + + echo.echo_success( + f'Raw files for {process.__class__.__name__} <{process.pk}> dumped successfully in `{output_path}`.' 
+ ) diff --git a/src/aiida/cmdline/params/options/main.py b/src/aiida/cmdline/params/options/main.py index 14c27d0471..74b6cf2ab1 100644 --- a/src/aiida/cmdline/params/options/main.py +++ b/src/aiida/cmdline/params/options/main.py @@ -756,7 +756,7 @@ def set_log_level(_ctx, _param, value): '--path', type=click.Path(path_type=pathlib.Path), show_default=False, - help='The directory in which the dumping folder will be created.', + help='Parent directory for the dumping.', ) INCLUDE_INPUTS = OverridableOption( @@ -768,18 +768,16 @@ def set_log_level(_ctx, _param, value): INCLUDE_ATTRIBUTES = OverridableOption( # '-a', - '--include-attributes', - is_flag=True, - default=False, + '--include-attributes/--exclude-attributes', + default=True, show_default=True, help='Include attributes in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', ) INCLUDE_EXTRAS = OverridableOption( # '-e', - '--include-extras', - is_flag=True, - default=False, + '--include-extras/--exclude-extras', + default=True, show_default=True, help='Include extras in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', ) diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 444b2124c9..0b7cf09f94 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -12,6 +12,7 @@ import logging from pathlib import Path +from typing import Any, List, Optional import yaml @@ -22,22 +23,23 @@ CalculationNode, FolderData, ProcessNode, - RemoteData, SinglefileData, WorkChainNode, WorkflowNode, WorkFunctionNode, ) -from aiida.orm.utils import LinkTriple -from aiida.repository import File +from aiida.orm.utils import LinkManager, LinkTriple -FILE_NODES = (SinglefileData, FolderData, RemoteData) +# Include RemoteData here? -> Don't think so +FILE_CLASSES = (SinglefileData, FolderData) +LOGGER = logging.getLogger(__name__) class ProcessDumper: def __init__( self, - parent_process: ProcessNode, + parent_process: Optional[ProcessNode] = None, + parent_path: Optional[Path] = None, include_node_inputs: bool = True, include_attributes: bool = True, include_extras: bool = True, @@ -53,9 +55,11 @@ def __init__( self.flat = flat self.all_aiida_nodes = all_aiida_nodes - self.logger = logging.getLogger(__name__) - - self.parent_path = self.generate_default_dump_path(process_node=self.parent_process) + # Automatically determine parent_path on instantiation if `parent_process` is set + if parent_path is None and parent_process is not None: + self.parent_path = self.generate_default_dump_path(process_node=self.parent_process) + elif parent_path is not None: + self.parent_path = parent_path def dump( self, @@ -68,7 +72,7 @@ def dump( Note that if an outgoing link is again a `WorkChainNode`, the function recursively calls itself, while files are only actually created when a `CalcJobNode` is reached. - :param process_node: The parent process node to be dumped. It can be either a `WorkChainNode` or a `CalcJobNode`. + :param process_node: The parent process node to be dumped. It can be either `WorkChainNode` or `CalcJobNode` :param output_path: The main output path where the directory tree will be created. :param include_inputs: If True, include file or folder inputs in the dump. Defaults to True. :param node_dumper: The ProcessNodeYamlDumper instance to use for dumping node metadata. 
If not provided, a new
@@ -79,7 +83,7 @@ def dump(
             output_path = self.generate_default_dump_path(process_node=process_node)

         try:
-            self.dump_path_validate_make(validate_path=output_path)
+            self.validate_make_dump_path(validate_path=output_path)
         except:
             raise

@@ -87,10 +91,9 @@
         # `dump` function called by `verdi`, then I need to dump for the `CalcFunction` here already, as well.
         self.dump_node_yaml(process_node=process_node, output_path=output_path)
         if isinstance(process_node, CalcFunctionNode):
-            self.dump_calculation_node(
+            self._dump_calculation(
                 calculation_node=process_node,
                 output_path=output_path,
-                include_inputs=self.include_node_inputs,
                 io_dump_paths=io_dump_paths,
             )

@@ -103,7 +106,7 @@
         for index, link_triple in enumerate(sorted_called_links, start=1):
             child_node = link_triple.node
-            child_label = self.generate_calcjob_input_node_label(index=index, link_triple=link_triple)
+            child_label = self.generate_child_node_label(index=index, link_triple=link_triple)
             child_output_path = output_path.resolve() / child_label

             # Recursive function call for `WorkflowNode`
@@ -115,12 +118,12 @@
             # Once a `CalculationNode` is reached as a child, dump it
             elif isinstance(child_node, CalculationNode):
-                self.dump_calculation_node(
+                self._dump_calculation(
                     calculation_node=child_node,
                     output_path=child_output_path,
                 )

-    def dump_calculation_node(
+    def _dump_calculation(
         self,
         calculation_node: CalculationNode,
         output_path: Path | None,
@@ -139,117 +142,65 @@
         output_path = self.generate_default_dump_path(process_node=calculation_node)

         try:
-            self.dump_path_validate_make(validate_path=output_path)
+            self.validate_make_dump_path(validate_path=output_path)
         except:
             # raise same exception here to communicate it outwards
             raise

-        if io_dump_paths is None:
-            io_dump_paths = self.generate_calculation_io_dump_paths(calculation_io_dump_paths=io_dump_paths)
+        self.dump_node_yaml(process_node=calculation_node, output_path=output_path)
+
+        io_dump_mapping = self.generate_calculation_io_mapping(io_dump_paths=io_dump_paths)

-        # Dump the raw_inputs
+        # Dump the repository contents of the node
         # ? Rename this to node_repository or something -> Introduces AiiDA terminology.But as we provide the option to
         # ?
dump *all* the outputs, we should also provide the option to dump *all* the inputs, not just `node_inputs` - calculation_node.base.repository.copy_tree(output_path.resolve() / io_dump_paths[0]) + calculation_node.base.repository.copy_tree(output_path.resolve() / io_dump_mapping.repository) - # Dump the raw_outputs - output_nodes = [calculation_node.outputs[output] for output in calculation_node.outputs] - for output_node in output_nodes: - if isinstance(output_node, FILE_NODES): - output_node.base.repository.copy_tree(output_path.resolve() / io_dump_paths[1]) - elif self.all_aiida_nodes: - output_node.base.repository.copy_tree(output_path.resolve() / io_dump_paths[1] / '.aiida_nodes') - - # Dump the node_inputs + # Dump the extra_inputs if self.include_node_inputs: input_node_triples = calculation_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC) + self._dump_calculation_io(parent_path=output_path / io_dump_mapping.inputs, node_triples=input_node_triples) - for input_node_triple in input_node_triples: - input_node_path = self.generate_calculation_input_node_path( - input_node_triple=input_node_triple, - parent_path=output_path / io_dump_paths[2], - ) - - # No .resolve() required as that done in `generate_calcjob_input_node_path` - input_node_triple.node.base.repository.copy_tree(input_node_path) - - self.dump_node_yaml(process_node=calculation_node, output_path=output_path) - - def dump_node_yaml( - self, - process_node: ProcessNode, - output_path: Path, - output_filename: str = '.aiida_node_metadata.yaml', - ) -> None: - """Dump the selected `ProcessNode` properties, attributes, and extras to a yaml file. - - :param process_node: The ProcessNode to dump. - :param output_path: The path to the directory where the yaml file will be saved. - :param output_filename: The name of the output yaml file. Defaults to `.aiida_node_metadata.yaml`. 
- :return: None - """ - - _node_properties = [ - 'label', - 'description', - 'pk', - 'uuid', - 'ctime', - 'mtime', - 'node_type', - 'process_type', - 'is_finished_ok', - ] - - _user_properties = ('first_name', 'last_name', 'email', 'institution') - - _computer_properties = ('label', 'hostname', 'scheduler_type', 'transport_type') - - node_dict = {} - metadata_dict = {} - - # Add actual node `@property`s to dictionary - for metadata_property in _node_properties: - metadata_dict[metadata_property] = getattr(process_node, metadata_property) - - node_dict['Node data'] = metadata_dict + # Dump the raw_outputs + output_node_triples = calculation_node.base.links.get_outgoing(link_type=LinkType.CREATE) + self._dump_calculation_io( + parent_path=output_path / io_dump_mapping.outputs, + node_triples=output_node_triples, + ) - # Add user data - try: - node_dbuser = process_node.user - user_dict = {} - for user_property in _user_properties: - user_dict[user_property] = getattr(node_dbuser, user_property) - node_dict['User data'] = user_dict - except AttributeError: - pass + def _dump_calculation_io(self, parent_path: Path, node_triples: LinkManager): + # if exception_paths is None: + # exception_paths = ['/pseudos/'] - # Add computer data - try: - node_dbcomputer = process_node.computer - computer_dict = {} - for computer_property in _computer_properties: - computer_dict[computer_property] = getattr(node_dbcomputer, computer_property) - node_dict['Computer data'] = computer_dict - except AttributeError: - pass - - # Add node attributes - if self.include_attributes: - node_attributes = process_node.base.attributes.all - node_dict['Node attributes'] = node_attributes + for node_triple in node_triples: + node = node_triple.node + if isinstance(node, FILE_CLASSES) or any(issubclass(type(node), cls) for cls in FILE_CLASSES): + file_node_path = self.generate_link_triple_dump_path( + link_triple=node_triple, + parent_path=parent_path, + ) - # Add node extras - if self.include_extras: - node_extras = process_node.base.extras.all - if node_extras: - node_dict['Node extras'] = node_extras + # No .resolve() required as that done in `generate_calcjob_input_node_path` + node_triple.node.base.repository.copy_tree(file_node_path) - output_file = output_path.resolve() / output_filename - with open(output_file, 'w') as handle: - yaml.dump(node_dict, handle, sort_keys=False) + else: + aiida_node_path = self.generate_link_triple_dump_path( + link_triple=node_triple, + parent_path=parent_path / '.aiida_nodes', + ) - def dump_path_validate_make(self, validate_path: Path, safeguard_file: str = '.aiida_node_metadata.yaml') -> Path: + # This is again QE specific, but, frankly, I don't know how to otherwise separate the pseudos from the + # rest of the AiiDA-nodes, as the pseudos are of `Data` type (why not UpfData?), so I cannot distinguish + # them from other AiiDA-nodes, such as ArrayData which we definitely want in the hidden `.aiida_nodes` + # subdirectory + # So if anybody has a better solution, I'd be happy to use that + # The problem might be void, though, once all the atomistic code is moved to `aiida-atomistic` + if node.node_type == 'data.pseudo.upf.UpfData.': + node_triple.node.base.repository.copy_tree(Path(str(aiida_node_path).replace('.aiida_nodes', ''))) + elif self.all_aiida_nodes: + node_triple.node.base.repository.copy_tree(aiida_node_path) + + def validate_make_dump_path(self, validate_path: Path, safeguard_file: str = '.aiida_node_metadata.yaml') -> Path: """ Create default dumping directory for a given 
process node and return it as absolute path. @@ -271,7 +222,7 @@ def dump_path_validate_make(self, validate_path: Path, safeguard_file: str = '.a # Check for safeguard file ('.aiida_node_metadata.yaml') for safety # If present -> Remove directory elif (validate_path / safeguard_file).is_file(): - self.logger.info(f'Overwrite set to true, will overwrite directory `{validate_path}`.') + LOGGER.info(f'Overwrite set to true, will overwrite directory `{validate_path}`.') shutil.rmtree(validate_path) # Existing and non-empty directory and overwrite True @@ -289,64 +240,104 @@ def dump_path_validate_make(self, validate_path: Path, safeguard_file: str = '.a return validate_path.resolve() - def generate_calculation_io_dump_paths(self, calculation_io_dump_paths: list | None = None): - default_calculation_io_dump_paths = ['raw_inputs', 'raw_outputs', 'node_inputs'] - empty_calculation_io_dump_paths = ['', '', ''] + def generate_default_dump_path(self, process_node: ProcessNode | None) -> Path: + """Simple helper function to generate the default parent-dumping directory if none given. + + This function is not called for the sub-calls of `calcjob_node_dump` or during the recursive `process_dump` as + it just creates the default parent folder for the dumping, if no name is given. - if self.flat and calculation_io_dump_paths is None: - self.logger.info( + :param process_node: The `ProcessNode` for which the directory is created. + :return: The created parent dump path. + """ + if process_node is None: + raise TypeError('`process_node` must be provided for generating the default path.') + else: + pk = process_node.pk + try: + return Path(f'dump-{process_node.process_label}-{pk}') + except AttributeError: + # ? This case came up during testing, not sure how relevant it actually is + return Path(f'dump-{process_node.process_type}-{pk}') + + def generate_calculation_io_mapping(self, io_dump_paths: Optional[List[Any]] = None): + # Could turn this into a dict/mapping and use as labels what the entities actually refer to + # Don't use AiiDA terminology directly as it might be confusing for other users who are mainly targeted for the + # dumping + # ? Could move this outside of class and just pass flat, and not set the logger as a class attribute + + from types import SimpleNamespace + + aiida_entities_to_dump = ['repository', 'inputs', 'outputs'] + default_calculation_io_dump_paths = ['raw_inputs', 'extra_inputs', 'raw_outputs'] + empty_calculation_io_dump_paths = [''] * 3 + + if self.flat and io_dump_paths is None: + LOGGER.info( 'Flat set to True and no `io_dump_paths`. Dumping in a flat directory, files might be overwritten.' ) - return empty_calculation_io_dump_paths - elif self.flat and calculation_io_dump_paths is not None: - self.logger.info( - 'Flat set to True but `io_dump_paths` provided. These will be used, but `node_inputs` not nested.' + return SimpleNamespace(**dict(zip(aiida_entities_to_dump, empty_calculation_io_dump_paths))) + + elif not self.flat and io_dump_paths is None: + LOGGER.info( + 'Flat set to False but no `io_dump_paths` provided. ' + + f'Will use the defaults {default_calculation_io_dump_paths}.' ) - return default_calculation_io_dump_paths - elif not self.flat and calculation_io_dump_paths is None: - self.logger.info( - f'Flat set to False but no `io_dump_paths` provided. Will use the defaults {default_calculation_io_dump_paths}.' 
+            return SimpleNamespace(**dict(zip(aiida_entities_to_dump, default_calculation_io_dump_paths)))
+
+        elif self.flat and io_dump_paths is not None:
+            LOGGER.info(
+                'Flat set to True but `io_dump_paths` provided. These will be used, but `node_inputs` not nested.'
             )
-            return default_calculation_io_dump_paths
+            return SimpleNamespace(**dict(zip(aiida_entities_to_dump, io_dump_paths)))

         else:
-            self.logger.info(
-                'Flat set to False but no `io_dump_paths` provided. These will be used, but `node_inputs` flattened.'
+            LOGGER.info(
+                'Flat set to False and `io_dump_paths` provided. These will be used with `node_inputs` nested.'
             )
-            return empty_calculation_io_dump_paths
-
-    def generate_calculation_input_node_path(
-        self, input_node_triple, parent_path, exception_labels: list | None = None
-    ) -> Path:
-        input_node = input_node_triple.node
-        link_label = input_node_triple.link_label
+            return SimpleNamespace(**dict(zip(aiida_entities_to_dump, io_dump_paths)))  # type: ignore[arg-type]

-        if exception_labels is None:
-            exception_labels = ['pseudos']
-
-        if len(input_node.base.repository.list_objects()) > 0:
-            # Empty repository, so it should be standard AiiDA data types, like Int, Float, etc.
-            aiida_nodes_subdir = ''
-        else:
-            aiida_nodes_subdir = '.aiida_nodes'
+    def generate_link_triple_dump_path(self, link_triple: LinkTriple, parent_path: Path) -> Path:
+        node = link_triple.node
+        link_label = link_triple.link_label
+        # For convenience, remove the 'retrieved' subdirectory for the outputs
+        link_label = link_label.replace('retrieved', '')
+
+        # ? Should this only be applied to `CalculationNode`s?
+        # The separation of file-based nodes from other AiiDA nodes (which end up in the hidden `.aiida_nodes`
+        # subdirectory) is now handled outside, before this function is called.
-        if isinstance(input_node, FILE_NODES) or any(link_label.startswith(label) for label in exception_labels):
+        if isinstance(node, FILE_CLASSES):
             if not self.flat:
                 input_node_path = parent_path / Path(*link_label.split('__'))
             else:
                 # Don't use link_label at all -> But, relative path inside FolderData is retained
                 input_node_path = parent_path
         elif not self.flat:
-            input_node_path = parent_path / aiida_nodes_subdir / Path(*link_label.split('__'))
+            input_node_path = parent_path / Path(*link_label.split('__'))
         else:
             # Don't use link_label at all -> But, relative path inside FolderData is retained
-            input_node_path = parent_path / aiida_nodes_subdir
+            input_node_path = parent_path

         return input_node_path.resolve()

-    def generate_calcjob_input_node_label(self, index: int, link_triple: LinkTriple) -> str:
-        """Small helper function to generate the directory label for node inputs."""
+    def generate_child_node_label(self, index: int, link_triple: LinkTriple) -> str:
+        """Small helper function to generate the directory label for a child node."""
         node = link_triple.node
         link_label = link_triple.link_label
@@ -367,30 +358,89 @@ def generate_calcjob_input_node_label(self, index: int, link_triple: LinkTriple)
         label_list += [process_type]
         # node_label += f'-{process_type}'

-        if isinstance(node, File):
-            label_list += [node.name]

         node_label = '-'.join(label_list)

         # `CALL-` as part of the link labels also for MultiplyAddWorkChain -> Seems general enough, so remove
         node_label = node_label.replace('CALL-', '')
+        node_label = node_label.replace('None-', '')

         return node_label

-    def generate_default_dump_path(self, process_node: ProcessNode) -> Path:
-        """Simple helper function to generate the default parent-dumping directory if none given.
-
-        This function is not called for the sub-calls of `calcjob_node_dump` or during the recursive `process_dump` as it
-        just creates the default parent folder for the dumping, if no name is given.
+    def dump_node_yaml(
+        self,
+        process_node: ProcessNode,
+        output_path: Path,
+        output_filename: str = '.aiida_node_metadata.yaml',
+    ) -> None:
+        """Dump the selected `ProcessNode` properties, attributes, and extras to a yaml file.

-        :param process_node: The `ProcessNode` for which the directory is created.
-        :return: The created parent dump path.
+        :param process_node: The ProcessNode to dump.
+        :param output_path: The path to the directory where the yaml file will be saved.
+        :param output_filename: The name of the output yaml file. Defaults to `.aiida_node_metadata.yaml`.
+ :return: None """ + _node_properties = [ + 'label', + 'description', + 'pk', + 'uuid', + 'ctime', + 'mtime', + 'node_type', + 'process_type', + 'is_finished_ok', + ] + + _user_properties = ('first_name', 'last_name', 'email', 'institution') + + _computer_properties = ('label', 'hostname', 'scheduler_type', 'transport_type') + + node_dict = {} + metadata_dict = {} + + # Add actual node `@property`s to dictionary + for metadata_property in _node_properties: + metadata_dict[metadata_property] = getattr(process_node, metadata_property) + + node_dict['Node data'] = metadata_dict + + # Add user data try: - return Path(f'dump-{process_node.process_label}-{process_node.pk}').resolve() + node_dbuser = process_node.user + user_dict = {} + for user_property in _user_properties: + user_dict[user_property] = getattr(node_dbuser, user_property) + node_dict['User data'] = user_dict except AttributeError: - # ? This case came up during testing, not sure how relevant it actually is - return Path(f'dump-{process_node.process_type}-{process_node.pk}').resolve() + pass + + # Add computer data + try: + node_dbcomputer = process_node.computer + computer_dict = {} + for computer_property in _computer_properties: + computer_dict[computer_property] = getattr(node_dbcomputer, computer_property) + node_dict['Computer data'] = computer_dict + except AttributeError: + pass + + # Add node attributes + if self.include_attributes: + node_attributes = process_node.base.attributes.all + node_dict['Node attributes'] = node_attributes + + # Add node extras + if self.include_extras: + node_extras = process_node.base.extras.all + if node_extras: + node_dict['Node extras'] = node_extras + + output_file = output_path.resolve() / output_filename + with open(output_file, 'w') as handle: + yaml.dump(node_dict, handle, sort_keys=False) # ? Add type hints here? Would require loading from ORM in header of `cmd_` file -> Might fail CLI time validation def generate_parent_readme(self): @@ -411,18 +461,21 @@ def generate_parent_readme(self): get_workchain_report, ) + if self.parent_process is None or self.parent_path is None: + raise TypeError('parent_process and parent_path must be set before README can be created.') + _readme_string = textwrap.dedent( f"""\ - This directory contains the files involved in the simulation/workflow `{self.parent_process.process_label} <{self.parent_process.pk}>` run with AiiDA. + This directory contains the files involved in the calculation/workflow `{self.parent_process.process_label} <{self.parent_process.pk}>` run with AiiDA. - Child simulations/workflows (also called `CalcJob`s and `WorkChain`s in AiiDA jargon) run by the parent workflow are + Child calculations/workflows (also called `CalcJob`s and `WorkChain`s in AiiDA jargon) run by the parent workflow are contained in the directory tree as sub-folders and are sorted by their creation time. The directory tree thus mirrors the logical execution of the workflow, which can also be queried by running `verdi process status {self.parent_process.pk}` on the command line. By default, input and output files of each simulation can be found in the corresponding "raw_inputs" and "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution - settings). Additional input files (depending on the type of calculation) are placed in the "node_inputs". + settings). Additional input files (depending on the type of calculation) are placed in the "extra_inputs". 
Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant AiiDA node data for further inspection.""" # noqa: E501 @@ -430,7 +483,7 @@ def generate_parent_readme(self): # `verdi process status` process_status = format_call_graph(calc_node=self.parent_process, max_depth=None, call_link_label=True) - _readme_string += f'\n\nOutput of `verdi process status`\n\n{process_status}' + _readme_string += f'\n\n\nOutput of `verdi process status {self.parent_process.pk}:`\n\n{process_status}' # `verdi process report` # Copied over from `cmd_process` @@ -445,10 +498,11 @@ def generate_parent_readme(self): else: process_report = f'Nothing to show for node type {self.parent_process.__class__}' - _readme_string += f'\n\nOutput of `verdi process report`\n\n{process_report}' + _readme_string += f'\n\n\nOutput of `verdi process report {self.parent_process.pk}`:\n\n{process_report}' # `verdi process show`? process_show = get_node_info(node=self.parent_process) - _readme_string += f'\n\nOutput of `verdi process show`\n\n{process_show}' + _readme_string += f'\n\n\nOutput of `verdi process show {self.parent_process.pk}`:\n\n{process_show}' - (self.parent_path / 'README').write_text(_readme_string) + with (self.parent_path / 'README').open('w') as handle: + handle.write(_readme_string) diff --git a/tests/cmdline/commands/test_process.py b/tests/cmdline/commands/test_process.py index 0a7bd65f1d..fa22358edf 100644 --- a/tests/cmdline/commands/test_process.py +++ b/tests/cmdline/commands/test_process.py @@ -22,6 +22,7 @@ from aiida.engine import Process, ProcessState from aiida.engine.processes import control as process_control from aiida.orm import CalcJobNode, Group, WorkChainNode, WorkflowNode, WorkFunctionNode +from aiida.cmdline.utils.echo import ExitCode from tests.utils.processes import WaitProcess @@ -336,28 +337,43 @@ def test_report(self, run_cli_command): assert len(result.output_lines) == 1, result.output_lines assert result.output_lines[0] == 'No log messages recorded for this entry' - def test_process_dump(self, run_cli_command): + def test_process_dump(self, run_cli_command, tmp_path, generate_workchain_multiply_add): """Test verdi process dump""" - node = WorkflowNode().store() - # node.set_process_state(ProcessState.RUNNING) - - # # Running without identifiers should not except and not print anything - # options = [] - # result = run_cli_command(cmd_process.process_status, options) - # assert result.exception is None, result.output - # assert len(result.output_lines) == 0 - - # # Giving a single identifier should print a non empty string message - # options = [str(node.pk)] - # result = run_cli_command(cmd_process.process_status, options) - # assert result.exception is None, result.output - # assert len(result.output_lines) > 0 - - # # With max depth 0, the output should be empty - # options = ['--max-depth', 0, str(node.pk)] - # result = run_cli_command(cmd_process.process_status, options) - # assert result.exception is None, result.output - # assert len(result.output_lines) == 0 + + # Only test CLI interface here, the actual functionalities of the Python API are tested in `test_processes.py` + test_path = tmp_path / 'cli-dump' + node = generate_workchain_multiply_add() + + # Running without identifiers should not except and not print anything + options = [] + result = run_cli_command(cmd_process.process_dump, options) + assert result.exception is None, result.output + assert len(result.output_lines) == 0 + + # Giving a single identifier should 
print a non empty string message + options = [str(node.pk), '-p', str(test_path)] + result = run_cli_command(cmd_process.process_dump, options) + assert result.exception is None, result.output + assert 'Success:' in result.output + + # Trying to run the dumping again in the same path but without overwrite=True should raise exception + options = [str(node.pk), '-p', str(test_path)] + result = run_cli_command(cmd_process.process_dump, options, raises=True) + assert result.exit_code is ExitCode.CRITICAL + + # Works fine when using overwrite=True + options = [str(node.pk), '-p', str(test_path), '-o'] + result = run_cli_command(cmd_process.process_dump, options) + assert result.exception is None, result.output + assert 'Success:' in result.output + + # Set overwrite=True but provide bad directory, i.e. missing metadata file + (test_path / '.aiida_node_metadata.yaml').unlink() + + options = [str(node.pk), '-p', str(test_path), '-o'] + result = run_cli_command(cmd_process.process_dump, options, raises=True) + assert result.exit_code is ExitCode.CRITICAL + @pytest.mark.usefixtures('aiida_profile_clean') @pytest.mark.parametrize('numprocesses, percentage', ((0, 100), (1, 90))) diff --git a/tests/conftest.py b/tests/conftest.py index c06a8e540b..db31d71750 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -162,12 +162,12 @@ def generate_calculation_node(): from aiida.engine import ProcessState def _generate_calculation_node( - process_state=ProcessState.FINISHED, - exit_status=None, - entry_point=None, + process_state: ProcessState = ProcessState.FINISHED, + exit_status: int | None = None, + entry_point: str | None = None, inputs: dict | None = None, outputs: dict | None = None, - repository: str | pathlib.Path = None, + repository: pathlib.Path | None = None, ): """Generate an instance of a `CalculationNode`.. @@ -180,13 +180,41 @@ def _generate_calculation_node( if process_state is ProcessState.FINISHED and exit_status is None: exit_status = 0 - node = CalculationNode(process_type=entry_point) - node.set_process_state(process_state) + calculation_node = CalculationNode(process_type=entry_point) + calculation_node.set_process_state(process_state) if exit_status is not None: - node.set_exit_status(exit_status) + calculation_node.set_exit_status(exit_status) - return node + if repository is not None: + # ? Possibly allow for other types here, and use `put_object_from_filelike`, etc. + # ? Or use from_tree + calculation_node.base.repository.put_object_from_filelike(repository[0], repository[1]) + + # For storing, need to first store the input nodes, then the CalculationNode, then the output nodes + + if inputs is not None: + for input_label, input_node in inputs.items(): + calculation_node.base.links.add_incoming( + input_node, + link_type=LinkType.INPUT_CALC, + link_label=input_label, + ) + + input_node.store() + + if outputs is not None: + calculation_node.store() + for output_label, output_node in outputs.items(): + + output_node.base.links.add_incoming( + calculation_node, link_type=LinkType.CREATE, link_label=output_label + ) + + output_node.store() + + # ? 
Should it be stored here + return calculation_node return _generate_calculation_node @@ -683,9 +711,13 @@ def reset_log_level(): log.configure_logging(with_orm=True) +# todo: Provide option to manually construct the nodes or actually submit the processes +# todo: Depending on how long this takes, replace this duplicated code around the code base with these fixtures + @pytest.fixture -def generate_arithmetic_add_node(): - def _generate_arithmetic_add_node(computer): +def generate_calculation_node_add(aiida_localhost): + # todo: Benchmark how long running this takes vs. manually constructing it + def _generate_calculation_node_add(): from aiida.engine import run_get_node from aiida.orm import InstalledCode, Int from aiida.plugins import CalculationFactory @@ -695,19 +727,20 @@ def _generate_arithmetic_add_node(computer): add_inputs = { 'x': Int(1), 'y': Int(2), - 'code': InstalledCode(computer=computer, filepath_executable='/bin/bash'), + 'code': InstalledCode(computer=aiida_localhost, filepath_executable='/bin/bash'), } _, add_node = run_get_node(arithmetic_add, **add_inputs) return add_node - return _generate_arithmetic_add_node + return _generate_calculation_node_add @pytest.fixture -def generate_multiply_add_node(): - def _generate_multiply_add_node(computer): +def generate_workchain_multiply_add(aiida_localhost): + # todo: Benchmark how long running this takes vs. manually constructing it + def _generate_workchain_multiply_add(): from aiida.engine import run_get_node from aiida.orm import InstalledCode, Int from aiida.plugins import WorkflowFactory @@ -718,96 +751,11 @@ def _generate_multiply_add_node(computer): 'x': Int(1), 'y': Int(2), 'z': Int(3), - 'code': InstalledCode(computer=computer, filepath_executable='/bin/bash'), + 'code': InstalledCode(computer=aiida_localhost, filepath_executable='/bin/bash'), } _, multiply_add_node = run_get_node(multiplyaddworkchain, **multiply_add_inputs) return multiply_add_node - return _generate_multiply_add_node - - -@pytest.fixture -def generate_calcjob_node_io(): - def _generate_calcjob_node_io( - attach_repository: bool = True, attach_node_inputs: bool = True, attach_outputs: bool = True, entry_point=None - ): - # `entry_point='aiida.calculations:core.arithmetic.add'` leads to error due to the unknown names of the attached - # nodes, which are not defined in the `Calculation` itself - # calcjob_node = generate_calculation_node(entry_point=entry_point) - - from aiida.orm import CalcJobNode - - calcjob_node = CalcJobNode() - - filename: str = 'file.txt' - - # Attach raw inputs to node repository - if attach_repository: - calcjob_node.base.repository.put_object_from_filelike(io.StringIO('a'), path=filename) - - # Attach node inputs - if attach_node_inputs: - # Set up labels - singlefiledata_linklabel: str = 'singlefile_input' - folderdata_linklabel: str = 'folderdata_input' - - # Generate SinglefileData, Folderdata, and CalcJobNode nodes - singlefiledata_node = SinglefileData.from_string(content='a', filename=filename) - singlefiledata_node.store() - - folderdata_node = FolderData() - folderdata_node.put_object_from_filelike(io.StringIO('a'), path=pathlib.Path('relative_path') / filename) - folderdata_node.store() - - calcjob_node.base.links.add_incoming( - singlefiledata_node, - link_type=LinkType.INPUT_CALC, - link_label=singlefiledata_linklabel, - ) - - calcjob_node.base.links.add_incoming( - folderdata_node, link_type=LinkType.INPUT_CALC, link_label=folderdata_linklabel - ) - - # Attach `retrieved` outputs - if attach_outputs: - # Not storing 
gives: - # `aiida.common.exceptions.ModificationNotAllowed: Cannot store because source node of link triple - # LinkTriple(...) is not stored`` - calcjob_node.store() - - retrieved_node = FolderData() - retrieved_node.put_object_from_filelike(io.StringIO('a'), path=filename) - retrieved_node.base.links.add_incoming(calcjob_node, link_type=LinkType.CREATE, link_label='retrieved') - retrieved_node.store() - - return calcjob_node - - return _generate_calcjob_node_io - - -@pytest.fixture -def generate_workchain_node_io(): - def _generate_workchain_node_io(cj_nodes): - """Generate an instance of a `WorkChain` that contains a sub-`WorkChain` and a `CalcJob` with file io.""" - from aiida.orm import WorkChainNode - - wc_node = WorkChainNode() - wc_node_sub = WorkChainNode() - - # Add sub-workchain that calls a calcjob - wc_node_sub.base.links.add_incoming(wc_node, link_type=LinkType.CALL_WORK, link_label='sub_workchain') - for i, cj_node in enumerate(cj_nodes): - cj_node.base.links.add_incoming(wc_node_sub, link_type=LinkType.CALL_CALC, link_label=f'calcjob_{i}') - - # Need to store nodes so that the relationships are being picked up in the `get_outgoing` call (for - # `get_incoming` they work without being stored) - wc_node.store() - wc_node_sub.store() - [cj_node.store() for cj_node in cj_nodes] - - return wc_node - - return _generate_workchain_node_io + return _generate_workchain_multiply_add diff --git a/tests/test_conftest.py b/tests/test_conftest.py index 22d990d323..a5cecb406b 100644 --- a/tests/test_conftest.py +++ b/tests/test_conftest.py @@ -76,3 +76,7 @@ def test_entry_points_add_and_load(entry_points): with pytest.raises(RuntimeError, match='inline function was called'): entry_point() + +# ? +def test_generate_calculation_node(): + pass \ No newline at end of file diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index c4a46e5ef3..0f4c188fa5 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -13,34 +13,32 @@ # ? However, when one passes tmp_dir as output_path, no automatic path is created, as not at the default value is set, # ? so str(output_path) == '.' is False -import pathlib +import io import shutil from pathlib import Path import pytest -from aiida.tools.dumping.processes import ( - _calculation_node_dump, - calculation_node_inputs_dump, - generate_default_dump_path, - generate_node_input_label, - process_node_dump, - validate_make_dump_path, -) - -# Define parameters for the dumping +from aiida.common.links import LinkType +from aiida.tools.dumping.processes import ProcessDumper + +# ? 
Non-AiiDA variables filename = 'file.txt' filecontent = 'a' -raw_inputs_relpath = 'raw_inputs' -raw_outputs_relpath = 'raw_outputs' -node_inputs_relpath = 'node_inputs' -default_dump_paths = [raw_inputs_relpath, raw_outputs_relpath, node_inputs_relpath] +raw_inputs_relpath = Path('raw_inputs') +raw_outputs_relpath = Path('raw_outputs') +node_inputs_relpath = Path('extra_inputs') +default_dump_paths = [raw_inputs_relpath, node_inputs_relpath, raw_outputs_relpath] custom_dump_paths = [f'{path}_' for path in default_dump_paths] # Define some variables used for constructing the nodes used to test the dumping -singlefiledata_linklabel = 'singlefile_input' -folderdata_linklabel = 'folderdata_input' -folderdata_internal_path = 'relative_path' -folderdata_path = pathlib.Path(f'{folderdata_linklabel}/{folderdata_internal_path}') +singlefiledata_linklabel = 'singlefile' +folderdata_linklabel = 'folderdata' +folderdata_relpath = Path('relative_path') +folderdata_test_path = folderdata_linklabel / folderdata_relpath +arraydata_linklabel = 'arraydata' +node_metadata_file = '.aiida_node_metadata.yaml' + +# todo: Test for _LOGGER.info outputs # ? Move this somewhere else? @@ -57,301 +55,161 @@ def clean_tmp_path(tmp_path: Path): item.unlink() -def test_calcjob_node_inputs_dump(tmp_path, generate_calcjob_node_io): - """Test that dumping of CalcJob node inputs works correctly.""" - - calcjob_node = generate_calcjob_node_io() - dump_parent_path = tmp_path / node_inputs_relpath - - # Check the dumping results with flat=False - - # Expected tree: - # node_inputs - # ├── folderdata_input - # │ └── relative_path - # │ └── file.txt - # └── singlefile_input - # └── file.txt - - calculation_node_inputs_dump(calculation_node=calcjob_node, output_path=dump_parent_path) - assert (dump_parent_path / singlefiledata_linklabel).is_dir() - assert (dump_parent_path / singlefiledata_linklabel / filename).is_file() - assert (dump_parent_path / folderdata_path).is_dir() - assert (dump_parent_path / folderdata_path / filename).is_file() - - with open(dump_parent_path / singlefiledata_linklabel / filename, 'r') as handle: - assert handle.read() == filecontent - - with open(dump_parent_path / folderdata_path / filename, 'r') as handle: - assert handle.read() == filecontent - - # Probably not actually necessary, as in the previous step they are dumped to `node_inputs` - clean_tmp_path(tmp_path=tmp_path) - - # Check the dumping results with flat=True - - # Expected tree: - # node_inputs - # ├── file.txt - # └── relative_path - # └── file.txt - - calculation_node_inputs_dump(calculation_node=calcjob_node, output_path=dump_parent_path, flat=True) - - # Flat=True doesn't flatten nested directory structure of FolderData objects -> Leave relative path - assert (dump_parent_path / folderdata_internal_path).is_dir() - assert (dump_parent_path / folderdata_internal_path / filename).is_file() - - assert (dump_parent_path / filename).is_file() - with open(dump_parent_path / filename, 'r') as handle: - assert handle.read() == filecontent - - with open(dump_parent_path / folderdata_internal_path / filename, 'r') as handle: - assert handle.read() == filecontent - - # todo: test here with ArithmeticAdd as well - - -def test_calcjob_dump_io(generate_calcjob_node_io, tmp_path): - - dump_parent_path = tmp_path / 'cj-dump-test-io' - - # Here, check for attached `retrieved` outputs, as well - calcjob_node = generate_calcjob_node_io() - - # todo: Test for _LOGGER.info outputs - # Checking the actual content should be handled by `test_copy_tree` - # 
Not testing for the folderdata-input here, as this should be covered by `test_calcjob_node_inputs_dump` - # It is dumped to 'relative_path/file.txt' in all cases, though, but just ignore - - # Normal dumping -> node_inputs and not flat; no paths provided - - # Expected tree: - # cj-dump-test-io - # ├── node_inputs - # │ ├── folderdata_input - # │ │ └── relative_path - # │ │ └── file.txt - # │ └── singlefile_input - # │ └── file.txt - # ├── raw_inputs - # │ └── file.txt - # └── raw_outputs - # └── file.txt - - _calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path) - - assert (dump_parent_path / default_dump_paths[0] / filename).is_file() - assert (dump_parent_path / default_dump_paths[1] / filename).is_file() - assert (dump_parent_path / default_dump_paths[2] / singlefiledata_linklabel / filename).is_file() - - clean_tmp_path(tmp_path=tmp_path) - - # Normal dumping -> node_inputs and not flat; custom paths provided - - # Expected tree: - # cj-dump-test-io - # ├── node_inputs_ - # │ ├── folderdata_input - # │ │ └── relative_path - # │ │ └── file.txt - # │ └── singlefile_input - # │ └── file.txt - # ├── raw_inputs_ - # │ └── file.txt - # └── raw_outputs_ - # └── file.txt - - _calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths) - assert (dump_parent_path / custom_dump_paths[0] / filename).is_file() # raw_inputs - assert (dump_parent_path / custom_dump_paths[1] / filename).is_file() # raw_outputs - # node_inputs, singlefile - assert (dump_parent_path / custom_dump_paths[2] / singlefiledata_linklabel / filename).is_file() - - clean_tmp_path(tmp_path=tmp_path) - - # Flat dumping -> no paths provided -> Default paths should not be existent. Internal FolderData structure retained. - # Expected tree: - # cj-dump-test-io - # ├── file.txt - # └── relative_path - # └── file.txt - - _calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, flat=True) - assert not (dump_parent_path / default_dump_paths[0] / filename).is_file() # raw_inputs - assert not (dump_parent_path / default_dump_paths[1] / filename).is_file() # raw_outputs - assert not (dump_parent_path / default_dump_paths[2] / filename).is_file() # node_inputs, singlefile - # Here, the same file will be written by raw_inputs and raw_outputs and node_inputs - # So it should only be present in the parent dump directory - assert (dump_parent_path / filename).is_file() - - clean_tmp_path(tmp_path=tmp_path) - - # Flat dumping -> node_inputs and custom paths provided -> Test in custom paths, - # But no subdirectories named after the link-labels under `node_inputs_` - # Expected path: - # cj-dump-test-io - # ├── node_inputs_ - # │ ├── file.txt - # │ └── relative_path - # │ └── file.txt - # ├── raw_inputs_ - # │ └── file.txt - # └── raw_outputs_ - # └── file.txt +# Helper methods to generate the actual `WorkChain`s and `CalcJob`s used for testing +@pytest.fixture +def generate_calculation_node_io(generate_calculation_node): + def _generate_calculation_node_io(entry_point: str | None = None, attach_outputs: bool = True): + import numpy as np + from aiida.orm import ArrayData, FolderData, SinglefileData + + calculation_repository = (io.StringIO(filecontent), filename) + singlefiledata_input = SinglefileData.from_string(content=filecontent, filename=filename) + # ? 
Use instance for folderdata + folderdata = FolderData() + folderdata.put_object_from_filelike(handle=io.StringIO(filecontent), path=folderdata_relpath / filename) + arraydata_input = ArrayData(arrays=np.ones(3)) + + # Create calculation inputs, outputs + calculation_node_inputs = { + singlefiledata_linklabel: singlefiledata_input, + folderdata_linklabel: folderdata, + arraydata_linklabel: arraydata_input, + # todo: also add some of the other AiiDA nodes, like Int here + } + + singlefiledata_output = singlefiledata_input.clone() + folderdata_output = folderdata.clone() + + if attach_outputs: + calculation_outputs = { + folderdata_linklabel: folderdata_output, + singlefiledata_linklabel: singlefiledata_output, + # todo: also add some of the other AiiDA nodes, like Int here + } + else: + calculation_outputs = None - # todo: Test case of splitting the nested node_inputs based on double-underscore splitting not covered with the test - # todo: setup. This might be again too specific for QE? - _calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths, flat=True) - assert (dump_parent_path / custom_dump_paths[0] / filename).is_file() # raw_inputs - assert (dump_parent_path / custom_dump_paths[1] / filename).is_file() # raw_outputs - assert (dump_parent_path / custom_dump_paths[2] / filename).is_file() # node_inputs, singlefile + calculation_node = generate_calculation_node( + repository=calculation_repository, + inputs=calculation_node_inputs, + outputs=calculation_outputs, + entry_point=entry_point, + ) + return calculation_node - clean_tmp_path(tmp_path=tmp_path) + return _generate_calculation_node_io - # Don't dump the connected node inputs for both, flat is True/False - _calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, include_inputs=False) - assert not (dump_parent_path / custom_dump_paths[2]).is_dir() - clean_tmp_path(tmp_path=tmp_path) +@pytest.fixture +def generate_workchain_node_io(): + def _generate_workchain_node_io(cj_nodes, store_all: bool = True): + """Generate an instance of a `WorkChain` that contains a sub-`WorkChain` and a `Calculation` with file io.""" + from aiida.orm import WorkflowNode - _calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, include_inputs=False, flat=True) - assert not (dump_parent_path / custom_dump_paths[2]).is_dir() + wc_node = WorkflowNode() + wc_node_sub = WorkflowNode() - clean_tmp_path(tmp_path=tmp_path) + # Add sub-workchain that calls a calculation + wc_node_sub.base.links.add_incoming(wc_node, link_type=LinkType.CALL_WORK, link_label='sub_workflow') + for cj_node in cj_nodes: + cj_node.base.links.add_incoming(wc_node_sub, link_type=LinkType.CALL_CALC, link_label='calculation') - # Check that it fails when it tries to create the same directory without overwrite=True - _calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, overwrite=False) - with pytest.raises(FileExistsError): - _calculation_node_dump(calcjob_node=calcjob_node, output_path=dump_parent_path, overwrite=False) + # Need to store so that outputs are being dumped + if store_all: + wc_node.store() + wc_node_sub.store() + [cj_node.store() for cj_node in cj_nodes] + return wc_node -def test_calcjob_dump_arithmetic_add(tmp_path, aiida_localhost, generate_arithmetic_add_node): - dump_path = tmp_path / 'calcjob_dump_arithmetic_add' + return _generate_workchain_node_io - # Now, generate `CalcJobNode` from ArithmeticAddCalculation - add_node = 
generate_arithmetic_add_node(computer=aiida_localhost) - # Normal dumping of ArithmeticAddCalculation node - _calculation_node_dump(calcjob_node=add_node, output_path=dump_path) - - raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] - raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] - raw_input_files = [dump_path / default_dump_paths[0] / raw_input_file for raw_input_file in raw_input_files] - raw_output_files = [dump_path / default_dump_paths[1] / raw_output_file for raw_output_file in raw_output_files] - - assert all([raw_input_file.is_file() for raw_input_file in raw_input_files]) - assert all([raw_output_file.is_file() for raw_output_file in raw_output_files]) - - clean_tmp_path(tmp_path=tmp_path) - - -def test_process_dump_io(generate_calcjob_node_io, generate_workchain_node_io, tmp_path): +# Tests for entry-point dump method +def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path): # Expected tree: # wc-dump-test-io - # └── 01-sub_workchain - # └── 01-calcjob - # ├── node_inputs - # │ ├── folderdata_input + # └── 01-sub_workflow + # └── 01-calculation + # ├── extra_inputs + # │ ├── folderdata # │ │ └── relative_path # │ │ └── file.txt - # │ └── singlefile_input + # │ └── singlefile # │ └── file.txt # └── raw_inputs # └── file.txt - # Don't attach outputs, as this would require storing the calcjob_node, and it cannot be added. Dumping of outputs - # should be taken care of by `test_calcjob_dump` - cj_node = generate_calcjob_node_io(attach_outputs=False) - wc_node = generate_workchain_node_io(cj_nodes=[cj_node]) - # Need to generate parent path for dumping, as I don't want the sub-workchains to be dumped directly into `tmp_path` - # Other option would be to cd into `tmp_path` and then letting the default label be created dump_parent_path = tmp_path / 'wc-dump-test-io' + process_dumper = ProcessDumper() + # Don't attach outputs, as it would require storing the calculation_node and then it cannot be used in the workchain + # Need to generate two instances rather than [...] 
* 2, otherwise same instance twice in list + cj_nodes = [generate_calculation_node_io(attach_outputs=False), generate_calculation_node_io(attach_outputs=False)] + wc_node = generate_workchain_node_io(cj_nodes=cj_nodes) + process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) # Don't test for `README` here, as this is only created when dumping is done via `verdi` - raw_input_path = '01-sub_workchain/01-calcjob_0/raw_inputs/file.txt' - singlefiledata_path = '01-sub_workchain/01-calcjob_0/node_inputs/singlefile_input/file.txt' - folderdata_path = '01-sub_workchain/01-calcjob_0/node_inputs/folderdata_input/relative_path/file.txt' + # But, check for aiida_node_metadata.yaml + raw_input_path = '01-sub_workflow/01-calculation/raw_inputs/file.txt' + singlefiledata_path = '01-sub_workflow/01-calculation/extra_inputs/singlefile/file.txt' + folderdata_path = '01-sub_workflow/01-calculation/extra_inputs/folderdata/relative_path/file.txt' node_metadata_paths = [ - '.aiida_node_metadata.yaml', - '01-sub_workchain/.aiida_node_metadata.yaml', - '01-sub_workchain/01-calcjob_0/.aiida_node_metadata.yaml', + node_metadata_file, + f'01-sub_workflow/{node_metadata_file}', + f'01-sub_workflow/01-calculation/{node_metadata_file}', + f'01-sub_workflow/02-calculation/{node_metadata_file}', ] expected_files = [raw_input_path, singlefiledata_path, folderdata_path, *node_metadata_paths] expected_files = [dump_parent_path / expected_file for expected_file in expected_files] - process_node_dump(process_node=wc_node, output_path=dump_parent_path) - assert all([expected_file.is_file() for expected_file in expected_files]) - clean_tmp_path(tmp_path=dump_parent_path) - # Check directory tree when flat=True +def test_dump_flat(generate_calculation_node_io, generate_workchain_node_io, tmp_path): # Expected tree: - # wc-dump-test-io - # ├── file.txt - # └── relative_path - # └── file.txt - - process_node_dump(process_node=wc_node, output_path=dump_parent_path, flat=True) - assert (dump_parent_path / filename).is_file() - # Internal hierarchy of the FolderData is retained - assert (dump_parent_path / folderdata_internal_path / filename).is_file() + # wc-dump-test-io-flat + # └── 01-sub_workflow + # ├── 01-calculation + # │ ├── file.txt + # │ └── relative_path + # │ └── file.txt + # └── 02-calculation + # ├── default.npy + # ├── file.txt + # └── relative_path + # └── file.txt - clean_tmp_path(tmp_path=dump_parent_path) - - # Check that dumping fails if multiple CalcJobs run by the workchain if flat=True - cj_nodes = [generate_calcjob_node_io(attach_outputs=False), generate_calcjob_node_io(attach_outputs=False)] + # Need to generate parent path for dumping, as I don't want the sub-workchains to be dumped directly into `tmp_path` + dump_parent_path = tmp_path / 'wc-dump-test-io-flat' + process_dumper = ProcessDumper(flat=True) + # Don't attach outputs, as it would require storing the calculation_node and then it cannot be used in the workchain + # Need to generate two instances rather than [...] 
* 2, otherwise same instance twice in list + cj_nodes = [generate_calculation_node_io(attach_outputs=False), generate_calculation_node_io(attach_outputs=False)] wc_node = generate_workchain_node_io(cj_nodes=cj_nodes) - with pytest.raises(NotImplementedError): - process_node_dump(process_node=wc_node, output_path=dump_parent_path, flat=True) - - -def test_process_dump_multiply_add(tmp_path, generate_multiply_add_node, aiida_localhost): + process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) - # Testing for files in hidden .aiida folder here, but not in more complex io functions - dump_parent_path = tmp_path / 'multiply_add-dump-test' - - multiply_add_node = generate_multiply_add_node(computer=aiida_localhost) - - # Dump with flat=True - # Expected tree: - # multiply_add-dump-test - # ├── .aiida - # │ ├── calcinfo.json - # │ └── job_tmpl.json - # ├── .aiida_node_metadata.yaml - # ├── _aiidasubmit.sh - # ├── _scheduler-stderr.txt - # ├── _scheduler-stdout.txt - # ├── aiida.in - # ├── aiida.out - #!└── source_file missing - - process_node_dump(process_node=multiply_add_node, output_path=dump_parent_path, flat=True) - - raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] - raw_input_files += ['source_file'] - raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] - raw_input_files = [dump_parent_path / raw_input_file for raw_input_file in raw_input_files] - raw_output_files = [dump_parent_path / raw_output_file for raw_output_file in raw_output_files] + # Don't test for `README` here, as this is only created when dumping is done via `verdi` + # But, check for aiida_node_metadata.yaml + raw_input_path = '01-sub_workflow/01-calculation/file.txt' + folderdata_path = '01-sub_workflow/01-calculation/relative_path/file.txt' + node_metadata_paths = [ + node_metadata_file, + f'01-sub_workflow/{node_metadata_file}', + f'01-sub_workflow/01-calculation/{node_metadata_file}', + f'01-sub_workflow/02-calculation/{node_metadata_file}', + ] - print(multiply_add_node.called_descendants) - print(multiply_add_node.called_descendants[0]) - print(type(multiply_add_node.called_descendants[0])) - print(multiply_add_node.called_descendants[0].base.repository.list_objects()) - print(multiply_add_node.called_descendants[0].base.repository.list_object_names()) + expected_files = [raw_input_path, folderdata_path, *node_metadata_paths] + expected_files = [dump_parent_path / expected_file for expected_file in expected_files] - # ! source_file is missing -> Why? - assert all([raw_input_file.is_file() for raw_input_file in raw_input_files]) - assert all([raw_output_file.is_file() for raw_output_file in raw_output_files]) + assert all([expected_file.is_file() for expected_file in expected_files]) - clean_tmp_path(tmp_path=tmp_path) +def test_dump_multiply_add(tmp_path, generate_workchain_multiply_add): # Dump with flat=False # Expected tree: # multiply_add-dump-test @@ -369,18 +227,22 @@ def test_process_dump_multiply_add(tmp_path, generate_multiply_add_node, aiida_l # ├── _scheduler-stderr.txt # ├── _scheduler-stdout.txt # └── aiida.out - - process_node_dump(process_node=multiply_add_node, output_path=dump_parent_path) + dump_parent_path = tmp_path / 'wc-dump-test-multiply-add' + process_dumper = ProcessDumper() + # Don't attach outputs, as it would require storing the calculation_node and then it cannot be used in the workchain + # Need to generate two instances rather than [...] 
* 2, otherwise same instance twice in list + wc_node = generate_workchain_multiply_add() + process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] raw_input_files = [ - dump_parent_path / '02-ArithmeticAddCalculation' / default_dump_paths[0] / raw_input_file + dump_parent_path / '02-ArithmeticAddCalculation' / raw_inputs_relpath / raw_input_file for raw_input_file in raw_input_files ] - raw_input_files += [dump_parent_path / '01-multiply' / default_dump_paths[0] / 'source_file'] + raw_input_files += [dump_parent_path / '01-multiply' / raw_inputs_relpath / 'source_file'] raw_output_files = [ - dump_parent_path / '02-ArithmeticAddCalculation' / default_dump_paths[1] / raw_output_file + dump_parent_path / '02-ArithmeticAddCalculation' / raw_outputs_relpath / raw_output_file for raw_output_file in raw_output_files ] # No node_inputs contained in MultiplyAddWorkChain @@ -389,64 +251,211 @@ def test_process_dump_multiply_add(tmp_path, generate_multiply_add_node, aiida_l assert all([raw_output_file.is_file() for raw_output_file in raw_output_files]) +def test_dump_multiply_add_flat(tmp_path, generate_workchain_multiply_add): + # Expected tree: + # cj-dump-test-add + # ├── 01-multiply + # │ └── source_file + # └── 02-ArithmeticAddCalculation + # ├── _aiidasubmit.sh + # ├── _scheduler-stderr.txt + # ├── _scheduler-stdout.txt + # ├── aiida.in + # └── aiida.out + + dump_parent_path = tmp_path / 'cj-dump-test-add' + process_dumper = ProcessDumper(flat=True) + calculation_node_add = generate_workchain_multiply_add() + process_dumper.dump(process_node=calculation_node_add, output_path=dump_parent_path) + + multiply_file = dump_parent_path / '01-multiply' / 'source_file' + arithmetic_add_files = [ + '_aiidasubmit.sh', + 'aiida.in', + '.aiida/job_tmpl.json', + '.aiida/calcinfo.json', + '_scheduler-stderr.txt', + '_scheduler-stdout.txt', + 'aiida.out', + ] + # raw_input_files += ['source_file'] + arithmetic_add_files = [ + dump_parent_path / '02-ArithmeticAddCalculation' / arithmetic_add_file + for arithmetic_add_file in arithmetic_add_files + ] + assert all([expected_file.is_file() for expected_file in arithmetic_add_files]) + assert multiply_file.is_file() -def test_generate_default_dump_path(generate_arithmetic_add_node, generate_multiply_add_node, aiida_localhost): - add_node = generate_arithmetic_add_node(computer=aiida_localhost) - multiply_add_node = generate_multiply_add_node(computer=aiida_localhost) - add_path = generate_default_dump_path(process_node=add_node) - multiply_add_path = generate_default_dump_path(process_node=multiply_add_node) +# Tests for dump_calculation method +def test_dump_calculation_node(tmp_path, generate_calculation_node_io): + # Checking the actual content should be handled by `test_copy_tree` + # Normal dumping -> node_inputs and not flat; no paths provided + # Expected tree: + # cj-dump-test-io + # ├── extra_inputs + # │ ├── folderdata + # │ │ └── relative_path + # │ │ └── file.txt + # │ └── singlefile + # │ └── file.txt + # ├── raw_inputs + # │ └── file.txt + # └── raw_outputs + # ├── folderdata + # │ └── relative_path + # │ └── file.txt + # └── singlefile + # └── file.txt - # ? Possible to reset db here to make pks reproducible? 
- assert str(add_path) == f'dump-ArithmeticAddCalculation-{add_node.pk}' - assert str(multiply_add_path) == f'dump-MultiplyAddWorkChain-{multiply_add_node.pk}' + dump_parent_path = tmp_path / 'cj-dump-test-io' + process_dumper = ProcessDumper() + calculation_node = generate_calculation_node_io() + process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) + + assert (dump_parent_path / raw_inputs_relpath / filename).is_file() + assert (dump_parent_path / node_inputs_relpath / singlefiledata_linklabel / filename).is_file() + assert (dump_parent_path / node_inputs_relpath / folderdata_test_path / filename).is_file() + assert (dump_parent_path / raw_outputs_relpath / singlefiledata_linklabel / filename).is_file() + assert (dump_parent_path / raw_outputs_relpath / folderdata_test_path / filename).is_file() + + # Check contents once + with open(dump_parent_path / raw_inputs_relpath / filename, 'r') as handle: + assert handle.read() == filecontent + with open(dump_parent_path / node_inputs_relpath / singlefiledata_linklabel / filename) as handle: + assert handle.read() == filecontent + with open(dump_parent_path / node_inputs_relpath / folderdata_test_path / filename) as handle: + assert handle.read() == filecontent + with open(dump_parent_path / raw_outputs_relpath / singlefiledata_linklabel / filename) as handle: + assert handle.read() == filecontent + with open(dump_parent_path / raw_outputs_relpath / folderdata_test_path / filename) as handle: + assert handle.read() == filecontent - # todo: test for io_function? +# ? Could probably be removed when the mapping is tested properly +def test_dump_calculation_custom(tmp_path, generate_calculation_node_io): + # Normal dumping -> node_inputs and not flat; custom paths provided + # Expected tree: + # cj-dump-test-io + # ├── extra_inputs_ + # │ ├── folderdata + # │ │ └── relative_path + # │ │ └── file.txt + # │ └── singlefile + # │ └── file.txt + # ├── raw_inputs_ + # │ └── file.txt + # └── raw_outputs_ + # ├── folderdata + # │ └── relative_path + # │ └── file.txt + # └── singlefile + # └── file.txt + + dump_parent_path = tmp_path / 'cj-dump-test-custom' + process_dumper = ProcessDumper() + calculation_node = generate_calculation_node_io() + dump_parent_path = tmp_path / 'cj-dump-test-custom' + process_dumper._dump_calculation( + calculation_node=calculation_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths + ) + + assert (dump_parent_path / custom_dump_paths[0] / filename).is_file() + assert (dump_parent_path / custom_dump_paths[1] / singlefiledata_linklabel / filename).is_file() + assert (dump_parent_path / custom_dump_paths[1] / folderdata_test_path / filename).is_file() + assert (dump_parent_path / custom_dump_paths[2] / singlefiledata_linklabel / filename).is_file() + assert (dump_parent_path / custom_dump_paths[2] / folderdata_test_path / filename).is_file() -def test_generate_node_input_label( - generate_multiply_add_node, generate_calcjob_node_io, generate_workchain_node_io, aiida_localhost -): - # Check with manually constructed, more complex workchain - cj_node = generate_calcjob_node_io(attach_outputs=False) - wc_node = generate_workchain_node_io(cj_nodes=[cj_node]) - wc_output_triples = wc_node.base.links.get_outgoing().all() - sub_wc_node = wc_output_triples[0].node - output_triples = wc_output_triples + sub_wc_node.base.links.get_outgoing().all() +def test_dump_calculation_flat(tmp_path, generate_calculation_node_io): + # Flat dumping -> no paths provided -> Default paths should 
not be existent. + # Internal FolderData structure retained. + # Expected tree: + # cj-dump-test-io + # ├── file.txt + # └── relative_path + # └── file.txt - output_labels = sorted([generate_node_input_label(_, output_node) for _, output_node in enumerate(output_triples)]) - assert output_labels == ['00-sub_workchain', '01-calcjob_0'] + dump_parent_path = tmp_path / 'cj-dump-test-custom' + process_dumper = ProcessDumper(flat=True) + calculation_node = generate_calculation_node_io() + process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) - # Check with multiply_add workchain node - multiply_add_node = generate_multiply_add_node(computer=aiida_localhost) - output_triples = multiply_add_node.base.links.get_outgoing().all() - output_labels = sorted([generate_node_input_label(_, output_node) for _, output_node in enumerate(output_triples)]) - assert output_labels == ['00-multiply', '01-ArithmeticAddCalculation', '02-result'] + # Here, the same file will be written by raw_inputs and raw_outputs and node_inputs + # So it should only be present in the parent dump directory + assert not (dump_parent_path / raw_inputs_relpath).is_dir() + assert not (dump_parent_path / node_inputs_relpath).is_dir() + assert not (dump_parent_path / raw_outputs_relpath).is_dir() + assert (dump_parent_path / filename).is_file() + assert (dump_parent_path / folderdata_relpath / filename).is_file() -def test_validate_make_dump_path(chdir_tmp_path, tmp_path): +def test_dump_calculation_overwrite(tmp_path, generate_calculation_node_io): + dump_parent_path = tmp_path / 'cj-dump-test-overwrite' + process_dumper = ProcessDumper(overwrite=False) + calculation_node = generate_calculation_node_io() + process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) + with pytest.raises(FileExistsError): + process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) + + +def test_dump_calculation_no_inputs(tmp_path, generate_calculation_node_io): + dump_parent_path = tmp_path / 'cj-dump-test-noinputs' + process_dumper = ProcessDumper(include_node_inputs=False) + calculation_node = generate_calculation_node_io() + process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) + assert not (dump_parent_path / node_inputs_relpath).is_dir() + + +def test_dump_calculation_all_aiida_nodes(tmp_path, generate_calculation_node_io): + dump_parent_path = tmp_path / 'cj-dump-test-allaiidanodes' + process_dumper = ProcessDumper(all_aiida_nodes=True) + calculation_node = generate_calculation_node_io() + process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) + assert (dump_parent_path / node_inputs_relpath / '.aiida_nodes' / arraydata_linklabel / 'default.npy').is_file() + + +def test_dump_calculation_add(tmp_path, generate_calculation_node_add): + dump_parent_path = tmp_path / 'cj-dump-test-add' + process_dumper = ProcessDumper(all_aiida_nodes=True) + calculation_node_add = generate_calculation_node_add() + process_dumper._dump_calculation(calculation_node=calculation_node_add, output_path=dump_parent_path) + + raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] + raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] + raw_input_files = [dump_parent_path / raw_inputs_relpath / raw_input_file for raw_input_file in raw_input_files] + raw_output_files = [ + dump_parent_path / 
raw_outputs_relpath / raw_output_file for raw_output_file in raw_output_files + ] + + assert all([raw_input_file.is_file() for raw_input_file in raw_input_files]) + assert all([raw_output_file.is_file() for raw_output_file in raw_output_files]) + + +# Tests for helper methods +def test_validate_make_dump_path(chdir_tmp_path, tmp_path): chdir_tmp_path test_dir = Path('test-dir') test_dir_abs = tmp_path / test_dir - safeguard_file = '.aiida_node_metadata.yaml' + safeguard_file = node_metadata_file # Path must be provided + process_dumper = ProcessDumper() with pytest.raises(TypeError): - validate_make_dump_path() + process_dumper.validate_make_dump_path() # Check if path created if non-existent - output_path = validate_make_dump_path(path=test_dir) + output_path = process_dumper.validate_make_dump_path(validate_path=test_dir) assert output_path == test_dir_abs clean_tmp_path(tmp_path=tmp_path) # Empty path is fine -> No error and full path returned test_dir_abs.mkdir() - output_path = validate_make_dump_path(path=test_dir) + output_path = process_dumper.validate_make_dump_path(validate_path=test_dir) assert output_path == test_dir_abs clean_tmp_path(tmp_path=tmp_path) @@ -455,16 +464,17 @@ def test_validate_make_dump_path(chdir_tmp_path, tmp_path): test_dir_abs.mkdir() (test_dir_abs / filename).touch() with pytest.raises(FileExistsError): - output_path = validate_make_dump_path(path=test_dir) + output_path = process_dumper.validate_make_dump_path(validate_path=test_dir) assert (test_dir_abs / filename).is_file() clean_tmp_path(tmp_path=tmp_path) + process_dumper = ProcessDumper(overwrite=True) # Fails if directory not empty and overwrite set to True, but safeguard_file not found (for safety reasons) test_dir_abs.mkdir() (test_dir_abs / filename).touch() with pytest.raises(FileExistsError): - output_path = validate_make_dump_path(path=test_dir, overwrite=True) + output_path = process_dumper.validate_make_dump_path(validate_path=test_dir) assert (test_dir_abs / filename).is_file() clean_tmp_path(tmp_path=tmp_path) @@ -472,10 +482,150 @@ def test_validate_make_dump_path(chdir_tmp_path, tmp_path): # Works if directory not empty, but overwrite=True and safeguard_file (e.g. 
`.aiida_node_metadata.yaml`) contained test_dir_abs.mkdir() (test_dir_abs / safeguard_file).touch() - output_path = validate_make_dump_path(path=test_dir, overwrite=True, safeguard_file=safeguard_file) + output_path = process_dumper.validate_make_dump_path(validate_path=test_dir, safeguard_file=safeguard_file) assert output_path == test_dir_abs assert not (test_dir_abs / safeguard_file).is_file() -def test_dump_yaml(): - assert False +def test_generate_default_dump_path( + generate_calculation_node_add, + generate_workchain_multiply_add, +): + process_dumper = ProcessDumper() + add_node = generate_calculation_node_add() + multiply_add_node = generate_workchain_multiply_add() + add_path = process_dumper.generate_default_dump_path(process_node=add_node) + multiply_add_path = process_dumper.generate_default_dump_path(process_node=multiply_add_node) + + assert str(add_path) == f'dump-ArithmeticAddCalculation-{add_node.pk}' + assert str(multiply_add_path) == f'dump-MultiplyAddWorkChain-{multiply_add_node.pk}' + + +def test_generate_calculation_io_mapping(): + process_dumper = ProcessDumper() + + calculation_io_mapping = process_dumper.generate_calculation_io_mapping() + assert calculation_io_mapping.repository == 'raw_inputs' + assert calculation_io_mapping.inputs == 'extra_inputs' + assert calculation_io_mapping.outputs == 'raw_outputs' + + calculation_io_mapping = process_dumper.generate_calculation_io_mapping(io_dump_paths=custom_dump_paths) + assert calculation_io_mapping.repository == 'raw_inputs_' + assert calculation_io_mapping.inputs == 'extra_inputs_' + assert calculation_io_mapping.outputs == 'raw_outputs_' + + +def test_generate_link_triple_dump_path(generate_calculation_node_io, generate_workchain_node_io, tmp_path): + # Need to construct WorkChain, as the path naming is based on `LinkTriple`s + cj_node = generate_calculation_node_io(attach_outputs=False) + wc_node = generate_workchain_node_io(cj_nodes=[cj_node]) + wc_output_triples = wc_node.base.links.get_outgoing().all() + sub_wc_node = wc_output_triples[0].node + output_triples = wc_output_triples + sub_wc_node.base.links.get_outgoing().all() + + process_dumper = ProcessDumper() + + output_paths = [ + tmp_path / process_dumper.generate_link_triple_dump_path(link_triple=output_triple, parent_path=tmp_path) + for output_triple in output_triples + ] + # 'sub_workflow' doesn't have a repository, so it is placed under '.aiida_nodes' + assert output_paths == [tmp_path / 'sub_workflow', tmp_path / 'calculation'] + + +def test_generate_child_node_label( + generate_workchain_multiply_add, generate_calculation_node_io, generate_workchain_node_io +): + # Check with manually constructed, more complex workchain + cj_node = generate_calculation_node_io(attach_outputs=False) + wc_node = generate_workchain_node_io(cj_nodes=[cj_node]) + wc_output_triples = wc_node.base.links.get_outgoing().all() + sub_wc_node = wc_output_triples[0].node + + output_triples = wc_output_triples + sub_wc_node.base.links.get_outgoing().all() + + process_dumper = ProcessDumper() + + output_paths = sorted( + [ + process_dumper.generate_child_node_label(index, output_node) + for index, output_node in enumerate(output_triples) + ] + ) + assert output_paths == ['00-sub_workflow', '01-calculation'] + + # Check with multiply_add workchain node + multiply_add_node = generate_workchain_multiply_add() + output_triples = multiply_add_node.base.links.get_outgoing().all() + output_paths = sorted( + [ + process_dumper.generate_child_node_label(_, output_node) + for _, output_node 
in enumerate(output_triples) + ] + ) + print(output_paths) + assert output_paths == ['00-multiply', '01-ArithmeticAddCalculation', '02-result'] + + +def test_dump_node_yaml(generate_calculation_node_io, tmp_path, generate_workchain_multiply_add): + + process_dumper = ProcessDumper() + cj_node = generate_calculation_node_io(attach_outputs=False) + process_dumper.dump_node_yaml(process_node=cj_node, output_path=tmp_path) + + assert (tmp_path / node_metadata_file).is_file() + + # Test with multiply_add + wc_node = generate_workchain_multiply_add() + process_dumper.dump_node_yaml(process_node=wc_node, output_path=tmp_path) + + assert (tmp_path / node_metadata_file).is_file() + + # Open the dumped YAML file and read its contents + with open(tmp_path / node_metadata_file, 'r') as dumped_file: + contents = dumped_file.read() + + # Check if contents as expected + assert 'Node data:' in contents + assert 'User data:' in contents + # Computer is None for the locally run MultiplyAdd + assert 'Computer data:' not in contents + assert 'Node attributes:' in contents + assert 'Node extras:' in contents + + process_dumper = ProcessDumper(include_attributes=False, include_extras=False) + + process_dumper.dump_node_yaml(process_node=wc_node, output_path=tmp_path) + + # Open the dumped YAML file and read its contents + with open(tmp_path / node_metadata_file, 'r') as dumped_file: + contents = dumped_file.read() + + # Check if contents as expected -> No attributes and extras + assert 'Node data:' in contents + assert 'User data:' in contents + # Computer is None for the locally run MultiplyAdd + assert 'Computer data:' not in contents + assert 'Node attributes:' not in contents + assert 'Node extras:' not in contents + + +def test_generate_parent_readme(tmp_path, generate_workchain_multiply_add): + + wc_node = generate_workchain_multiply_add() + process_dumper = ProcessDumper(parent_process=wc_node, parent_path=tmp_path) + + process_dumper.generate_parent_readme() + + assert (tmp_path / 'README').is_file() + + with open(tmp_path / 'README', 'r') as dumped_file: + contents = dumped_file.read() + + assert 'This directory contains' in contents + assert '`MultiplyAddWorkChain' in contents + assert 'ArithmeticAddCalculation' in contents + # Check for outputs of `verdi process status/report/show` + assert 'Finished [0] [3:result]' in contents + assert 'Property Value' in contents + assert 'No log messages' in contents From fc3c181e24a64c8a27e35cc1a64a960e74e050c0 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Mon, 29 Apr 2024 16:51:36 +0200 Subject: [PATCH 17/30] Add documentation in `How to work with data` --- docs/source/howto/data.rst | 64 +++++++++++++++++++++++++++ tests/tools/dumping/test_processes.py | 37 ++++++---------- 2 files changed, 78 insertions(+), 23 deletions(-) diff --git a/docs/source/howto/data.rst b/docs/source/howto/data.rst index 504192cfda..e3bfcd3051 100644 --- a/docs/source/howto/data.rst +++ b/docs/source/howto/data.rst @@ -78,6 +78,70 @@ Ways to find and retrieve data that have previously been imported are described If none of the currently available data types, as listed by ``verdi plugin list``, seem to fit your needs, you can also create your own custom type. For details refer to the next section :ref:`"How to add support for custom data types"`. +.. _how-to:data:dump: + +Dumping data to disk +-------------------- + +.. versionadded:: 2.6 + +It is now possible to dump your executed workflows to disk in a hierarchical directory tree structure. 
This can be
+particularly useful if one is not yet familiar with the ``QueryBuilder`` or wants to quickly explore input/output files
+using existing shell scripts or common terminal utilities, such as ``grep``. The dumping can be achieved with the command:
+
+.. code-block:: shell
+
+    verdi process dump <pk>
+
+For our beloved ``MultiplyAddWorkChain``, we obtain the following:
+
+.. code-block:: shell
+
+    $ verdi process dump <pk> -p dump-multiply_add
+    Success: Raw files for WorkChainNode <pk> dumped successfully in `dump-multiply_add`.
+
+.. code-block:: shell
+
+    $ tree -a dump-multiply_add
+    dump-multiply_add
+    ├── README
+    ├── .aiida_node_metadata.yaml
+    ├── 01-multiply
+    │   ├── .aiida_node_metadata.yaml
+    │   └── raw_inputs
+    │       └── source_file
+    └── 02-ArithmeticAddCalculation
+        ├── .aiida_node_metadata.yaml
+        ├── raw_inputs
+        │   ├── .aiida
+        │   │   ├── calcinfo.json
+        │   │   └── job_tmpl.json
+        │   ├── _aiidasubmit.sh
+        │   └── aiida.in
+        └── raw_outputs
+            ├── _scheduler-stderr.txt
+            ├── _scheduler-stdout.txt
+            └── aiida.out
+
+The ``README`` file provides a description of the directory structure, as well as useful information about the top-level
+process. Further, numbered subdirectories are created for each step of the workflow, resulting in the ``01-multiply``
+and ``02-ArithmeticAddCalculation`` folders. The raw calculation input and output files ``aiida.in`` and ``aiida.out``
+of the ``ArithmeticAddCalculation`` are placed in ``raw_inputs`` and ``raw_outputs``. In addition, these also contain
+the submission script ``_aiidasubmit.sh``, as well as the scheduler stdout and stderr, ``_scheduler-stdout.txt`` and
+``_scheduler-stderr.txt``, respectively. Lastly, the source code of the ``multiply`` ``calcfunction`` representing the
+first step of the workflow is contained in the ``source_file``.
+
+Upon having a closer look at the directory, we also find the hidden ``.aiida_node_metadata.yaml`` files, which are
+created for every ``ProcessNode`` and contain additional information about the ``Node``, the ``User``, and the
+``Computer``, as well as the ``.aiida`` subdirectory with machine-readable AiiDA-internal data in JSON format.
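+
+The same dump can also be obtained programmatically. The following is a minimal sketch of the Python API behind the
+command (for illustration only: it assumes the ``ProcessDumper`` signature as introduced in this patch, which later
+commits of this series still refactor, and ``1234`` is a placeholder PK):
+
+.. code-block:: python
+
+    from pathlib import Path
+
+    from aiida.orm import load_node
+    from aiida.tools.dumping.processes import ProcessDumper
+
+    # Placeholder PK of the `MultiplyAddWorkChain` run above
+    workchain_node = load_node(1234)
+
+    # The keyword arguments mirror the command line flags
+    process_dumper = ProcessDumper(include_node_inputs=True, overwrite=True, flat=False)
+    process_dumper.dump(process_node=workchain_node, output_path=Path('dump-multiply_add'))
+
+Since subprocesses are explored recursively, arbitrarily complex, nested workflows can be dumped. As already seen above,
+the ``-p`` flag allows to specify a custom dumping path. If none is provided, it is automatically generated from the
+``process_label`` (or ``process_type``) and the ``pk``. In addition, the command provides the ``-o`` flag to overwrite
+existing directories, the ``-a`` flag to dump further, non file-based AiiDA nodes (in hidden, ``.aiida_nodes``
+subdirectories), and the ``--include-inputs`` (``--exclude-exputs``) flags to also dump additional node inputs of each
+``CalculationNode`` of the workflow. For a full list of available options, call :code:`verdi process dump --help`.
+Happy dumping!
 
 .. 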
_how-to:data:import:provenance: diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index 0f4c188fa5..bb46ede57e 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -55,7 +55,7 @@ def clean_tmp_path(tmp_path: Path): item.unlink() -# Helper methods to generate the actual `WorkChain`s and `CalcJob`s used for testing +# Helper functions to generate the actual `WorkChain`s and `CalcJob`s used for testing @pytest.fixture def generate_calculation_node_io(generate_calculation_node): def _generate_calculation_node_io(entry_point: str | None = None, attach_outputs: bool = True): @@ -66,7 +66,7 @@ def _generate_calculation_node_io(entry_point: str | None = None, attach_outputs singlefiledata_input = SinglefileData.from_string(content=filecontent, filename=filename) # ? Use instance for folderdata folderdata = FolderData() - folderdata.put_object_from_filelike(handle=io.StringIO(filecontent), path=folderdata_relpath / filename) + folderdata.put_object_from_filelike(handle=io.StringIO(filecontent), path=str(folderdata_relpath / filename)) # type: ignore[arg-type] arraydata_input = ArrayData(arrays=np.ones(3)) # Create calculation inputs, outputs @@ -74,7 +74,6 @@ def _generate_calculation_node_io(entry_point: str | None = None, attach_outputs singlefiledata_linklabel: singlefiledata_input, folderdata_linklabel: folderdata, arraydata_linklabel: arraydata_input, - # todo: also add some of the other AiiDA nodes, like Int here } singlefiledata_output = singlefiledata_input.clone() @@ -84,7 +83,6 @@ def _generate_calculation_node_io(entry_point: str | None = None, attach_outputs calculation_outputs = { folderdata_linklabel: folderdata_output, singlefiledata_linklabel: singlefiledata_output, - # todo: also add some of the other AiiDA nodes, like Int here } else: calculation_outputs = None @@ -127,7 +125,6 @@ def _generate_workchain_node_io(cj_nodes, store_all: bool = True): # Tests for entry-point dump method def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path): - # Expected tree: # wc-dump-test-io # └── 01-sub_workflow @@ -169,19 +166,18 @@ def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path def test_dump_flat(generate_calculation_node_io, generate_workchain_node_io, tmp_path): - # Expected tree: # wc-dump-test-io-flat # └── 01-sub_workflow - # ├── 01-calculation - # │ ├── file.txt - # │ └── relative_path - # │ └── file.txt - # └── 02-calculation - # ├── default.npy - # ├── file.txt - # └── relative_path - # └── file.txt + # ├── 01-calculation + # │ ├── file.txt + # │ └── relative_path + # │ └── file.txt + # └── 02-calculation + # ├── default.npy + # ├── file.txt + # └── relative_path + # └── file.txt # Need to generate parent path for dumping, as I don't want the sub-workchains to be dumped directly into `tmp_path` dump_parent_path = tmp_path / 'wc-dump-test-io-flat' @@ -290,7 +286,7 @@ def test_dump_multiply_add_flat(tmp_path, generate_workchain_multiply_add): # Tests for dump_calculation method def test_dump_calculation_node(tmp_path, generate_calculation_node_io): - # Checking the actual content should be handled by `test_copy_tree` + # Checking the actual content should be handled by `test_copy_tree` # Normal dumping -> node_inputs and not flat; no paths provided # Expected tree: # cj-dump-test-io @@ -558,17 +554,13 @@ def test_generate_child_node_label( multiply_add_node = generate_workchain_multiply_add() output_triples = 
multiply_add_node.base.links.get_outgoing().all() output_paths = sorted( - [ - process_dumper.generate_child_node_label(_, output_node) - for _, output_node in enumerate(output_triples) - ] + [process_dumper.generate_child_node_label(_, output_node) for _, output_node in enumerate(output_triples)] ) print(output_paths) assert output_paths == ['00-multiply', '01-ArithmeticAddCalculation', '02-result'] def test_dump_node_yaml(generate_calculation_node_io, tmp_path, generate_workchain_multiply_add): - process_dumper = ProcessDumper() cj_node = generate_calculation_node_io(attach_outputs=False) process_dumper.dump_node_yaml(process_node=cj_node, output_path=tmp_path) @@ -611,7 +603,6 @@ def test_dump_node_yaml(generate_calculation_node_io, tmp_path, generate_workcha def test_generate_parent_readme(tmp_path, generate_workchain_multiply_add): - wc_node = generate_workchain_multiply_add() process_dumper = ProcessDumper(parent_process=wc_node, parent_path=tmp_path) @@ -621,7 +612,7 @@ def test_generate_parent_readme(tmp_path, generate_workchain_multiply_add): with open(tmp_path / 'README', 'r') as dumped_file: contents = dumped_file.read() - + assert 'This directory contains' in contents assert '`MultiplyAddWorkChain' in contents assert 'ArithmeticAddCalculation' in contents From b5a34a9ca3d9d68f9dd8553169b7d1835f842fc0 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Mon, 29 Apr 2024 17:05:31 +0200 Subject: [PATCH 18/30] Fix check for `CalculationNode` in `dump`. --- src/aiida/tools/dumping/processes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 0b7cf09f94..839173a3e3 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -90,7 +90,7 @@ def dump( # This seems a bit duplicated, but if the logic for checking the types should be contained in the recursive # `dump` function called by `verdi`, then I need to dump for the `CalcFunction` here already, as well. 
self.dump_node_yaml(process_node=process_node, output_path=output_path) - if isinstance(process_node, CalcFunctionNode): + if isinstance(process_node, CalculationNode): self._dump_calculation( calculation_node=process_node, output_path=output_path, From b5e1a4eb9a292a8c39f2fee1af275258a721c154 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 15:05:56 +0000 Subject: [PATCH 19/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/cmdline/commands/test_process.py | 2 +- tests/conftest.py | 4 +--- tests/test_conftest.py | 5 +++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/cmdline/commands/test_process.py b/tests/cmdline/commands/test_process.py index fa22358edf..8553e6a613 100644 --- a/tests/cmdline/commands/test_process.py +++ b/tests/cmdline/commands/test_process.py @@ -17,12 +17,12 @@ import pytest from aiida import get_profile from aiida.cmdline.commands import cmd_process +from aiida.cmdline.utils.echo import ExitCode from aiida.common.links import LinkType from aiida.common.log import LOG_LEVEL_REPORT from aiida.engine import Process, ProcessState from aiida.engine.processes import control as process_control from aiida.orm import CalcJobNode, Group, WorkChainNode, WorkflowNode, WorkFunctionNode -from aiida.cmdline.utils.echo import ExitCode from tests.utils.processes import WaitProcess diff --git a/tests/conftest.py b/tests/conftest.py index db31d71750..e8b1cd57d6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,7 +16,6 @@ import copy import dataclasses -import io import os import pathlib import types @@ -28,7 +27,6 @@ from aiida import get_profile from aiida.common.links import LinkType from aiida.manage.configuration import Profile, get_config, load_profile -from aiida.orm import FolderData, SinglefileData if t.TYPE_CHECKING: from aiida.manage.configuration.config import Config @@ -206,7 +204,6 @@ def _generate_calculation_node( if outputs is not None: calculation_node.store() for output_label, output_node in outputs.items(): - output_node.base.links.add_incoming( calculation_node, link_type=LinkType.CREATE, link_label=output_label ) @@ -714,6 +711,7 @@ def reset_log_level(): # todo: Provide option to manually construct the nodes or actually submit the processes # todo: Depending on how long this takes, replace this duplicated code around the code base with these fixtures + @pytest.fixture def generate_calculation_node_add(aiida_localhost): # todo: Benchmark how long running this takes vs. manually constructing it diff --git a/tests/test_conftest.py b/tests/test_conftest.py index a5cecb406b..c62056fe1e 100644 --- a/tests/test_conftest.py +++ b/tests/test_conftest.py @@ -77,6 +77,7 @@ def test_entry_points_add_and_load(entry_points): with pytest.raises(RuntimeError, match='inline function was called'): entry_point() -# ? + +# ? def test_generate_calculation_node(): - pass \ No newline at end of file + pass From 5464a71d7a9dbd1ecf8bef5614691ec28e6e547d Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Mon, 29 Apr 2024 18:25:25 +0200 Subject: [PATCH 20/30] Fix annotations for 3.9 test suite --- tests/tools/dumping/test_processes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index bb46ede57e..de0cfafe2c 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -13,6 +13,8 @@ # ? 
However, when one passes tmp_dir as output_path, no automatic path is created, as not at the default value is set, # ? so str(output_path) == '.' is False +from __future__ import annotations + import io import shutil from pathlib import Path From 07ac0e18b75049fcca0fe2644e371438888cb267 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 2 May 2024 12:27:45 +0200 Subject: [PATCH 21/30] Fix `dump_node_yaml` before `CalcJob` dump --- src/aiida/tools/dumping/processes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 839173a3e3..837df0b6d1 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -89,13 +89,13 @@ def dump( # This seems a bit duplicated, but if the logic for checking the types should be contained in the recursive # `dump` function called by `verdi`, then I need to dump for the `CalcFunction` here already, as well. - self.dump_node_yaml(process_node=process_node, output_path=output_path) if isinstance(process_node, CalculationNode): self._dump_calculation( calculation_node=process_node, output_path=output_path, io_dump_paths=io_dump_paths, ) + self.dump_node_yaml(process_node=process_node, output_path=output_path) elif isinstance(process_node, WorkflowNode): called_links = process_node.base.links.get_outgoing( From a62f73c5f42f49b5220fbfcdaac7a62f68928e73 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Tue, 7 May 2024 09:20:56 +0200 Subject: [PATCH 22/30] Fix failing test due to missing metadata YAML --- src/aiida/tools/dumping/processes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 837df0b6d1..bef2813855 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -98,6 +98,9 @@ def dump( self.dump_node_yaml(process_node=process_node, output_path=output_path) elif isinstance(process_node, WorkflowNode): + + self.dump_node_yaml(process_node=process_node, output_path=output_path) + called_links = process_node.base.links.get_outgoing( link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK) ).all() From 8db2e053dec44e68a2702d8a3e317bd152f22a53 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 07:21:16 +0000 Subject: [PATCH 23/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/aiida/tools/dumping/processes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index bef2813855..3594336b06 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -98,7 +98,6 @@ def dump( self.dump_node_yaml(process_node=process_node, output_path=output_path) elif isinstance(process_node, WorkflowNode): - self.dump_node_yaml(process_node=process_node, output_path=output_path) called_links = process_node.base.links.get_outgoing( From 1cbe414a5341f4e0c67145493e417e4e70797f33 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Wed, 15 May 2024 11:34:15 +0200 Subject: [PATCH 24/30] Final cleanup --- docs/source/howto/data.rst | 4 +- docs/source/reference/command_line.rst | 2 +- src/aiida/cmdline/commands/cmd_process.py | 11 +- src/aiida/cmdline/params/options/main.py | 2 - src/aiida/tools/dumping/__init__.py | 2 - src/aiida/tools/dumping/processes.py | 192 ++++++++++------------ 
tests/conftest.py | 13 +- tests/test_conftest.py | 5 - tests/tools/dumping/test_processes.py | 111 ++++++------- 9 files changed, 151 insertions(+), 191 deletions(-) diff --git a/docs/source/howto/data.rst b/docs/source/howto/data.rst index e3bfcd3051..712c092b46 100644 --- a/docs/source/howto/data.rst +++ b/docs/source/howto/data.rst @@ -139,9 +139,9 @@ Since subprocesses are explored recursively, arbitrarily complex, nested workflo the ``-p`` flag allows to specify a custom dumping path. If none is provided, it is automatically generated from the ``process_label`` (or ``process_type``) and the ``pk``. In addition, the command provides the ``-o`` flag to overwrite existing directories, the ``-a`` flag to dump further, non file-based AiiDA nodes (in hidden, ``.aiida_nodes`` -subdirectories), and the ``--include-inputs`` (``--exclude-exputs``) flags to also dump additional node inputs of each +subdirectories), the ``-f`` flag to dump all files for each ``CalculationNode`` of the workflow in a flat directory +structure, and the ``--include-inputs`` (``--exclude-inputs``) flag to also dump additional node inputs of each ``CalculationNode`` of the workflow. For a full list of available options, call :code:`verdi process dump --help`. -Happy dumping! .. _how-to:data:import:provenance: diff --git a/docs/source/reference/command_line.rst b/docs/source/reference/command_line.rst index dc9187695c..9523bb1da4 100644 --- a/docs/source/reference/command_line.rst +++ b/docs/source/reference/command_line.rst @@ -367,7 +367,7 @@ Below is a list with all available subcommands. Commands: call-root Show root process of the call stack for the given processes. - dump Dump files involved in the execution of a process. + dump Dump files involved in the execution of one or multiple processes. kill Kill running processes. list Show a list of running or terminated processes. pause Pause running processes. diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index 0e0eaf287e..edd4fb84a7 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -515,7 +515,7 @@ def process_dump( overwrite, flat, ) -> None: - """Dump files involved in the execution of a process. + """Dump files involved in the execution of one or multiple processes. Child calculations/workflows (also called `CalcJob`s and `WorkChain`s in AiiDA jargon) run by the parent workflow are contained in the directory tree as sub-folders and are sorted by their creation time. The directory tree thus @@ -533,7 +533,6 @@ def process_dump( from aiida.tools.dumping.processes import ProcessDumper for process in processes: - # Generate default parent folder process_dumper = ProcessDumper( parent_process=process, include_node_inputs=include_inputs, @@ -559,17 +558,13 @@ def process_dump( try: process_dumper.dump(process_node=process, output_path=output_path) - - # ? Which exceptions do I expect here? except FileExistsError: echo.echo_critical('Some files present in the dumping directory. Delete manually and try again.') - # except NotImplementedError: - # echo.echo_critical('flat dumping not supported for `WorkChain`s that call more than one `CalcJob`.') except Exception as e: echo.echo_critical(f'Unexpected error ({e!s}) while dumping {process.__class__.__name__} <{process.pk}>.') - # Create README in parent directory. 
Do this at the end as to not cause exceptions for the path creation, and - # only do it when everything ran through fine before + # Create README in parent directory. Do this at the end as to not cause exceptions for the path creation due to + # directory not being empty, and only do it when everything ran through fine before process_dumper.generate_parent_readme() echo.echo_success( diff --git a/src/aiida/cmdline/params/options/main.py b/src/aiida/cmdline/params/options/main.py index 74b6cf2ab1..43b2381363 100644 --- a/src/aiida/cmdline/params/options/main.py +++ b/src/aiida/cmdline/params/options/main.py @@ -767,7 +767,6 @@ def set_log_level(_ctx, _param, value): ) INCLUDE_ATTRIBUTES = OverridableOption( - # '-a', '--include-attributes/--exclude-attributes', default=True, show_default=True, @@ -775,7 +774,6 @@ def set_log_level(_ctx, _param, value): ) INCLUDE_EXTRAS = OverridableOption( - # '-e', '--include-extras/--exclude-extras', default=True, show_default=True, diff --git a/src/aiida/tools/dumping/__init__.py b/src/aiida/tools/dumping/__init__.py index 068e5a8291..a746fa171e 100644 --- a/src/aiida/tools/dumping/__init__.py +++ b/src/aiida/tools/dumping/__init__.py @@ -9,5 +9,3 @@ """Modules related to the dumping of AiiDA data.""" __all__ = ('processes',) - -# from .processes import * diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 3594336b06..54def99b3a 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -6,12 +6,13 @@ # For further information on the license, see the LICENSE.txt file # # For further information please visit http://www.aiida.net # ########################################################################### -"""Functions for dumping of workchains and calcjobs.""" +"""Functionality for dumping of WorkChains and CalcJobs.""" from __future__ import annotations import logging from pathlib import Path +from types import SimpleNamespace from typing import Any, List, Optional import yaml @@ -30,7 +31,6 @@ ) from aiida.orm.utils import LinkManager, LinkTriple -# Include RemoteData here? -> Don't think so FILE_CLASSES = (SinglefileData, FolderData) LOGGER = logging.getLogger(__name__) @@ -38,8 +38,8 @@ class ProcessDumper: def __init__( self, - parent_process: Optional[ProcessNode] = None, - parent_path: Optional[Path] = None, + parent_process: ProcessNode | None = None, + parent_path: Path | None = None, include_node_inputs: bool = True, include_attributes: bool = True, include_extras: bool = True, @@ -56,27 +56,29 @@ def __init__( self.all_aiida_nodes = all_aiida_nodes # Automatically determine parent_path on instantiation if `parent_process` is set - if parent_path is None and parent_process is not None: - self.parent_path = self.generate_default_dump_path(process_node=self.parent_process) - elif parent_path is not None: + if parent_path is not None: self.parent_path = parent_path + elif parent_process is not None: + self.parent_path = self.generate_default_dump_path(process_node=self.parent_process) + else: + self.parent_path = Path() + # process_node changes during recursion -> distinction from parent_process of ProcessDumper def dump( self, - process_node, # ? This changes, so it should not be defined as a class variable + process_node, output_path: Path | None, io_dump_paths: list | None = None, - ): - """Dumps all data involved in a `WorkChainNode`, including its outgoing links. + ) -> None: + """Dumps all data involved in a `ProcessNode`, including its outgoing links. 
-        Note that if an outgoing link is again a `WorkChainNode`, the function recursively calls itself, while files are
-        only actually created when a `CalcJobNode` is reached.
+        Note that if an outgoing link is a `WorkflowNode`, the function recursively calls itself, while files are
+        only actually created when a `CalculationNode` is reached.
 
-        :param process_node: The parent process node to be dumped. It can be either `WorkChainNode` or `CalcJobNode`
-        :param output_path: The main output path where the directory tree will be created.
-        :param include_inputs: If True, include file or folder inputs in the dump. Defaults to True.
-        :param node_dumper: The ProcessNodeYamlDumper instance to use for dumping node metadata. If not provided, a new
-            instance will be created. Defaults to None.
+        :param process_node: The parent `ProcessNode` node to be dumped.
+        :param output_path: The output path where the directory tree will be created.
+        :param io_dump_paths: Subdirectories created for each `CalculationNode`.
+            Default: ['raw_inputs', 'extra_inputs', 'raw_outputs']
         """
 
         if output_path is None:
@@ -88,16 +90,17 @@ def dump(
             raise
 
         # This seems a bit duplicated, but if the logic for checking the types should be contained in the recursive
-        # `dump` function called by `verdi`, then I need to dump for the `CalcFunction` here already, as well.
+        # `dump` function called by `verdi`, then I need to dump for the `CalculationNode` here already, as well.
         if isinstance(process_node, CalculationNode):
             self._dump_calculation(
                 calculation_node=process_node,
                 output_path=output_path,
                 io_dump_paths=io_dump_paths,
            )
-            self.dump_node_yaml(process_node=process_node, output_path=output_path)
+            # Don't actually need to call `self.dump_node_yaml` here, as this is done inside `self._dump_calculation`
 
         elif isinstance(process_node, WorkflowNode):
+            # Can call `self.dump_node_yaml` here before the actual dump, as file dumping happens in child directories
            self.dump_node_yaml(process_node=process_node, output_path=output_path)
 
            called_links = process_node.base.links.get_outgoing(
@@ -116,6 +119,7 @@
                self.dump(
                    process_node=child_node,
                    output_path=child_output_path,
+                    io_dump_paths=io_dump_paths,
                )
 
            # Once a `CalculationNode` as child reached, dump it
@@ -123,6 +127,7 @@
                self._dump_calculation(
                    calculation_node=child_node,
                    output_path=child_output_path,
+                    io_dump_paths=io_dump_paths,
                )
 
    def _dump_calculation(
@@ -130,14 +135,13 @@
        calculation_node: CalculationNode,
        output_path: Path | None,
        io_dump_paths: list | None = None,
-    ):
-        """
-        Dump the contents of a CalcJobNode to a specified output path.
+    ) -> None:
+        """Dump the contents of a `CalculationNode` to a specified output path.
 
-        :param calcjob_node: The CalcJobNode to be dumped.
-        :param output_path: The path where the dumped contents will be stored.
-        :param include_inputs: If True, do not dump the inputs of the CalcJobNode.
-        :return: None
+        :param calculation_node: The `CalculationNode` to be dumped.
+        :param output_path: The path where the files will be dumped.
+        :param io_dump_paths: Subdirectories created for the `CalculationNode`.
+            Default: ['raw_inputs', 'extra_inputs', 'raw_outputs']
        """
 
        if output_path is None:
@@ -154,40 +158,35 @@
        io_dump_mapping = self.generate_calculation_io_mapping(io_dump_paths=io_dump_paths)
 
        # Dump the repository contents of the node
-        # ? Rename this to node_repository or something -> Introduces AiiDA terminology.But as we provide the option to
-        # ? dump *all* the outputs, we should also provide the option to dump *all* the inputs, not just `node_inputs`
        calculation_node.base.repository.copy_tree(output_path.resolve() / io_dump_mapping.repository)
 
        # Dump the extra_inputs
        if self.include_node_inputs:
-            input_node_triples = calculation_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC)
-            self._dump_calculation_io(parent_path=output_path / io_dump_mapping.inputs, node_triples=input_node_triples)
+            input_link_manager = calculation_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC)
+            self._dump_calculation_io(parent_path=output_path / io_dump_mapping.inputs, link_manager=input_link_manager)
 
        # Dump the raw_outputs
-        output_node_triples = calculation_node.base.links.get_outgoing(link_type=LinkType.CREATE)
+        output_link_manager = calculation_node.base.links.get_outgoing(link_type=LinkType.CREATE)
        self._dump_calculation_io(
            parent_path=output_path / io_dump_mapping.outputs,
-            node_triples=output_node_triples,
+            link_manager=output_link_manager,
        )
 
-    def _dump_calculation_io(self, parent_path: Path, node_triples: LinkManager):
-        # if exception_paths is None:
-        #     exception_paths = ['/pseudos/']
-
-        for node_triple in node_triples:
-            node = node_triple.node
+    def _dump_calculation_io(self, parent_path: Path, link_manager: LinkManager):
+        for link_triple in link_manager:
+            node = link_triple.node
            if isinstance(node, FILE_CLASSES) or any(issubclass(type(node), cls) for cls in FILE_CLASSES):
                file_node_path = self.generate_link_triple_dump_path(
-                    link_triple=node_triple,
+                    link_triple=link_triple,
                    parent_path=parent_path,
                )
 
-                # No .resolve() required as that done in `generate_calcjob_input_node_path`
-                node_triple.node.base.repository.copy_tree(file_node_path)
+                # No .resolve() required as that is done in `generate_link_triple_dump_path`
+                link_triple.node.base.repository.copy_tree(file_node_path)
 
            else:
                aiida_node_path = self.generate_link_triple_dump_path(
-                    link_triple=node_triple,
+                    link_triple=link_triple,
                    parent_path=parent_path / '.aiida_nodes',
                )
@@ -198,16 +197,17 @@ def _dump_calculation_io(self, parent_path: Path, node_triples: LinkManager):
                # So if anybody has a better solution, I'd be happy to use that
                # The problem might be void, though, once all the atomistic code is moved to `aiida-atomistic`
                if node.node_type == 'data.pseudo.upf.UpfData.':
-                    node_triple.node.base.repository.copy_tree(Path(str(aiida_node_path).replace('.aiida_nodes', '')))
+                    link_triple.node.base.repository.copy_tree(Path(str(aiida_node_path).replace('.aiida_nodes', '')))
                elif self.all_aiida_nodes:
-                    node_triple.node.base.repository.copy_tree(aiida_node_path)
+                    link_triple.node.base.repository.copy_tree(aiida_node_path)
 
    def validate_make_dump_path(self, validate_path: Path, safeguard_file: str = '.aiida_node_metadata.yaml') -> Path:
-        """
-        Create default dumping directory for a given process node and return it as absolute path.
+        """Create default dumping directory for a given process node and return it as absolute path.
 
-        :param path: The base path for the dump. Defaults to the current directory.
-        :return: The created dump path.
+        :param validate_path: Path to validate for dumping.
+        :param safeguard_file: Dumping-specific file to avoid deleting wrong directory.
+            Default: `.aiida_node_metadata.yaml`
+        :return: The absolute path of the created dump directory.
""" import shutil @@ -245,12 +245,13 @@ def validate_make_dump_path(self, validate_path: Path, safeguard_file: str = '.a def generate_default_dump_path(self, process_node: ProcessNode | None) -> Path: """Simple helper function to generate the default parent-dumping directory if none given. - This function is not called for the sub-calls of `calcjob_node_dump` or during the recursive `process_dump` as - it just creates the default parent folder for the dumping, if no name is given. + This function is not called for the recursive sub-calls of `_dump_calculation` as it just creates the default + parent folder for the dumping, if no name is given. :param process_node: The `ProcessNode` for which the directory is created. - :return: The created parent dump path. + :return: The absolute default parent dump path. """ + if process_node is None: raise TypeError('`process_node` must be provided for generating the default path.') else: @@ -258,16 +259,19 @@ def generate_default_dump_path(self, process_node: ProcessNode | None) -> Path: try: return Path(f'dump-{process_node.process_label}-{pk}') except AttributeError: - # ? This case came up during testing, not sure how relevant it actually is + # This case came up during testing, not sure how relevant it actually is return Path(f'dump-{process_node.process_type}-{pk}') - def generate_calculation_io_mapping(self, io_dump_paths: Optional[List[Any]] = None): - # Could turn this into a dict/mapping and use as labels what the entities actually refer to - # Don't use AiiDA terminology directly as it might be confusing for other users who are mainly targeted for the - # dumping - # ? Could move this outside of class and just pass flat, and not set the logger as a class attribute + def generate_calculation_io_mapping(self, io_dump_paths: Optional[List[Any]] = None) -> SimpleNamespace: + """Helper function to generate mapping for entities dumped for each `CalculationNode`. + + This is to avoid exposing AiiDA terminology, like `repository` to the user, while keeping track of which + entities should be dumped into which directory, and allowing for alternative directory names. - from types import SimpleNamespace + :param io_dump_paths: Subdirectories created for the `CalculationNode`. + Default: ['raw_inputs', 'extra_inputs', 'raw_outputs'] + :return: SimpleNamespace mapping. + """ aiida_entities_to_dump = ['repository', 'inputs', 'outputs'] default_calculation_io_dump_paths = ['raw_inputs', 'extra_inputs', 'raw_outputs'] @@ -288,7 +292,7 @@ def generate_calculation_io_mapping(self, io_dump_paths: Optional[List[Any]] = N elif self.flat and io_dump_paths is not None: LOGGER.info( - 'Flat set to True but `io_dump_paths` provided. These will be used, but `node_inputs` not nested.' + 'Flat set to True but `io_dump_paths` provided. These will be used, but `extra_inputs` not nested.' ) return SimpleNamespace(**dict(zip(aiida_entities_to_dump, io_dump_paths))) else: @@ -298,31 +302,17 @@ def generate_calculation_io_mapping(self, io_dump_paths: Optional[List[Any]] = N return SimpleNamespace(**dict(zip(aiida_entities_to_dump, io_dump_paths))) # type: ignore[arg-type] def generate_link_triple_dump_path(self, link_triple: LinkTriple, parent_path: Path) -> Path: + """Create subdirectory paths for the `CalculationNode` linked I/O Nodes. + + :param link_triple: LinkTriple representing I/O nodes of `CalculationNode`. + :param parent_path: Parent directory for `CalculationNode` dump. + :return: The absolute dumping subdirectory for `CalculationNode` I/O. 
+ """ node = link_triple.node link_label = link_triple.link_label # For convenience, remove the 'retrieved' subdirectory for the outputs link_label = link_label.replace('retrieved', '') - # This is again QE specific... - # if exception_labels is None: - # exception_labels = ['pseudos'] - - # ? Shouldn't this be only be applied to `CalculationNode`s? - # This check is necessary, as I'm now not only dumping the 'retrieved' outputs and file-based inputs, but all of - # the connected links if self.all_aiida_nodes is True - # ? This is now handled outside, before this function is called - # if len(node.base.repository.list_objects()) > 0: - # aiida_nodes_subdir = '' - # else: - # # Empty repository, so it should be non-file-based AiiDA data types, like ArrayData - # # -> Put those into '.aiida_nodes' subdirectory - # aiida_nodes_subdir = '.aiida_nodes' - # # aiida_nodes_subdir = '' - - # ? The check if the link_label starts with pseudo is again very specific for the atomistic community/QE, - # ? however, I don't know how to otherwise avoid that it's put in `.aiida_nodes`, as the Node is defined as - # ? Data, not UpfData, so I cannot just check against FILE_NODES - # if isinstance(node, FILE_CLASSES) or any(link_label.startswith(label) for label in exception_labels): if isinstance(node, FILE_CLASSES): if not self.flat: input_node_path = parent_path / Path(*link_label.split('__')) @@ -330,38 +320,35 @@ def generate_link_triple_dump_path(self, link_triple: LinkTriple, parent_path: P # Don't use link_label at all -> But, relative path inside FolderData is retained input_node_path = parent_path elif not self.flat: - # input_node_path = parent_path / aiida_nodes_subdir / Path(*link_label.split('__')) input_node_path = parent_path / Path(*link_label.split('__')) else: # Don't use link_label at all -> But, relative path inside FolderData is retained - # input_node_path = parent_path / aiida_nodes_subdir input_node_path = parent_path return input_node_path.resolve() def generate_child_node_label(self, index: int, link_triple: LinkTriple) -> str: - """Small helper function to generate the directory label for node inputs.""" + """Small helper function to generate and clean directory label for child nodes during recursion. + + :param index: Index assigned to step at current level of recursion. + :param link_triple: `LinkTriple` of `ProcessNode` explored during recursion. + :return: Chlild node label during recursion. + """ node = link_triple.node link_label = link_triple.link_label # Generate directories with naming scheme akin to `verdi process status` - # node_label = f'{index:02d}-{link_label}' label_list = [f'{index:02d}', link_label] try: process_label = node.process_label if process_label is not None and process_label != link_label: label_list += [process_label] - # node_label += f'-{process_label}' except AttributeError: process_type = node.process_type if process_type is not None and process_type != link_label: label_list += [process_type] - # node_label += f'-{process_type}' - - # if isinstance(node, File): - # label_list += [node.name] node_label = '-'.join(label_list) # `CALL-` as part of the link labels also for MultiplyAddWorkChain -> Seems general enough, so remove @@ -376,12 +363,11 @@ def dump_node_yaml( output_path: Path, output_filename: str = '.aiida_node_metadata.yaml', ) -> None: - """Dump the selected `ProcessNode` properties, attributes, and extras to a yaml file. + """Dump the selected `ProcessNode` properties, attributes, and extras to a YAML file. 
- :param process_node: The ProcessNode to dump. - :param output_path: The path to the directory where the yaml file will be saved. - :param output_filename: The name of the output yaml file. Defaults to `.aiida_node_metadata.yaml`. - :return: None + :param process_node: The `ProcessNode` to dump. + :param output_path: The path to the directory where the YAML file will be saved. + :param output_filename: The name of the output YAML file. Defaults to `.aiida_node_metadata.yaml`. """ _node_properties = [ @@ -468,19 +454,21 @@ def generate_parent_readme(self): _readme_string = textwrap.dedent( f"""\ - This directory contains the files involved in the calculation/workflow `{self.parent_process.process_label} <{self.parent_process.pk}>` run with AiiDA. + This directory contains the files involved in the calculation/workflow + `{self.parent_process.process_label} <{self.parent_process.pk}>` run with AiiDA. - Child calculations/workflows (also called `CalcJob`s and `WorkChain`s in AiiDA jargon) run by the parent workflow are - contained in the directory tree as sub-folders and are sorted by their creation time. The directory tree thus - mirrors the logical execution of the workflow, which can also be queried by running `verdi process status - {self.parent_process.pk}` on the command line. + Child calculations/workflows (also called `CalcJob`s/`CalcFunction`s and `WorkChain`s/`WorkFunction`s in AiiDA + jargon) run by the parent workflow are contained in the directory tree as sub-folders and are sorted by their + creation time. The directory tree thus mirrors the logical execution of the workflow, which can also be queried + by running `verdi process status {self.parent_process.pk}` on the command line. By default, input and output files of each simulation can be found in the corresponding "raw_inputs" and - "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution - settings). Additional input files (depending on the type of calculation) are placed in the "extra_inputs". + "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job + execution settings). Additional input files (depending on the type of calculation) are placed in the + "extra_inputs". - Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant AiiDA - node data for further inspection.""" # noqa: E501 + Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant + AiiDA node data for further inspection.""" ) # `verdi process status` diff --git a/tests/conftest.py b/tests/conftest.py index e8b1cd57d6..9d069e14e0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -185,12 +185,10 @@ def _generate_calculation_node( calculation_node.set_exit_status(exit_status) if repository is not None: - # ? Possibly allow for other types here, and use `put_object_from_filelike`, etc. - # ? Or use from_tree + # Or use from_tree? 
calculation_node.base.repository.put_object_from_filelike(repository[0], repository[1]) # For storing, need to first store the input nodes, then the CalculationNode, then the output nodes - if inputs is not None: for input_label, input_node in inputs.items(): calculation_node.base.links.add_incoming( @@ -202,6 +200,7 @@ def _generate_calculation_node( input_node.store() if outputs is not None: + # Need to first store CalculationNode before I can attach `created` outputs calculation_node.store() for output_label, output_node in outputs.items(): output_node.base.links.add_incoming( @@ -210,7 +209,7 @@ def _generate_calculation_node( output_node.store() - # ? Should it be stored here + # Return unstored by default return calculation_node return _generate_calculation_node @@ -708,13 +707,8 @@ def reset_log_level(): log.configure_logging(with_orm=True) -# todo: Provide option to manually construct the nodes or actually submit the processes -# todo: Depending on how long this takes, replace this duplicated code around the code base with these fixtures - - @pytest.fixture def generate_calculation_node_add(aiida_localhost): - # todo: Benchmark how long running this takes vs. manually constructing it def _generate_calculation_node_add(): from aiida.engine import run_get_node from aiida.orm import InstalledCode, Int @@ -737,7 +731,6 @@ def _generate_calculation_node_add(): @pytest.fixture def generate_workchain_multiply_add(aiida_localhost): - # todo: Benchmark how long running this takes vs. manually constructing it def _generate_workchain_multiply_add(): from aiida.engine import run_get_node from aiida.orm import InstalledCode, Int diff --git a/tests/test_conftest.py b/tests/test_conftest.py index c62056fe1e..22d990d323 100644 --- a/tests/test_conftest.py +++ b/tests/test_conftest.py @@ -76,8 +76,3 @@ def test_entry_points_add_and_load(entry_points): with pytest.raises(RuntimeError, match='inline function was called'): entry_point() - - -# ? -def test_generate_calculation_node(): - pass diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index de0cfafe2c..c0f89991b7 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -8,10 +8,9 @@ ########################################################################### """Tests for the dumping of ProcessNode data to disk.""" -# ? For testing the dumping, one either needs to cd into the tmp_path, or pass the tmp_path as argument, otherwise, the -# ? files are dumped into src -> CWD from where the script is run. -# ? However, when one passes tmp_dir as output_path, no automatic path is created, as not at the default value is set, -# ? so str(output_path) == '.' is False +# For testing the dumping, one either needs to cd into the tmp_path, or pass the tmp_path as argument, otherwise, the +# files are dumped into the src -> CWD from where the script is run. +# However, when one passes tmp_dir as output_path, no automatic, default path is created from __future__ import annotations @@ -23,7 +22,7 @@ from aiida.common.links import LinkType from aiida.tools.dumping.processes import ProcessDumper -# ? 
Non-AiiDA variables +# Non-AiiDA variables filename = 'file.txt' filecontent = 'a' raw_inputs_relpath = Path('raw_inputs') @@ -32,7 +31,7 @@ default_dump_paths = [raw_inputs_relpath, node_inputs_relpath, raw_outputs_relpath] custom_dump_paths = [f'{path}_' for path in default_dump_paths] -# Define some variables used for constructing the nodes used to test the dumping +# Define variables used for constructing the nodes used to test the dumping singlefiledata_linklabel = 'singlefile' folderdata_linklabel = 'folderdata' folderdata_relpath = Path('relative_path') @@ -40,15 +39,10 @@ arraydata_linklabel = 'arraydata' node_metadata_file = '.aiida_node_metadata.yaml' -# todo: Test for _LOGGER.info outputs - -# ? Move this somewhere else? +# Move this somewhere else? def clean_tmp_path(tmp_path: Path): - """ - Recursively delete files and directories in a path, e.g. a temporary path used by pytest. - # ? This empties the directory, as intended for the general dumping directory, but doesn't delete it itself - """ + """Recursively delete files and directories in a path, e.g. a temporary path used by pytest.""" for item in tmp_path.iterdir(): if item.is_dir(): @@ -57,7 +51,7 @@ def clean_tmp_path(tmp_path: Path): item.unlink() -# Helper functions to generate the actual `WorkChain`s and `CalcJob`s used for testing +# Helper functions to generate the actual `WorkflowNode`s and `CalculationNode`s used for testing @pytest.fixture def generate_calculation_node_io(generate_calculation_node): def _generate_calculation_node_io(entry_point: str | None = None, attach_outputs: bool = True): @@ -130,7 +124,16 @@ def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path # Expected tree: # wc-dump-test-io # └── 01-sub_workflow - # └── 01-calculation + # ├── 01-calculation + # │ ├── extra_inputs + # │ │ ├── folderdata + # │ │ │ └── relative_path + # │ │ │ └── file.txt + # │ │ └── singlefile + # │ │ └── file.txt + # │ └── raw_inputs + # │ └── file.txt + # └── 02-calculation # ├── extra_inputs # │ ├── folderdata # │ │ └── relative_path @@ -144,13 +147,12 @@ def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path dump_parent_path = tmp_path / 'wc-dump-test-io' process_dumper = ProcessDumper() # Don't attach outputs, as it would require storing the calculation_node and then it cannot be used in the workchain - # Need to generate two instances rather than [...] 
* 2, otherwise same instance twice in list cj_nodes = [generate_calculation_node_io(attach_outputs=False), generate_calculation_node_io(attach_outputs=False)] wc_node = generate_workchain_node_io(cj_nodes=cj_nodes) process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) # Don't test for `README` here, as this is only created when dumping is done via `verdi` - # But, check for aiida_node_metadata.yaml + # But, check for .aiida_node_metadata.yaml raw_input_path = '01-sub_workflow/01-calculation/raw_inputs/file.txt' singlefiledata_path = '01-sub_workflow/01-calculation/extra_inputs/singlefile/file.txt' folderdata_path = '01-sub_workflow/01-calculation/extra_inputs/folderdata/relative_path/file.txt' @@ -166,32 +168,26 @@ def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path assert all([expected_file.is_file() for expected_file in expected_files]) + # clean_tmp_path(dump_parent_path) + + # Flat dumping -def test_dump_flat(generate_calculation_node_io, generate_workchain_node_io, tmp_path): # Expected tree: # wc-dump-test-io-flat # └── 01-sub_workflow - # ├── 01-calculation - # │ ├── file.txt - # │ └── relative_path - # │ └── file.txt - # └── 02-calculation - # ├── default.npy - # ├── file.txt - # └── relative_path - # └── file.txt + # ├── 01-calculation + # │ ├── file.txt + # │ └── relative_path + # │ └── file.txt + # └── 02-calculation + # ├── file.txt + # └── relative_path + # └── file.txt - # Need to generate parent path for dumping, as I don't want the sub-workchains to be dumped directly into `tmp_path` dump_parent_path = tmp_path / 'wc-dump-test-io-flat' process_dumper = ProcessDumper(flat=True) - # Don't attach outputs, as it would require storing the calculation_node and then it cannot be used in the workchain - # Need to generate two instances rather than [...] * 2, otherwise same instance twice in list - cj_nodes = [generate_calculation_node_io(attach_outputs=False), generate_calculation_node_io(attach_outputs=False)] - wc_node = generate_workchain_node_io(cj_nodes=cj_nodes) process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) - # Don't test for `README` here, as this is only created when dumping is done via `verdi` - # But, check for aiida_node_metadata.yaml raw_input_path = '01-sub_workflow/01-calculation/file.txt' folderdata_path = '01-sub_workflow/01-calculation/relative_path/file.txt' node_metadata_paths = [ @@ -208,27 +204,22 @@ def test_dump_flat(generate_calculation_node_io, generate_workchain_node_io, tmp def test_dump_multiply_add(tmp_path, generate_workchain_multiply_add): - # Dump with flat=False # Expected tree: - # multiply_add-dump-test + # wc-dump-test-multiply-add # ├── 01-multiply # │ └── raw_inputs # │ └── source_file # └── 02-ArithmeticAddCalculation - # ├── raw_inputs - # │ ├── .aiida - # │ │ ├── calcinfo.json - # │ │ └── job_tmpl.json - # │ ├── _aiidasubmit.sh - # │ └── aiida.in - # └── raw_outputs - # ├── _scheduler-stderr.txt - # ├── _scheduler-stdout.txt - # └── aiida.out + # ├── raw_inputs + # │ ├── _aiidasubmit.sh + # │ └── aiida.in + # └── raw_outputs + # ├── _scheduler-stderr.txt + # ├── _scheduler-stdout.txt + # └── aiida.out + dump_parent_path = tmp_path / 'wc-dump-test-multiply-add' process_dumper = ProcessDumper() - # Don't attach outputs, as it would require storing the calculation_node and then it cannot be used in the workchain - # Need to generate two instances rather than [...] 
* 2, otherwise same instance twice in list wc_node = generate_workchain_multiply_add() process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) @@ -243,15 +234,16 @@ def test_dump_multiply_add(tmp_path, generate_workchain_multiply_add): dump_parent_path / '02-ArithmeticAddCalculation' / raw_outputs_relpath / raw_output_file for raw_output_file in raw_output_files ] + # No node_inputs contained in MultiplyAddWorkChain assert all([raw_input_file.is_file() for raw_input_file in raw_input_files]) assert all([raw_output_file.is_file() for raw_output_file in raw_output_files]) + # Flat dumping -def test_dump_multiply_add_flat(tmp_path, generate_workchain_multiply_add): # Expected tree: - # cj-dump-test-add + # wc-dump-test-multiply-add-flat # ├── 01-multiply # │ └── source_file # └── 02-ArithmeticAddCalculation @@ -261,10 +253,9 @@ def test_dump_multiply_add_flat(tmp_path, generate_workchain_multiply_add): # ├── aiida.in # └── aiida.out - dump_parent_path = tmp_path / 'cj-dump-test-add' + dump_parent_path = tmp_path / 'wc-dump-test-multiply-add-flat' process_dumper = ProcessDumper(flat=True) - calculation_node_add = generate_workchain_multiply_add() - process_dumper.dump(process_node=calculation_node_add, output_path=dump_parent_path) + process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) multiply_file = dump_parent_path / '01-multiply' / 'source_file' arithmetic_add_files = [ @@ -276,14 +267,13 @@ def test_dump_multiply_add_flat(tmp_path, generate_workchain_multiply_add): '_scheduler-stdout.txt', 'aiida.out', ] - # raw_input_files += ['source_file'] arithmetic_add_files = [ dump_parent_path / '02-ArithmeticAddCalculation' / arithmetic_add_file for arithmetic_add_file in arithmetic_add_files ] - assert all([expected_file.is_file() for expected_file in arithmetic_add_files]) assert multiply_file.is_file() + assert all([expected_file.is_file() for expected_file in arithmetic_add_files]) # Tests for dump_calculation method @@ -331,7 +321,7 @@ def test_dump_calculation_node(tmp_path, generate_calculation_node_io): assert handle.read() == filecontent -# ? 
Could probably be removed when the mapping is tested properly +# Could probably be removed with proper testing of the mapping def test_dump_calculation_custom(tmp_path, generate_calculation_node_io): # Normal dumping -> node_inputs and not flat; custom paths provided # Expected tree: @@ -354,7 +344,6 @@ def test_dump_calculation_custom(tmp_path, generate_calculation_node_io): dump_parent_path = tmp_path / 'cj-dump-test-custom' process_dumper = ProcessDumper() calculation_node = generate_calculation_node_io() - dump_parent_path = tmp_path / 'cj-dump-test-custom' process_dumper._dump_calculation( calculation_node=calculation_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths ) @@ -389,6 +378,8 @@ def test_dump_calculation_flat(tmp_path, generate_calculation_node_io): assert (dump_parent_path / folderdata_relpath / filename).is_file() +# Here, in principle, test only non-default arguments, as defaults tested above +# @pytest.mark.parametrize('overwrite', (True, False)) def test_dump_calculation_overwrite(tmp_path, generate_calculation_node_io): dump_parent_path = tmp_path / 'cj-dump-test-overwrite' process_dumper = ProcessDumper(overwrite=False) @@ -501,7 +492,6 @@ def test_generate_default_dump_path( def test_generate_calculation_io_mapping(): process_dumper = ProcessDumper() - calculation_io_mapping = process_dumper.generate_calculation_io_mapping() assert calculation_io_mapping.repository == 'raw_inputs' assert calculation_io_mapping.inputs == 'extra_inputs' @@ -541,6 +531,8 @@ def test_generate_child_node_label( sub_wc_node = wc_output_triples[0].node output_triples = wc_output_triples + sub_wc_node.base.links.get_outgoing().all() + # Sort by mtime here, not ctime, as I'm actually creating the CalculationNode first. + output_triples = sorted(output_triples, key=lambda link_triple: link_triple.node.mtime) process_dumper = ProcessDumper() @@ -555,10 +547,11 @@ def test_generate_child_node_label( # Check with multiply_add workchain node multiply_add_node = generate_workchain_multiply_add() output_triples = multiply_add_node.base.links.get_outgoing().all() + # Sort by ctime here, not mtime, as I'm generating the WorkChain normally + output_triples = sorted(output_triples, key=lambda link_triple: link_triple.node.ctime) output_paths = sorted( [process_dumper.generate_child_node_label(_, output_node) for _, output_node in enumerate(output_triples)] ) - print(output_paths) assert output_paths == ['00-multiply', '01-ArithmeticAddCalculation', '02-result'] From ae9a912aac58e1b2cd20615f45df8c6f9dc31293 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 16 May 2024 18:13:43 +0200 Subject: [PATCH 25/30] Add `_workflow_dump` and change link node dumping Moved the recursive logic out of the top-level `dump` function instead into `_workflow_dump`. In addition, moved the default path creation and validation into the top-level `dump` function and out of the `cmd_process.py` file. The following entities are now dumped for each child `CalculationNode` reached during the dumping: - `CalculationNode` repository -> `inputs` - `CalculationNode` retrieved output -> `outputs` - `CalculationNode` input nodes -> `node_inputs` - `CalculationNode` output nodes (apart from `retrieved`) -> `node_outputs` By default, everything apart from the `node_outputs` is dumped, as to avoid too many non-`SinglefileData` or `FolderData` nodes to be written to disk. The `--all-aiida-nodes` option is instead removed. The number of files might still grow large for complex workchains, e.g. 
In addition, set `_generate_default_dump_path`, `_generate_readme`, and
`_generate_child_node_label` as `staticmethod`s, as they logically belong
to the class but don't access any of its attributes. Moreover, the former
two are only called in the top-level `dump` method. Other methods like
`_validate_make_dump_path` still access class attributes like `overwrite`
or `flat`, so they remain normal class methods.
---
 docs/source/reference/command_line.rst    |   2 +-
 src/aiida/cmdline/commands/cmd_process.py |  82 ++--
 src/aiida/cmdline/params/options/main.py  |  21 +-
 src/aiida/tools/dumping/processes.py      | 462 ++++++++++------------
 tests/tools/dumping/test_processes.py     |  52 +--
 5 files changed, 275 insertions(+), 344 deletions(-)

diff --git a/docs/source/reference/command_line.rst b/docs/source/reference/command_line.rst
index 9523bb1da4..3553f953dd 100644
--- a/docs/source/reference/command_line.rst
+++ b/docs/source/reference/command_line.rst
@@ -367,7 +367,7 @@ Below is a list with all available subcommands.
   Commands:
     call-root  Show root process of the call stack for the given processes.
-    dump       Dump files involved in the execution of one or multiple processes.
+    dump       Dump process input and output files to disk.
     kill       Kill running processes.
     list       Show a list of running or terminated processes.
     pause      Pause running processes.
diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py
index edd4fb84a7..6c5d60c9e5 100644
--- a/src/aiida/cmdline/commands/cmd_process.py
+++ b/src/aiida/cmdline/commands/cmd_process.py
@@ -484,38 +484,25 @@ def process_repair(manager, broker, dry_run):


 @verdi_process.command('dump')
-@arguments.PROCESSES()
+@arguments.PROCESS()
 @options.PATH()
-@click.option(
-    '-f',
-    '--flat',
-    'flat',
-    is_flag=True,
-    default=False,
-    help='Dump files in a flat directory for every step of the workflow.',
-)
-@click.option(
-    '-a',
-    '--all-aiida-nodes',
-    is_flag=True,
-    default=False,
-    help='Dump also non file-based AiiDA nodes. This can generate quite a lot of files, so use with caution.',
-)
 @options.OVERWRITE()
 @options.INCLUDE_INPUTS()
+@options.INCLUDE_OUTPUTS()
 @options.INCLUDE_ATTRIBUTES()
 @options.INCLUDE_EXTRAS()
+@options.FLAT()
 def process_dump(
-    processes,
+    process,
     path,
     include_inputs,
+    include_outputs,
     include_attributes,
     include_extras,
-    all_aiida_nodes,
     overwrite,
     flat,
 ) -> None:
-    """Dump files involved in the execution of one or multiple processes.
+    """Dump process input and output files to disk.

     Child calculations/workflows (also called `CalcJob`s and `WorkChain`s in AiiDA jargon) run by the parent workflow
     are contained in the directory tree as sub-folders and are sorted by their creation time. The directory tree thus
@@ -524,7 +511,7 @@ def process_dump(

     By default, input and output files of each calculation can be found in the corresponding "raw_inputs" and
     "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution
-    settings). Additional input files (depending on the type of calculation) are placed in the "extra_inputs".
+    settings). Additional input files (depending on the type of calculation) are placed in the "inputs".

     Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant AiiDA
     node data for further inspection.
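For reference, the sub-directory names used in the docstring above come from
the entity-to-directory mapping built by `_generate_calculation_io_mapping`
later in this patch. A minimal, self-contained sketch of the default mapping,
with the two lists copied from the diff:

    from types import SimpleNamespace

    # AiiDA entities dumped for each `CalculationNode` and their default sub-directories
    aiida_entities_to_dump = ['repository', 'retrieved', 'inputs', 'outputs']
    default_calculation_io_dump_paths = ['inputs', 'outputs', 'node_inputs', 'node_outputs']

    mapping = SimpleNamespace(**dict(zip(aiida_entities_to_dump, default_calculation_io_dump_paths)))

    assert mapping.repository == 'inputs'     # node repository (raw input files)
    assert mapping.retrieved == 'outputs'     # retrieved output files
    assert mapping.inputs == 'node_inputs'    # linked input nodes
    assert mapping.outputs == 'node_outputs'  # linked output nodes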
@@ -532,41 +519,24 @@ def process_dump( from aiida.tools.dumping.processes import ProcessDumper - for process in processes: - process_dumper = ProcessDumper( - parent_process=process, - include_node_inputs=include_inputs, - include_attributes=include_attributes, - include_extras=include_extras, - all_aiida_nodes=all_aiida_nodes, - overwrite=overwrite, - flat=flat, - ) - - if path is None: - output_path = process_dumper.generate_default_dump_path(process_node=process) - else: - output_path = path.resolve() - - process_dumper.parent_path = output_path - - # Capture `FileExistsError` here already, not by trying to run the dumping - try: - process_dumper.validate_make_dump_path(validate_path=output_path) - except FileExistsError: - echo.echo_critical(f'Path `{output_path}` already exists and overwrite set to False.') + process_dumper = ProcessDumper( + include_inputs=include_inputs, + include_outputs=include_outputs, + include_attributes=include_attributes, + include_extras=include_extras, + overwrite=overwrite, + flat=flat, + ) - try: - process_dumper.dump(process_node=process, output_path=output_path) - except FileExistsError: - echo.echo_critical('Some files present in the dumping directory. Delete manually and try again.') - except Exception as e: - echo.echo_critical(f'Unexpected error ({e!s}) while dumping {process.__class__.__name__} <{process.pk}>.') - - # Create README in parent directory. Do this at the end as to not cause exceptions for the path creation due to - # directory not being empty, and only do it when everything ran through fine before - process_dumper.generate_parent_readme() - - echo.echo_success( - f'Raw files for {process.__class__.__name__} <{process.pk}> dumped successfully in `{output_path}`.' + try: + dump_path = process_dumper.dump(process_node=process, output_path=path) + except FileExistsError: + echo.echo_critical( + 'Dumping directory exists and overwrite is False. ' 'Set overwrite to True, or delete directory manually.' ) + except Exception as e: + echo.echo_critical(f'Unexpected error ({e!s}) while dumping {process.__class__.__name__} <{process.pk}>.') + + echo.echo_success( + f'Raw files for {process.__class__.__name__} <{process.pk}> dumped successfully in `{dump_path}`.' 
+ ) diff --git a/src/aiida/cmdline/params/options/main.py b/src/aiida/cmdline/params/options/main.py index 43b2381363..be9dce4cff 100644 --- a/src/aiida/cmdline/params/options/main.py +++ b/src/aiida/cmdline/params/options/main.py @@ -60,6 +60,7 @@ 'EXIT_STATUS', 'EXPORT_FORMAT', 'FAILED', + 'FLAT', 'FORCE', 'FORMULA_MODE', 'FREQUENCY', @@ -70,6 +71,7 @@ 'IDENTIFIER', 'INCLUDE_ATTRIBUTES', 'INCLUDE_INPUTS', + 'INCLUDE_OUTPUTS', 'INCLUDE_EXTRAS', 'INPUT_FORMAT', 'INPUT_PLUGIN', @@ -763,7 +765,14 @@ def set_log_level(_ctx, _param, value): '--include-inputs/--exclude-inputs', default=True, show_default=True, - help='Include the input nodes of the `CalcJob`(s).', + help='Include the linked input nodes of the `CalculationNode`(s).', +) + +INCLUDE_OUTPUTS = OverridableOption( + '--include-outputs/--exclude-outputs', + default=False, + show_default=True, + help='Include the linked output nodes of the `CalculationNode`(s).', ) INCLUDE_ATTRIBUTES = OverridableOption( @@ -786,5 +795,13 @@ def set_log_level(_ctx, _param, value): is_flag=True, default=False, show_default=True, - help="""Overwrite directory if it already exists.""", + help='Overwrite directory if it already exists.', +) + +FLAT = OverridableOption( + '-f', + '--flat', + is_flag=True, + default=False, + help='Dump files in a flat directory for every step of the workflow.', ) diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 54def99b3a..1b47807f18 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -18,58 +18,155 @@ import yaml from aiida.common import LinkType +from aiida.common.exceptions import NotExistentAttributeError from aiida.orm import ( CalcFunctionNode, CalcJobNode, CalculationNode, - FolderData, ProcessNode, - SinglefileData, WorkChainNode, WorkflowNode, WorkFunctionNode, ) -from aiida.orm.utils import LinkManager, LinkTriple +from aiida.orm.utils import LinkTriple -FILE_CLASSES = (SinglefileData, FolderData) LOGGER = logging.getLogger(__name__) class ProcessDumper: def __init__( self, - parent_process: ProcessNode | None = None, - parent_path: Path | None = None, - include_node_inputs: bool = True, + include_inputs: bool = True, + include_outputs: bool = False, include_attributes: bool = True, include_extras: bool = True, overwrite: bool = True, flat: bool = False, - all_aiida_nodes: bool = False, ) -> None: - self.parent_process = parent_process - self.include_node_inputs = include_node_inputs + self.include_inputs = include_inputs + self.include_outputs = include_outputs self.include_attributes = include_attributes self.include_extras = include_extras self.overwrite = overwrite self.flat = flat - self.all_aiida_nodes = all_aiida_nodes - # Automatically determine parent_path on instantiation if `parent_process` is set - if parent_path is not None: - self.parent_path = parent_path - elif parent_process is not None: - self.parent_path = self.generate_default_dump_path(process_node=self.parent_process) + @staticmethod + def _generate_default_dump_path(process_node: ProcessNode) -> Path: + """Simple helper function to generate the default parent-dumping directory if none given. + + This function is not called for the recursive sub-calls of `_dump_calculation` as it just creates the default + parent folder for the dumping, if no name is given. + + :param process_node: The `ProcessNode` for which the directory is created. + :return: The absolute default parent dump path. 
+        """
+
+        pk = process_node.pk
+        try:
+            return Path(f'dump-{process_node.process_label}-{pk}')
+        except AttributeError:
+            # This case came up during testing, not sure how relevant it actually is
+            return Path(f'dump-{process_node.process_type}-{pk}')
+
+    @staticmethod
+    def _generate_readme(process_node: ProcessNode, output_path: Path) -> None:
+        """Generate README file in main dumping directory.
+
+        :param process_node: `CalculationNode` or `WorkflowNode`.
+        :param output_path: Output path for dumping.
+
+        """
+
+        import textwrap
+
+        from aiida.cmdline.utils.ascii_vis import format_call_graph
+        from aiida.cmdline.utils.common import (
+            get_calcjob_report,
+            get_node_info,
+            get_process_function_report,
+            get_workchain_report,
+        )
+
+        _readme_string = textwrap.dedent(
+            f"""\
+            This directory contains the files involved in the calculation/workflow
+            `{process_node.process_label} <{process_node.pk}>` run with AiiDA.
+
+            Child calculations/workflows (also called `CalcJob`s/`CalcFunction`s and `WorkChain`s/`WorkFunction`s in AiiDA
+            jargon) run by the parent workflow are contained in the directory tree as sub-folders and are sorted by their
+            creation time. The directory tree thus mirrors the logical execution of the workflow, which can also be queried
+            by running `verdi process status {process_node.pk}` on the command line.
+
+            By default, input and output files of each calculation can be found in the corresponding "inputs" and
+            "outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job
+            execution settings). Additional input nodes (depending on the type of calculation) are placed in
+            "node_inputs".
+
+            Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant
+            AiiDA node data for further inspection."""
+        )
+
+        # `verdi process status`
+        process_status = format_call_graph(calc_node=process_node, max_depth=None, call_link_label=True)
+        _readme_string += f'\n\n\nOutput of `verdi process status {process_node.pk}`:\n\n{process_status}'
+
+        # `verdi process report`
+        # Copied over from `cmd_process`
+        if isinstance(process_node, CalcJobNode):
+            process_report = get_calcjob_report(process_node)
+        elif isinstance(process_node, WorkChainNode):
+            process_report = get_workchain_report(process_node, levelname='REPORT', indent_size=2, max_depth=None)
+        elif isinstance(process_node, (CalcFunctionNode, WorkFunctionNode)):
+            process_report = get_process_function_report(process_node)
         else:
-            self.parent_path = Path()
+            process_report = f'Nothing to show for node type {process_node.__class__}'
+
+        _readme_string += f'\n\n\nOutput of `verdi process report {process_node.pk}`:\n\n{process_report}'
+
+        # `verdi process show`?
+        process_show = get_node_info(node=process_node)
+        _readme_string += f'\n\n\nOutput of `verdi process show {process_node.pk}`:\n\n{process_show}'
+
+        with (output_path / 'README').open('w') as handle:
+            handle.write(_readme_string)
+
+    @staticmethod
+    def _generate_child_node_label(index: int, link_triple: LinkTriple) -> str:
+        """Small helper function to generate and clean directory label for child nodes during recursion.
+
+        :param index: Index assigned to step at current level of recursion.
+        :param link_triple: `LinkTriple` of `ProcessNode` explored during recursion.
+        :return: Child node label during recursion.
+ """ + node = link_triple.node + link_label = link_triple.link_label + + # Generate directories with naming scheme akin to `verdi process status` + label_list = [f'{index:02d}', link_label] + + try: + process_label = node.process_label + if process_label is not None and process_label != link_label: + label_list += [process_label] + + except AttributeError: + process_type = node.process_type + if process_type is not None and process_type != link_label: + label_list += [process_type] + + node_label = '-'.join(label_list) + # `CALL-` as part of the link labels also for MultiplyAddWorkChain -> Seems general enough, so remove + node_label = node_label.replace('CALL-', '') + node_label = node_label.replace('None-', '') + + return node_label - # process_node changes during recursion -> distinction from parent_process of ProcessDumper def dump( self, - process_node, + process_node: ProcessNode, output_path: Path | None, io_dump_paths: list | None = None, - ) -> None: + ) -> Path: """Dumps all data involved in a `ProcessNode`, including its outgoing links. Note that if an outgoing link is a `WorkflowNode`, the function recursively calls itself, while files are @@ -78,62 +175,66 @@ def dump( :param process_node: The parent `ProcessNode` node to be dumped. :param output_path: The output path where the directory tree will be created. :param io_dump_paths: Subdirectories created for each `CalculationNode`. - Default: ['raw_inputs', 'extra_inputs', 'raw_outputs'] + Default: ['inputs', 'outputs', 'node_inputs', 'node_outputs'] """ if output_path is None: - output_path = self.generate_default_dump_path(process_node=process_node) + output_path = self._generate_default_dump_path(process_node=process_node) - try: - self.validate_make_dump_path(validate_path=output_path) - except: - raise + self._validate_make_dump_path(validate_path=output_path) - # This seems a bit duplicated, but if the logic for checking the types should be contained in the recursive - # `dump` function called by `verdi`, then I need to dump for the `CalculationNode` here already, as well. 
 if isinstance(process_node, CalculationNode):
             self._dump_calculation(
                 calculation_node=process_node,
                 output_path=output_path,
                 io_dump_paths=io_dump_paths,
             )
-            # Don't actually to call `self.dump_node_yaml` here, as this is done inside `self._dump_calculation`
         elif isinstance(process_node, WorkflowNode):
-            # Can call `self.dump_node_yaml` here before the actual dump, as file dumping happens in child directories
-            self.dump_node_yaml(process_node=process_node, output_path=output_path)
-
-            called_links = process_node.base.links.get_outgoing(
-                link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)
-            ).all()
-
-            sorted_called_links = sorted(called_links, key=lambda link_triple: link_triple.node.ctime)
-
-            for index, link_triple in enumerate(sorted_called_links, start=1):
-                child_node = link_triple.node
-                child_label = self.generate_child_node_label(index=index, link_triple=link_triple)
-                child_output_path = output_path.resolve() / child_label
-
-                # Recursive function call for `WorkFlowNode``
-                if isinstance(child_node, WorkflowNode):
-                    self.dump(
-                        process_node=child_node,
-                        output_path=child_output_path,
-                        io_dump_paths=io_dump_paths,
-                    )
-
-                # Once a `CalculationNode` as child reached, dump it
-                elif isinstance(child_node, CalculationNode):
-                    self._dump_calculation(
-                        calculation_node=child_node,
-                        output_path=child_output_path,
-                        io_dump_paths=io_dump_paths,
-                    )
+            self._dump_workflow(
+                workflow_node=process_node,
+                output_path=output_path,
+                io_dump_paths=io_dump_paths,
+            )
+
+        self._generate_readme(process_node=process_node, output_path=output_path)
+
+        return output_path
+
+    def _dump_workflow(self, workflow_node: WorkflowNode, output_path: Path, io_dump_paths: list | None = None) -> None:
+        self._validate_make_dump_path(validate_path=output_path)
+
+        self._dump_node_yaml(process_node=workflow_node, output_path=output_path)
+
+        called_links = workflow_node.base.links.get_outgoing(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all()
+
+        sorted_called_links = sorted(called_links, key=lambda link_triple: link_triple.node.ctime)
+
+        for index, link_triple in enumerate(sorted_called_links, start=1):
+            child_node = link_triple.node
+            child_label = self._generate_child_node_label(index=index, link_triple=link_triple)
+            child_output_path = output_path.resolve() / child_label
+
+            # Recursive function call for `WorkflowNode`
+            if isinstance(child_node, WorkflowNode):
+                self._dump_workflow(
+                    workflow_node=child_node,
+                    output_path=child_output_path,
+                    io_dump_paths=io_dump_paths,
+                )
+
+            # Once a `CalculationNode` is reached as a child, dump it
+            elif isinstance(child_node, CalculationNode):
+                self._dump_calculation(
+                    calculation_node=child_node,
+                    output_path=child_output_path,
+                    io_dump_paths=io_dump_paths,
+                )

     def _dump_calculation(
         self,
         calculation_node: CalculationNode,
-        output_path: Path | None,
+        output_path: Path,
         io_dump_paths: list | None = None,
     ) -> None:
         """Dump the contents of a `CalculationNode` to a specified output path.

         :param calculation_node: The `CalculationNode` to be dumped.
         :param output_path: The path where the files will be dumped.
         :param io_dump_paths: Subdirectories created for the `CalculationNode`.
- Default: ['raw_inputs', 'extra_inputs', 'raw_outputs'] + Default: ['inputs', 'outputs', 'node_inputs', 'node_outputs'] """ - if output_path is None: - output_path = self.generate_default_dump_path(process_node=calculation_node) - - try: - self.validate_make_dump_path(validate_path=output_path) - except: - # raise same exception here to communicate it outwards - raise + self._validate_make_dump_path(validate_path=output_path) - self.dump_node_yaml(process_node=calculation_node, output_path=output_path) + self._dump_node_yaml(process_node=calculation_node, output_path=output_path) - io_dump_mapping = self.generate_calculation_io_mapping(io_dump_paths=io_dump_paths) + io_dump_mapping = self._generate_calculation_io_mapping(io_dump_paths=io_dump_paths) # Dump the repository contents of the node calculation_node.base.repository.copy_tree(output_path.resolve() / io_dump_mapping.repository) - # Dump the extra_inputs - if self.include_node_inputs: - input_link_manager = calculation_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC) - self._dump_calculation_io(parent_path=output_path / io_dump_mapping.inputs, link_manager=input_link_manager) + # Dump the repository contents of `outputs.retrieved` + try: + calculation_node.outputs.retrieved.base.repository.copy_tree( + output_path.resolve() / io_dump_mapping.retrieved + ) + except NotExistentAttributeError: + pass - # Dump the raw_outputs - output_link_manager = calculation_node.base.links.get_outgoing(link_type=LinkType.CREATE) - self._dump_calculation_io( - parent_path=output_path / io_dump_mapping.outputs, - link_manager=output_link_manager, - ) + # Dump the node_inputs + if self.include_inputs: + input_links = list(calculation_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC)) - def _dump_calculation_io(self, parent_path: Path, link_manager: LinkManager): - for link_triple in link_manager: - node = link_triple.node - if isinstance(node, FILE_CLASSES) or any(issubclass(type(node), cls) for cls in FILE_CLASSES): - file_node_path = self.generate_link_triple_dump_path( - link_triple=link_triple, - parent_path=parent_path, - ) + self._dump_calculation_io(parent_path=output_path / io_dump_mapping.inputs, link_triples=input_links) - # No .resolve() required as that done in `generate_link_triple_dump_path` - link_triple.node.base.repository.copy_tree(file_node_path) + # Dump the node_outputs apart from `retrieved` + if self.include_outputs: + output_links = list(calculation_node.base.links.get_outgoing(link_type=LinkType.CREATE)) + output_links = [output_link for output_link in output_links if output_link.link_label != 'retrieved'] + + self._dump_calculation_io( + parent_path=output_path / io_dump_mapping.outputs, + link_triples=output_links, + ) + def _dump_calculation_io(self, parent_path: Path, link_triples: list): + for link_triple in link_triples: + link_label = link_triple.link_label + + if not self.flat: + linked_node_path = parent_path / Path(*link_label.split('__')) else: - aiida_node_path = self.generate_link_triple_dump_path( - link_triple=link_triple, - parent_path=parent_path / '.aiida_nodes', - ) + # Don't use link_label at all -> But, relative path inside FolderData is retained + linked_node_path = parent_path + + link_triple.node.base.repository.copy_tree(linked_node_path.resolve()) - # This is again QE specific, but, frankly, I don't know how to otherwise separate the pseudos from the - # rest of the AiiDA-nodes, as the pseudos are of `Data` type (why not UpfData?), so I cannot distinguish - # them from other 
AiiDA-nodes, such as ArrayData which we definitely want in the hidden `.aiida_nodes` - # subdirectory - # So if anybody has a better solution, I'd be happy to use that - # The problem might be void, though, once all the atomistic code is moved to `aiida-atomistic` - if node.node_type == 'data.pseudo.upf.UpfData.': - link_triple.node.base.repository.copy_tree(Path(str(aiida_node_path).replace('.aiida_nodes', ''))) - elif self.all_aiida_nodes: - link_triple.node.base.repository.copy_tree(aiida_node_path) - - def validate_make_dump_path(self, validate_path: Path, safeguard_file: str = '.aiida_node_metadata.yaml') -> Path: + def _validate_make_dump_path(self, validate_path: Path, safeguard_file: str = '.aiida_node_metadata.yaml') -> Path: """Create default dumping directory for a given process node and return it as absolute path. :param validate_path: Path to validate for dumping. @@ -231,7 +320,7 @@ def validate_make_dump_path(self, validate_path: Path, safeguard_file: str = '.a # Check for safeguard file ('.aiida_node_metadata.yaml') for safety # If absent -> Don't remove directory as to not accidentally remove a wrong one else: - raise FileExistsError( + raise Exception( f"Path `{validate_path}` already exists and doesn't contain safeguard file {safeguard_file}." f'Not removing for safety reasons.' ) @@ -242,40 +331,20 @@ def validate_make_dump_path(self, validate_path: Path, safeguard_file: str = '.a return validate_path.resolve() - def generate_default_dump_path(self, process_node: ProcessNode | None) -> Path: - """Simple helper function to generate the default parent-dumping directory if none given. - - This function is not called for the recursive sub-calls of `_dump_calculation` as it just creates the default - parent folder for the dumping, if no name is given. - - :param process_node: The `ProcessNode` for which the directory is created. - :return: The absolute default parent dump path. - """ - - if process_node is None: - raise TypeError('`process_node` must be provided for generating the default path.') - else: - pk = process_node.pk - try: - return Path(f'dump-{process_node.process_label}-{pk}') - except AttributeError: - # This case came up during testing, not sure how relevant it actually is - return Path(f'dump-{process_node.process_type}-{pk}') - - def generate_calculation_io_mapping(self, io_dump_paths: Optional[List[Any]] = None) -> SimpleNamespace: + def _generate_calculation_io_mapping(self, io_dump_paths: Optional[List[Any]] = None) -> SimpleNamespace: """Helper function to generate mapping for entities dumped for each `CalculationNode`. This is to avoid exposing AiiDA terminology, like `repository` to the user, while keeping track of which entities should be dumped into which directory, and allowing for alternative directory names. :param io_dump_paths: Subdirectories created for the `CalculationNode`. - Default: ['raw_inputs', 'extra_inputs', 'raw_outputs'] + Default: ['inputs', 'outputs', 'node_inputs', 'node_outputs'] :return: SimpleNamespace mapping. 
""" - aiida_entities_to_dump = ['repository', 'inputs', 'outputs'] - default_calculation_io_dump_paths = ['raw_inputs', 'extra_inputs', 'raw_outputs'] - empty_calculation_io_dump_paths = [''] * 3 + aiida_entities_to_dump = ['repository', 'retrieved', 'inputs', 'outputs'] + default_calculation_io_dump_paths = ['inputs', 'outputs', 'node_inputs', 'node_outputs'] + empty_calculation_io_dump_paths = [''] * 4 if self.flat and io_dump_paths is None: LOGGER.info( @@ -291,9 +360,7 @@ def generate_calculation_io_mapping(self, io_dump_paths: Optional[List[Any]] = N return SimpleNamespace(**dict(zip(aiida_entities_to_dump, default_calculation_io_dump_paths))) elif self.flat and io_dump_paths is not None: - LOGGER.info( - 'Flat set to True but `io_dump_paths` provided. These will be used, but `extra_inputs` not nested.' - ) + LOGGER.info('Flat set to True but `io_dump_paths` provided. These will be used, but `inputs` not nested.') return SimpleNamespace(**dict(zip(aiida_entities_to_dump, io_dump_paths))) else: LOGGER.info( @@ -301,63 +368,7 @@ def generate_calculation_io_mapping(self, io_dump_paths: Optional[List[Any]] = N ) return SimpleNamespace(**dict(zip(aiida_entities_to_dump, io_dump_paths))) # type: ignore[arg-type] - def generate_link_triple_dump_path(self, link_triple: LinkTriple, parent_path: Path) -> Path: - """Create subdirectory paths for the `CalculationNode` linked I/O Nodes. - - :param link_triple: LinkTriple representing I/O nodes of `CalculationNode`. - :param parent_path: Parent directory for `CalculationNode` dump. - :return: The absolute dumping subdirectory for `CalculationNode` I/O. - """ - node = link_triple.node - link_label = link_triple.link_label - # For convenience, remove the 'retrieved' subdirectory for the outputs - link_label = link_label.replace('retrieved', '') - - if isinstance(node, FILE_CLASSES): - if not self.flat: - input_node_path = parent_path / Path(*link_label.split('__')) - else: - # Don't use link_label at all -> But, relative path inside FolderData is retained - input_node_path = parent_path - elif not self.flat: - input_node_path = parent_path / Path(*link_label.split('__')) - else: - # Don't use link_label at all -> But, relative path inside FolderData is retained - input_node_path = parent_path - - return input_node_path.resolve() - - def generate_child_node_label(self, index: int, link_triple: LinkTriple) -> str: - """Small helper function to generate and clean directory label for child nodes during recursion. - - :param index: Index assigned to step at current level of recursion. - :param link_triple: `LinkTriple` of `ProcessNode` explored during recursion. - :return: Chlild node label during recursion. 
- """ - node = link_triple.node - link_label = link_triple.link_label - - # Generate directories with naming scheme akin to `verdi process status` - label_list = [f'{index:02d}', link_label] - - try: - process_label = node.process_label - if process_label is not None and process_label != link_label: - label_list += [process_label] - - except AttributeError: - process_type = node.process_type - if process_type is not None and process_type != link_label: - label_list += [process_type] - - node_label = '-'.join(label_list) - # `CALL-` as part of the link labels also for MultiplyAddWorkChain -> Seems general enough, so remove - node_label = node_label.replace('CALL-', '') - node_label = node_label.replace('None-', '') - - return node_label - - def dump_node_yaml( + def _dump_node_yaml( self, process_node: ProcessNode, output_path: Path, @@ -429,70 +440,3 @@ def dump_node_yaml( output_file = output_path.resolve() / output_filename with open(output_file, 'w') as handle: yaml.dump(node_dict, handle, sort_keys=False) - - # ? Add type hints here? Would require loading from ORM in header of `cmd_` file -> Might fail CLI time validation - def generate_parent_readme(self): - """Generate README file in main dumping directory. - - :param process_node: CalcJob or WorkChain Node. - :param output_path: Output path for dumping. - - """ - - import textwrap - - from aiida.cmdline.utils.ascii_vis import format_call_graph - from aiida.cmdline.utils.common import ( - get_calcjob_report, - get_node_info, - get_process_function_report, - get_workchain_report, - ) - - if self.parent_process is None or self.parent_path is None: - raise TypeError('parent_process and parent_path must be set before README can be created.') - - _readme_string = textwrap.dedent( - f"""\ - This directory contains the files involved in the calculation/workflow - `{self.parent_process.process_label} <{self.parent_process.pk}>` run with AiiDA. - - Child calculations/workflows (also called `CalcJob`s/`CalcFunction`s and `WorkChain`s/`WorkFunction`s in AiiDA - jargon) run by the parent workflow are contained in the directory tree as sub-folders and are sorted by their - creation time. The directory tree thus mirrors the logical execution of the workflow, which can also be queried - by running `verdi process status {self.parent_process.pk}` on the command line. - - By default, input and output files of each simulation can be found in the corresponding "raw_inputs" and - "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job - execution settings). Additional input files (depending on the type of calculation) are placed in the - "extra_inputs". 
- - Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant - AiiDA node data for further inspection.""" - ) - - # `verdi process status` - process_status = format_call_graph(calc_node=self.parent_process, max_depth=None, call_link_label=True) - _readme_string += f'\n\n\nOutput of `verdi process status {self.parent_process.pk}:`\n\n{process_status}' - - # `verdi process report` - # Copied over from `cmd_process` - if isinstance(self.parent_process, CalcJobNode): - process_report = get_calcjob_report(self.parent_process) - elif isinstance(self.parent_process, WorkChainNode): - process_report = get_workchain_report( - self.parent_process, levelname='REPORT', indent_size=2, max_depth=None - ) - elif isinstance(self.parent_process, (CalcFunctionNode, WorkFunctionNode)): - process_report = get_process_function_report(self.parent_process) - else: - process_report = f'Nothing to show for node type {self.parent_process.__class__}' - - _readme_string += f'\n\n\nOutput of `verdi process report {self.parent_process.pk}`:\n\n{process_report}' - - # `verdi process show`? - process_show = get_node_info(node=self.parent_process) - _readme_string += f'\n\n\nOutput of `verdi process show {self.parent_process.pk}`:\n\n{process_show}' - - with (self.parent_path / 'README').open('w') as handle: - handle.write(_readme_string) diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index c0f89991b7..f364113b82 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -27,7 +27,7 @@ filecontent = 'a' raw_inputs_relpath = Path('raw_inputs') raw_outputs_relpath = Path('raw_outputs') -node_inputs_relpath = Path('extra_inputs') +node_inputs_relpath = Path('inputs') default_dump_paths = [raw_inputs_relpath, node_inputs_relpath, raw_outputs_relpath] custom_dump_paths = [f'{path}_' for path in default_dump_paths] @@ -125,7 +125,7 @@ def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path # wc-dump-test-io # └── 01-sub_workflow # ├── 01-calculation - # │ ├── extra_inputs + # │ ├── inputs # │ │ ├── folderdata # │ │ │ └── relative_path # │ │ │ └── file.txt @@ -134,7 +134,7 @@ def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path # │ └── raw_inputs # │ └── file.txt # └── 02-calculation - # ├── extra_inputs + # ├── inputs # │ ├── folderdata # │ │ └── relative_path # │ │ └── file.txt @@ -154,8 +154,8 @@ def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path # Don't test for `README` here, as this is only created when dumping is done via `verdi` # But, check for .aiida_node_metadata.yaml raw_input_path = '01-sub_workflow/01-calculation/raw_inputs/file.txt' - singlefiledata_path = '01-sub_workflow/01-calculation/extra_inputs/singlefile/file.txt' - folderdata_path = '01-sub_workflow/01-calculation/extra_inputs/folderdata/relative_path/file.txt' + singlefiledata_path = '01-sub_workflow/01-calculation/inputs/singlefile/file.txt' + folderdata_path = '01-sub_workflow/01-calculation/inputs/folderdata/relative_path/file.txt' node_metadata_paths = [ node_metadata_file, f'01-sub_workflow/{node_metadata_file}', @@ -282,7 +282,7 @@ def test_dump_calculation_node(tmp_path, generate_calculation_node_io): # Normal dumping -> node_inputs and not flat; no paths provided # Expected tree: # cj-dump-test-io - # ├── extra_inputs + # ├── inputs # │ ├── folderdata # │ │ └── relative_path # │ │ └── file.txt @@ -326,7 +326,7 @@ def 
test_dump_calculation_custom(tmp_path, generate_calculation_node_io): # Normal dumping -> node_inputs and not flat; custom paths provided # Expected tree: # cj-dump-test-io - # ├── extra_inputs_ + # ├── inputs_ # │ ├── folderdata # │ │ └── relative_path # │ │ └── file.txt @@ -391,7 +391,7 @@ def test_dump_calculation_overwrite(tmp_path, generate_calculation_node_io): def test_dump_calculation_no_inputs(tmp_path, generate_calculation_node_io): dump_parent_path = tmp_path / 'cj-dump-test-noinputs' - process_dumper = ProcessDumper(include_node_inputs=False) + process_dumper = ProcessDumper(include_inputs=False) calculation_node = generate_calculation_node_io() process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) assert not (dump_parent_path / node_inputs_relpath).is_dir() @@ -434,17 +434,17 @@ def test_validate_make_dump_path(chdir_tmp_path, tmp_path): # Path must be provided process_dumper = ProcessDumper() with pytest.raises(TypeError): - process_dumper.validate_make_dump_path() + process_dumper._validate_make_dump_path() # Check if path created if non-existent - output_path = process_dumper.validate_make_dump_path(validate_path=test_dir) + output_path = process_dumper._validate_make_dump_path(validate_path=test_dir) assert output_path == test_dir_abs clean_tmp_path(tmp_path=tmp_path) # Empty path is fine -> No error and full path returned test_dir_abs.mkdir() - output_path = process_dumper.validate_make_dump_path(validate_path=test_dir) + output_path = process_dumper._validate_make_dump_path(validate_path=test_dir) assert output_path == test_dir_abs clean_tmp_path(tmp_path=tmp_path) @@ -453,7 +453,7 @@ def test_validate_make_dump_path(chdir_tmp_path, tmp_path): test_dir_abs.mkdir() (test_dir_abs / filename).touch() with pytest.raises(FileExistsError): - output_path = process_dumper.validate_make_dump_path(validate_path=test_dir) + output_path = process_dumper._validate_make_dump_path(validate_path=test_dir) assert (test_dir_abs / filename).is_file() clean_tmp_path(tmp_path=tmp_path) @@ -463,7 +463,7 @@ def test_validate_make_dump_path(chdir_tmp_path, tmp_path): test_dir_abs.mkdir() (test_dir_abs / filename).touch() with pytest.raises(FileExistsError): - output_path = process_dumper.validate_make_dump_path(validate_path=test_dir) + output_path = process_dumper._validate_make_dump_path(validate_path=test_dir) assert (test_dir_abs / filename).is_file() clean_tmp_path(tmp_path=tmp_path) @@ -471,7 +471,7 @@ def test_validate_make_dump_path(chdir_tmp_path, tmp_path): # Works if directory not empty, but overwrite=True and safeguard_file (e.g. 
`.aiida_node_metadata.yaml`) contained test_dir_abs.mkdir() (test_dir_abs / safeguard_file).touch() - output_path = process_dumper.validate_make_dump_path(validate_path=test_dir, safeguard_file=safeguard_file) + output_path = process_dumper._validate_make_dump_path(validate_path=test_dir, safeguard_file=safeguard_file) assert output_path == test_dir_abs assert not (test_dir_abs / safeguard_file).is_file() @@ -483,8 +483,8 @@ def test_generate_default_dump_path( process_dumper = ProcessDumper() add_node = generate_calculation_node_add() multiply_add_node = generate_workchain_multiply_add() - add_path = process_dumper.generate_default_dump_path(process_node=add_node) - multiply_add_path = process_dumper.generate_default_dump_path(process_node=multiply_add_node) + add_path = process_dumper._generate_default_dump_path(process_node=add_node) + multiply_add_path = process_dumper._generate_default_dump_path(process_node=multiply_add_node) assert str(add_path) == f'dump-ArithmeticAddCalculation-{add_node.pk}' assert str(multiply_add_path) == f'dump-MultiplyAddWorkChain-{multiply_add_node.pk}' @@ -492,14 +492,14 @@ def test_generate_default_dump_path( def test_generate_calculation_io_mapping(): process_dumper = ProcessDumper() - calculation_io_mapping = process_dumper.generate_calculation_io_mapping() + calculation_io_mapping = process_dumper._generate_calculation_io_mapping() assert calculation_io_mapping.repository == 'raw_inputs' - assert calculation_io_mapping.inputs == 'extra_inputs' + assert calculation_io_mapping.inputs == 'inputs' assert calculation_io_mapping.outputs == 'raw_outputs' - calculation_io_mapping = process_dumper.generate_calculation_io_mapping(io_dump_paths=custom_dump_paths) + calculation_io_mapping = process_dumper._generate_calculation_io_mapping(io_dump_paths=custom_dump_paths) assert calculation_io_mapping.repository == 'raw_inputs_' - assert calculation_io_mapping.inputs == 'extra_inputs_' + assert calculation_io_mapping.inputs == 'inputs_' assert calculation_io_mapping.outputs == 'raw_outputs_' @@ -514,7 +514,7 @@ def test_generate_link_triple_dump_path(generate_calculation_node_io, generate_w process_dumper = ProcessDumper() output_paths = [ - tmp_path / process_dumper.generate_link_triple_dump_path(link_triple=output_triple, parent_path=tmp_path) + tmp_path / process_dumper._generate_link_triple_dump_path(link_triple=output_triple, parent_path=tmp_path) for output_triple in output_triples ] # 'sub_workflow' doesn't have a repository, so it is placed under '.aiida_nodes' @@ -538,7 +538,7 @@ def test_generate_child_node_label( output_paths = sorted( [ - process_dumper.generate_child_node_label(index, output_node) + process_dumper._generate_child_node_label(index, output_node) for index, output_node in enumerate(output_triples) ] ) @@ -550,7 +550,7 @@ def test_generate_child_node_label( # Sort by ctime here, not mtime, as I'm generating the WorkChain normally output_triples = sorted(output_triples, key=lambda link_triple: link_triple.node.ctime) output_paths = sorted( - [process_dumper.generate_child_node_label(_, output_node) for _, output_node in enumerate(output_triples)] + [process_dumper._generate_child_node_label(_, output_node) for _, output_node in enumerate(output_triples)] ) assert output_paths == ['00-multiply', '01-ArithmeticAddCalculation', '02-result'] @@ -558,13 +558,13 @@ def test_generate_child_node_label( def test_dump_node_yaml(generate_calculation_node_io, tmp_path, generate_workchain_multiply_add): process_dumper = ProcessDumper() cj_node = 
generate_calculation_node_io(attach_outputs=False) - process_dumper.dump_node_yaml(process_node=cj_node, output_path=tmp_path) + process_dumper._dump_node_yaml(process_node=cj_node, output_path=tmp_path) assert (tmp_path / node_metadata_file).is_file() # Test with multiply_add wc_node = generate_workchain_multiply_add() - process_dumper.dump_node_yaml(process_node=wc_node, output_path=tmp_path) + process_dumper._dump_node_yaml(process_node=wc_node, output_path=tmp_path) assert (tmp_path / node_metadata_file).is_file() @@ -582,7 +582,7 @@ def test_dump_node_yaml(generate_calculation_node_io, tmp_path, generate_workcha process_dumper = ProcessDumper(include_attributes=False, include_extras=False) - process_dumper.dump_node_yaml(process_node=wc_node, output_path=tmp_path) + process_dumper._dump_node_yaml(process_node=wc_node, output_path=tmp_path) # Open the dumped YAML file and read its contents with open(tmp_path / node_metadata_file, 'r') as dumped_file: From ac2acb4ef649b32b665e2bb192decc653e6c8e06 Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 16 May 2024 20:56:21 +0200 Subject: [PATCH 26/30] Updated tests --- src/aiida/tools/dumping/processes.py | 18 +- tests/tools/dumping/test_processes.py | 280 ++++++++------------------ 2 files changed, 95 insertions(+), 203 deletions(-) diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 1b47807f18..3007d11ed8 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -23,6 +23,7 @@ CalcFunctionNode, CalcJobNode, CalculationNode, + LinkManager, ProcessNode, WorkChainNode, WorkflowNode, @@ -202,6 +203,12 @@ def dump( return output_path def _dump_workflow(self, workflow_node: WorkflowNode, output_path: Path, io_dump_paths: list | None = None) -> None: + """Recursive function to traverse a ``WorkflowNode`` and dump its ``CalculationNode``s. + + :param workflow_node: ``WorkflowNode`` to be traversed. Will be updated during recursion. + :param output_path: Dumping parent directory. Will be updated during recursion. + :param io_dump_paths: Custom subdirectories for ``CalculationNode``s, defaults to None + """ self._validate_make_dump_path(validate_path=output_path) self._dump_node_yaml(process_node=workflow_node, output_path=output_path) @@ -264,8 +271,7 @@ def _dump_calculation( # Dump the node_inputs if self.include_inputs: - input_links = list(calculation_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC)) - + input_links = calculation_node.base.links.get_incoming(link_type=LinkType.INPUT_CALC) self._dump_calculation_io(parent_path=output_path / io_dump_mapping.inputs, link_triples=input_links) # Dump the node_outputs apart from `retrieved` @@ -278,7 +284,13 @@ def _dump_calculation( link_triples=output_links, ) - def _dump_calculation_io(self, parent_path: Path, link_triples: list): + def _dump_calculation_io(self, parent_path: Path, link_triples: LinkManager | List[LinkTriple]): + """Small helper function to dump linked input/output nodes of ``CalculationNode``s. + + :param parent_path: Parent directory for dumping the linked node contents. + :param link_triples: List of link triples. 
+ """ + for link_triple in link_triples: link_label = link_triple.link_label diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index f364113b82..ffc281f164 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -25,10 +25,11 @@ # Non-AiiDA variables filename = 'file.txt' filecontent = 'a' -raw_inputs_relpath = Path('raw_inputs') -raw_outputs_relpath = Path('raw_outputs') -node_inputs_relpath = Path('inputs') -default_dump_paths = [raw_inputs_relpath, node_inputs_relpath, raw_outputs_relpath] +inputs_relpath = Path('inputs') +outputs_relpath = Path('outputs') +node_inputs_relpath = Path('node_inputs') +node_outputs_relpath = Path('node_outputs') +default_dump_paths = [inputs_relpath, outputs_relpath, node_inputs_relpath, node_outputs_relpath] custom_dump_paths = [f'{path}_' for path in default_dump_paths] # Define variables used for constructing the nodes used to test the dumping @@ -108,6 +109,11 @@ def _generate_workchain_node_io(cj_nodes, store_all: bool = True): for cj_node in cj_nodes: cj_node.base.links.add_incoming(wc_node_sub, link_type=LinkType.CALL_CALC, link_label='calculation') + # Set process_state so that tests don't throw exception for build_call_graph of README generation + [cj_node.set_process_state('finished') for cj_node in cj_nodes] + wc_node.set_process_state('finished') + wc_node_sub.set_process_state('finished') + # Need to store so that outputs are being dumped if store_all: wc_node.store() @@ -119,43 +125,34 @@ def _generate_workchain_node_io(cj_nodes, store_all: bool = True): return _generate_workchain_node_io -# Tests for entry-point dump method +# Only test top-level actions, like path and README creation +# Other things tested via `_dump_workflow` and `_dump_calculation` def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path): - # Expected tree: - # wc-dump-test-io - # └── 01-sub_workflow - # ├── 01-calculation - # │ ├── inputs - # │ │ ├── folderdata - # │ │ │ └── relative_path - # │ │ │ └── file.txt - # │ │ └── singlefile - # │ │ └── file.txt - # │ └── raw_inputs - # │ └── file.txt - # └── 02-calculation - # ├── inputs - # │ ├── folderdata - # │ │ └── relative_path - # │ │ └── file.txt - # │ └── singlefile - # │ └── file.txt - # └── raw_inputs - # └── file.txt + dump_parent_path = tmp_path / 'wc-dump-test-io' + process_dumper = ProcessDumper() + # Don't attach outputs, as it would require storing the calculation_node and then it cannot be used in the workchain + cj_nodes = [generate_calculation_node_io(attach_outputs=False), generate_calculation_node_io(attach_outputs=False)] + wc_node = generate_workchain_node_io(cj_nodes=cj_nodes) + return_path = process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) + assert dump_parent_path.is_dir() + assert (dump_parent_path / 'README').is_file() + assert return_path == dump_parent_path + + +def test_dump_workflow(generate_calculation_node_io, generate_workchain_node_io, tmp_path): # Need to generate parent path for dumping, as I don't want the sub-workchains to be dumped directly into `tmp_path` - dump_parent_path = tmp_path / 'wc-dump-test-io' + dump_parent_path = tmp_path / 'wc-workflow_dump-test-io' process_dumper = ProcessDumper() # Don't attach outputs, as it would require storing the calculation_node and then it cannot be used in the workchain cj_nodes = [generate_calculation_node_io(attach_outputs=False), generate_calculation_node_io(attach_outputs=False)] wc_node = 
generate_workchain_node_io(cj_nodes=cj_nodes) - process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) + process_dumper._dump_workflow(workflow_node=wc_node, output_path=dump_parent_path) - # Don't test for `README` here, as this is only created when dumping is done via `verdi` - # But, check for .aiida_node_metadata.yaml - raw_input_path = '01-sub_workflow/01-calculation/raw_inputs/file.txt' - singlefiledata_path = '01-sub_workflow/01-calculation/inputs/singlefile/file.txt' - folderdata_path = '01-sub_workflow/01-calculation/inputs/folderdata/relative_path/file.txt' + input_path = '01-sub_workflow/01-calculation/inputs/file.txt' + singlefiledata_path = '01-sub_workflow/01-calculation/node_inputs/singlefile/file.txt' + folderdata_path = '01-sub_workflow/01-calculation/node_inputs/folderdata/relative_path/file.txt' + arraydata_path = '01-sub_workflow/01-calculation/node_inputs/arraydata/default.npy' node_metadata_paths = [ node_metadata_file, f'01-sub_workflow/{node_metadata_file}', @@ -163,32 +160,18 @@ def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path f'01-sub_workflow/02-calculation/{node_metadata_file}', ] - expected_files = [raw_input_path, singlefiledata_path, folderdata_path, *node_metadata_paths] + expected_files = [input_path, singlefiledata_path, folderdata_path, arraydata_path, *node_metadata_paths] expected_files = [dump_parent_path / expected_file for expected_file in expected_files] assert all([expected_file.is_file() for expected_file in expected_files]) - # clean_tmp_path(dump_parent_path) - # Flat dumping - - # Expected tree: - # wc-dump-test-io-flat - # └── 01-sub_workflow - # ├── 01-calculation - # │ ├── file.txt - # │ └── relative_path - # │ └── file.txt - # └── 02-calculation - # ├── file.txt - # └── relative_path - # └── file.txt - dump_parent_path = tmp_path / 'wc-dump-test-io-flat' process_dumper = ProcessDumper(flat=True) - process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) + process_dumper._dump_workflow(workflow_node=wc_node, output_path=dump_parent_path) - raw_input_path = '01-sub_workflow/01-calculation/file.txt' + input_path = '01-sub_workflow/01-calculation/file.txt' + arraydata_path = '01-sub_workflow/01-calculation/default.npy' folderdata_path = '01-sub_workflow/01-calculation/relative_path/file.txt' node_metadata_paths = [ node_metadata_file, @@ -197,62 +180,34 @@ def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path f'01-sub_workflow/02-calculation/{node_metadata_file}', ] - expected_files = [raw_input_path, folderdata_path, *node_metadata_paths] + expected_files = [input_path, folderdata_path, arraydata_path, *node_metadata_paths] expected_files = [dump_parent_path / expected_file for expected_file in expected_files] assert all([expected_file.is_file() for expected_file in expected_files]) def test_dump_multiply_add(tmp_path, generate_workchain_multiply_add): - # Expected tree: - # wc-dump-test-multiply-add - # ├── 01-multiply - # │ └── raw_inputs - # │ └── source_file - # └── 02-ArithmeticAddCalculation - # ├── raw_inputs - # │ ├── _aiidasubmit.sh - # │ └── aiida.in - # └── raw_outputs - # ├── _scheduler-stderr.txt - # ├── _scheduler-stdout.txt - # └── aiida.out - dump_parent_path = tmp_path / 'wc-dump-test-multiply-add' process_dumper = ProcessDumper() wc_node = generate_workchain_multiply_add() process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) - raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', 
'.aiida/calcinfo.json'] - raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] - raw_input_files = [ - dump_parent_path / '02-ArithmeticAddCalculation' / raw_inputs_relpath / raw_input_file - for raw_input_file in raw_input_files + input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json'] + output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out'] + input_files = [ + dump_parent_path / '02-ArithmeticAddCalculation' / inputs_relpath / input_file for input_file in input_files ] - raw_input_files += [dump_parent_path / '01-multiply' / raw_inputs_relpath / 'source_file'] - raw_output_files = [ - dump_parent_path / '02-ArithmeticAddCalculation' / raw_outputs_relpath / raw_output_file - for raw_output_file in raw_output_files + input_files += [dump_parent_path / '01-multiply' / inputs_relpath / 'source_file'] + output_files = [ + dump_parent_path / '02-ArithmeticAddCalculation' / outputs_relpath / output_file for output_file in output_files ] # No node_inputs contained in MultiplyAddWorkChain - - assert all([raw_input_file.is_file() for raw_input_file in raw_input_files]) - assert all([raw_output_file.is_file() for raw_output_file in raw_output_files]) + assert all([input_file.is_file() for input_file in input_files]) + assert all([output_file.is_file() for output_file in output_files]) # Flat dumping - # Expected tree: - # wc-dump-test-multiply-add-flat - # ├── 01-multiply - # │ └── source_file - # └── 02-ArithmeticAddCalculation - # ├── _aiidasubmit.sh - # ├── _scheduler-stderr.txt - # ├── _scheduler-stdout.txt - # ├── aiida.in - # └── aiida.out - dump_parent_path = tmp_path / 'wc-dump-test-multiply-add-flat' process_dumper = ProcessDumper(flat=True) process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) @@ -280,101 +235,49 @@ def test_dump_multiply_add(tmp_path, generate_workchain_multiply_add): def test_dump_calculation_node(tmp_path, generate_calculation_node_io): # Checking the actual content should be handled by `test_copy_tree` # Normal dumping -> node_inputs and not flat; no paths provided - # Expected tree: - # cj-dump-test-io - # ├── inputs - # │ ├── folderdata - # │ │ └── relative_path - # │ │ └── file.txt - # │ └── singlefile - # │ └── file.txt - # ├── raw_inputs - # │ └── file.txt - # └── raw_outputs - # ├── folderdata - # │ └── relative_path - # │ └── file.txt - # └── singlefile - # └── file.txt dump_parent_path = tmp_path / 'cj-dump-test-io' - process_dumper = ProcessDumper() + process_dumper = ProcessDumper(include_outputs=True) calculation_node = generate_calculation_node_io() process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) - assert (dump_parent_path / raw_inputs_relpath / filename).is_file() + assert (dump_parent_path / inputs_relpath / filename).is_file() assert (dump_parent_path / node_inputs_relpath / singlefiledata_linklabel / filename).is_file() assert (dump_parent_path / node_inputs_relpath / folderdata_test_path / filename).is_file() - assert (dump_parent_path / raw_outputs_relpath / singlefiledata_linklabel / filename).is_file() - assert (dump_parent_path / raw_outputs_relpath / folderdata_test_path / filename).is_file() + assert (dump_parent_path / node_inputs_relpath / arraydata_linklabel / 'default.npy').is_file() + + assert (dump_parent_path / node_outputs_relpath / singlefiledata_linklabel / filename).is_file() + assert (dump_parent_path / node_outputs_relpath / folderdata_test_path / filename).is_file() # Check 
contents once - with open(dump_parent_path / raw_inputs_relpath / filename, 'r') as handle: + with open(dump_parent_path / inputs_relpath / filename, 'r') as handle: assert handle.read() == filecontent with open(dump_parent_path / node_inputs_relpath / singlefiledata_linklabel / filename) as handle: assert handle.read() == filecontent with open(dump_parent_path / node_inputs_relpath / folderdata_test_path / filename) as handle: assert handle.read() == filecontent - with open(dump_parent_path / raw_outputs_relpath / singlefiledata_linklabel / filename) as handle: + with open(dump_parent_path / node_outputs_relpath / singlefiledata_linklabel / filename) as handle: assert handle.read() == filecontent - with open(dump_parent_path / raw_outputs_relpath / folderdata_test_path / filename) as handle: + with open(dump_parent_path / node_outputs_relpath / folderdata_test_path / filename) as handle: assert handle.read() == filecontent -# Could probably be removed with proper testing of the mapping -def test_dump_calculation_custom(tmp_path, generate_calculation_node_io): - # Normal dumping -> node_inputs and not flat; custom paths provided - # Expected tree: - # cj-dump-test-io - # ├── inputs_ - # │ ├── folderdata - # │ │ └── relative_path - # │ │ └── file.txt - # │ └── singlefile - # │ └── file.txt - # ├── raw_inputs_ - # │ └── file.txt - # └── raw_outputs_ - # ├── folderdata - # │ └── relative_path - # │ └── file.txt - # └── singlefile - # └── file.txt - - dump_parent_path = tmp_path / 'cj-dump-test-custom' - process_dumper = ProcessDumper() - calculation_node = generate_calculation_node_io() - process_dumper._dump_calculation( - calculation_node=calculation_node, output_path=dump_parent_path, io_dump_paths=custom_dump_paths - ) - - assert (dump_parent_path / custom_dump_paths[0] / filename).is_file() - assert (dump_parent_path / custom_dump_paths[1] / singlefiledata_linklabel / filename).is_file() - assert (dump_parent_path / custom_dump_paths[1] / folderdata_test_path / filename).is_file() - assert (dump_parent_path / custom_dump_paths[2] / singlefiledata_linklabel / filename).is_file() - assert (dump_parent_path / custom_dump_paths[2] / folderdata_test_path / filename).is_file() - - def test_dump_calculation_flat(tmp_path, generate_calculation_node_io): # Flat dumping -> no paths provided -> Default paths should not be existent. # Internal FolderData structure retained. 
-    # Expected tree:
-    # cj-dump-test-io
-    # ├── file.txt
-    # └── relative_path
-    #     └── file.txt

     dump_parent_path = tmp_path / 'cj-dump-test-custom'
     process_dumper = ProcessDumper(flat=True)
     calculation_node = generate_calculation_node_io()
     process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path)

-    # Here, the same file will be written by raw_inputs and raw_outputs and node_inputs
+    # Here, the same file will be written by inputs, node_outputs, and node_inputs
     # So it should only be present in the parent dump directory
-    assert not (dump_parent_path / raw_inputs_relpath).is_dir()
+    assert not (dump_parent_path / inputs_relpath).is_dir()
     assert not (dump_parent_path / node_inputs_relpath).is_dir()
-    assert not (dump_parent_path / raw_outputs_relpath).is_dir()
+    assert not (dump_parent_path / outputs_relpath).is_dir()
     assert (dump_parent_path / filename).is_file()
+    assert (dump_parent_path / 'default.npy').is_file()
     assert (dump_parent_path / folderdata_relpath / filename).is_file()

@@ -389,6 +292,7 @@ def test_dump_calculation_overwrite(tmp_path, generate_calculation_node_io):
     process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path)


+# Dumping with both inputs and outputs is the standard test case above, so only test without inputs here
 def test_dump_calculation_no_inputs(tmp_path, generate_calculation_node_io):
     dump_parent_path = tmp_path / 'cj-dump-test-noinputs'
     process_dumper = ProcessDumper(include_inputs=False)
     calculation_node = generate_calculation_node_io()
     process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path)
     assert not (dump_parent_path / node_inputs_relpath).is_dir()


-def test_dump_calculation_all_aiida_nodes(tmp_path, generate_calculation_node_io):
-    dump_parent_path = tmp_path / 'cj-dump-test-allaiidanodes'
-    process_dumper = ProcessDumper(all_aiida_nodes=True)
-    calculation_node = generate_calculation_node_io()
-    process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path)
-    assert (dump_parent_path / node_inputs_relpath / '.aiida_nodes' / arraydata_linklabel / 'default.npy').is_file()
-
-
 def test_dump_calculation_add(tmp_path, generate_calculation_node_add):
     dump_parent_path = tmp_path / 'cj-dump-test-add'
-    process_dumper = ProcessDumper(all_aiida_nodes=True)
+    process_dumper = ProcessDumper()
     calculation_node_add = generate_calculation_node_add()
     process_dumper._dump_calculation(calculation_node=calculation_node_add, output_path=dump_parent_path)

-    raw_input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json']
-    raw_output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out']
-    raw_input_files = [dump_parent_path / raw_inputs_relpath / raw_input_file for raw_input_file in raw_input_files]
-    raw_output_files = [
-        dump_parent_path / raw_outputs_relpath / raw_output_file for raw_output_file in raw_output_files
-    ]
+    input_files = ['_aiidasubmit.sh', 'aiida.in', '.aiida/job_tmpl.json', '.aiida/calcinfo.json']
+    output_files = ['_scheduler-stderr.txt', '_scheduler-stdout.txt', 'aiida.out']
+    input_files = [dump_parent_path / inputs_relpath / input_file for input_file in input_files]
+    output_files = [dump_parent_path / outputs_relpath / output_file for output_file in output_files]

-    assert all([raw_input_file.is_file() for raw_input_file in raw_input_files])
-    assert all([raw_output_file.is_file() for raw_output_file in raw_output_files])
+    assert all([input_file.is_file() for input_file in input_files])
+    
assert all([output_file.is_file() for output_file in output_files]) # Tests for helper methods @@ -432,7 +326,7 @@ def test_validate_make_dump_path(chdir_tmp_path, tmp_path): safeguard_file = node_metadata_file # Path must be provided - process_dumper = ProcessDumper() + process_dumper = ProcessDumper(overwrite=False) with pytest.raises(TypeError): process_dumper._validate_make_dump_path() @@ -449,9 +343,10 @@ def test_validate_make_dump_path(chdir_tmp_path, tmp_path): clean_tmp_path(tmp_path=tmp_path) - # Fails if directory not empty and overwrite set to False + # Fails if directory not empty, safeguard file existent, and overwrite set to False test_dir_abs.mkdir() (test_dir_abs / filename).touch() + (test_dir_abs / node_metadata_file).touch() with pytest.raises(FileExistsError): output_path = process_dumper._validate_make_dump_path(validate_path=test_dir) assert (test_dir_abs / filename).is_file() @@ -460,9 +355,10 @@ def test_validate_make_dump_path(chdir_tmp_path, tmp_path): process_dumper = ProcessDumper(overwrite=True) # Fails if directory not empty and overwrite set to True, but safeguard_file not found (for safety reasons) + # Could define new Exception for this... test_dir_abs.mkdir() (test_dir_abs / filename).touch() - with pytest.raises(FileExistsError): + with pytest.raises(Exception): output_path = process_dumper._validate_make_dump_path(validate_path=test_dir) assert (test_dir_abs / filename).is_file() @@ -493,32 +389,16 @@ def test_generate_default_dump_path( def test_generate_calculation_io_mapping(): process_dumper = ProcessDumper() calculation_io_mapping = process_dumper._generate_calculation_io_mapping() - assert calculation_io_mapping.repository == 'raw_inputs' - assert calculation_io_mapping.inputs == 'inputs' - assert calculation_io_mapping.outputs == 'raw_outputs' + assert calculation_io_mapping.repository == 'inputs' + assert calculation_io_mapping.retrieved == 'outputs' + assert calculation_io_mapping.inputs == 'node_inputs' + assert calculation_io_mapping.outputs == 'node_outputs' calculation_io_mapping = process_dumper._generate_calculation_io_mapping(io_dump_paths=custom_dump_paths) - assert calculation_io_mapping.repository == 'raw_inputs_' - assert calculation_io_mapping.inputs == 'inputs_' - assert calculation_io_mapping.outputs == 'raw_outputs_' - - -def test_generate_link_triple_dump_path(generate_calculation_node_io, generate_workchain_node_io, tmp_path): - # Need to construct WorkChain, as the path naming is based on `LinkTriple`s - cj_node = generate_calculation_node_io(attach_outputs=False) - wc_node = generate_workchain_node_io(cj_nodes=[cj_node]) - wc_output_triples = wc_node.base.links.get_outgoing().all() - sub_wc_node = wc_output_triples[0].node - output_triples = wc_output_triples + sub_wc_node.base.links.get_outgoing().all() - - process_dumper = ProcessDumper() - - output_paths = [ - tmp_path / process_dumper._generate_link_triple_dump_path(link_triple=output_triple, parent_path=tmp_path) - for output_triple in output_triples - ] - # 'sub_workflow' doesn't have a repository, so it is placed under '.aiida_nodes' - assert output_paths == [tmp_path / 'sub_workflow', tmp_path / 'calculation'] + assert calculation_io_mapping.repository == 'inputs_' + assert calculation_io_mapping.retrieved == 'outputs_' + assert calculation_io_mapping.inputs == 'node_inputs_' + assert calculation_io_mapping.outputs == 'node_outputs_' def test_generate_child_node_label( @@ -599,9 +479,9 @@ def test_dump_node_yaml(generate_calculation_node_io, tmp_path, generate_workcha 
def test_generate_parent_readme(tmp_path, generate_workchain_multiply_add): wc_node = generate_workchain_multiply_add() - process_dumper = ProcessDumper(parent_process=wc_node, parent_path=tmp_path) + process_dumper = ProcessDumper() - process_dumper.generate_parent_readme() + process_dumper._generate_readme(process_node=wc_node, output_path=tmp_path) assert (tmp_path / 'README').is_file() From deb88679ffde9f06d74b59214ff1f6cfa88346ef Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Sun, 19 May 2024 15:08:56 +0200 Subject: [PATCH 27/30] Resolve comments of reviews by Seb and Alex --- src/aiida/cmdline/commands/cmd_process.py | 61 +++++++++++++------ src/aiida/cmdline/params/options/main.py | 45 +------------- src/aiida/tools/dumping/processes.py | 45 +++++++------- tests/cmdline/commands/test_process.py | 6 -- tests/conftest.py | 3 +- tests/tools/dumping/test_processes.py | 71 +++++++---------------- 6 files changed, 89 insertions(+), 142 deletions(-) diff --git a/src/aiida/cmdline/commands/cmd_process.py b/src/aiida/cmdline/commands/cmd_process.py index 6c5d60c9e5..52b286e795 100644 --- a/src/aiida/cmdline/commands/cmd_process.py +++ b/src/aiida/cmdline/commands/cmd_process.py @@ -487,31 +487,58 @@ def process_repair(manager, broker, dry_run): @arguments.PROCESS() @options.PATH() @options.OVERWRITE() -@options.INCLUDE_INPUTS() -@options.INCLUDE_OUTPUTS() -@options.INCLUDE_ATTRIBUTES() -@options.INCLUDE_EXTRAS() -@options.FLAT() +@click.option( + '--include-inputs/--exclude-inputs', + default=True, + show_default=True, + help='Include the linked input nodes of the `CalculationNode`(s).', +) +@click.option( + '--include-outputs/--exclude-outputs', + default=False, + show_default=True, + help='Include the linked output nodes of the `CalculationNode`(s).', +) +@click.option( + '--include-attributes/--exclude-attributes', + default=True, + show_default=True, + help='Include attributes in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', +) +@click.option( + '--include-extras/--exclude-extras', + default=True, + show_default=True, + help='Include extras in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', +) +@click.option( + '-f', + '--flat', + is_flag=True, + default=False, + help='Dump files in a flat directory for every step of the workflow.', +) def process_dump( process, path, + overwrite, include_inputs, include_outputs, include_attributes, include_extras, - overwrite, flat, ) -> None: """Dump process input and output files to disk. - Child calculations/workflows (also called `CalcJob`s and `WorkChain`s in AiiDA jargon) run by the parent workflow - are contained in the directory tree as sub-folders and are sorted by their creation time. The directory tree thus - mirrors the logical execution of the workflow, which can also be queried by running `verdi process status ` on - the command line. + Child calculations/workflows (also called `CalcJob`s/`CalcFunction`s and `WorkChain`s/`WorkFunction`s in AiiDA + jargon) run by the parent workflow are contained in the directory tree as sub-folders and are sorted by their + creation time. The directory tree thus mirrors the logical execution of the workflow, which can also be queried by + running `verdi process status ` on the command line. - By default, input and output files of each calculation can be found in the corresponding "raw_inputs" and - "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution - settings). 
Additional input files (depending on the type of calculation) are placed in the "inputs". + By default, input and output files of each calculation can be found in the corresponding "inputs" and + "outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution + settings). Additional input and output files (depending on the type of calculation) are placed in the "node_inputs" + and "node_outputs", respectively. Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant AiiDA node data for further inspection. @@ -532,11 +559,9 @@ def process_dump( dump_path = process_dumper.dump(process_node=process, output_path=path) except FileExistsError: echo.echo_critical( - 'Dumping directory exists and overwrite is False. ' 'Set overwrite to True, or delete directory manually.' + 'Dumping directory exists and overwrite is False. Set overwrite to True, or delete directory manually.' ) except Exception as e: - echo.echo_critical(f'Unexpected error ({e!s}) while dumping {process.__class__.__name__} <{process.pk}>.') + echo.echo_critical(f'Unexpected error while dumping {process.__class__.__name__} <{process.pk}>:\n ({e!s}).') - echo.echo_success( - f'Raw files for {process.__class__.__name__} <{process.pk}> dumped successfully in `{dump_path}`.' - ) + echo.echo_success(f'Raw files for {process.__class__.__name__} <{process.pk}> dumped into folder `{dump_path}`.') diff --git a/src/aiida/cmdline/params/options/main.py b/src/aiida/cmdline/params/options/main.py index be9dce4cff..85b3090ad5 100644 --- a/src/aiida/cmdline/params/options/main.py +++ b/src/aiida/cmdline/params/options/main.py @@ -60,7 +60,6 @@ 'EXIT_STATUS', 'EXPORT_FORMAT', 'FAILED', - 'FLAT', 'FORCE', 'FORMULA_MODE', 'FREQUENCY', @@ -69,10 +68,6 @@ 'GROUP_CLEAR', 'HOSTNAME', 'IDENTIFIER', - 'INCLUDE_ATTRIBUTES', - 'INCLUDE_INPUTS', - 'INCLUDE_OUTPUTS', - 'INCLUDE_EXTRAS', 'INPUT_FORMAT', 'INPUT_PLUGIN', 'LABEL', @@ -758,35 +753,7 @@ def set_log_level(_ctx, _param, value): '--path', type=click.Path(path_type=pathlib.Path), show_default=False, - help='Parent directory for the dumping.', -) - -INCLUDE_INPUTS = OverridableOption( - '--include-inputs/--exclude-inputs', - default=True, - show_default=True, - help='Include the linked input nodes of the `CalculationNode`(s).', -) - -INCLUDE_OUTPUTS = OverridableOption( - '--include-outputs/--exclude-outputs', - default=False, - show_default=True, - help='Include the linked output nodes of the `CalculationNode`(s).', -) - -INCLUDE_ATTRIBUTES = OverridableOption( - '--include-attributes/--exclude-attributes', - default=True, - show_default=True, - help='Include attributes in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', -) - -INCLUDE_EXTRAS = OverridableOption( - '--include-extras/--exclude-extras', - default=True, - show_default=True, - help='Include extras in the `.aiida_node_metadata.yaml` written for every `ProcessNode`.', + help='Base path for operations that write to disk.', ) OVERWRITE = OverridableOption( @@ -795,13 +762,5 @@ def set_log_level(_ctx, _param, value): is_flag=True, default=False, show_default=True, - help='Overwrite directory if it already exists.', -) - -FLAT = OverridableOption( - '-f', - '--flat', - is_flag=True, - default=False, - help='Dump files in a flat directory for every step of the workflow.', + help='Overwrite file/directory if writing to disk.', ) diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 
3007d11ed8..6543353b70 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -13,7 +13,7 @@ import logging from pathlib import Path from types import SimpleNamespace -from typing import Any, List, Optional +from typing import List import yaml @@ -41,7 +41,7 @@ def __init__( include_outputs: bool = False, include_attributes: bool = True, include_extras: bool = True, - overwrite: bool = True, + overwrite: bool = False, flat: bool = False, ) -> None: self.include_inputs = include_inputs @@ -98,10 +98,10 @@ def _generate_readme(process_node: ProcessNode, output_path: Path) -> None: creation time. The directory tree thus mirrors the logical execution of the workflow, which can also be queried by running `verdi process status {process_node.pk}` on the command line. - By default, input and output files of each simulation can be found in the corresponding "raw_inputs" and - "raw_outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job - execution settings). Additional input files (depending on the type of calculation) are placed in the - "inputs". + By default, input and output files of each calculation can be found in the corresponding "inputs" and "outputs" + directories (the former also contains the hidden ".aiida" folder with machine-readable job execution settings). + Additional input and output files (depending on the type of calculation) are placed in the "node_inputs" and + "node_outputs", respectively. Lastly, every folder also contains a hidden, human-readable `.aiida_node_metadata.yaml` file with the relevant AiiDA node data for further inspection.""" @@ -128,8 +128,7 @@ def _generate_readme(process_node: ProcessNode, output_path: Path) -> None: process_show = get_node_info(node=process_node) _readme_string += f'\n\n\nOutput of `verdi process show {process_node.pk}`:\n\n{process_show}' - with (output_path / 'README').open('w') as handle: - handle.write(_readme_string) + (output_path / 'README').write_text(_readme_string) @staticmethod def _generate_child_node_label(index: int, link_triple: LinkTriple) -> str: @@ -166,7 +165,7 @@ def dump( self, process_node: ProcessNode, output_path: Path | None, - io_dump_paths: list | None = None, + io_dump_paths: List[str | Path] | None = None, ) -> Path: """Dumps all data involved in a `ProcessNode`, including its outgoing links. @@ -202,27 +201,28 @@ def dump( return output_path - def _dump_workflow(self, workflow_node: WorkflowNode, output_path: Path, io_dump_paths: list | None = None) -> None: - """Recursive function to traverse a ``WorkflowNode`` and dump its ``CalculationNode``s. + def _dump_workflow( + self, workflow_node: WorkflowNode, output_path: Path, io_dump_paths: List[str | Path] | None = None + ) -> None: + """Recursive function to traverse a `WorkflowNode` and dump its `CalculationNode` s. - :param workflow_node: ``WorkflowNode`` to be traversed. Will be updated during recursion. + :param workflow_node: `WorkflowNode` to be traversed. Will be updated during recursion. :param output_path: Dumping parent directory. Will be updated during recursion. 
- :param io_dump_paths: Custom subdirectories for ``CalculationNode``s, defaults to None + :param io_dump_paths: Custom subdirectories for `CalculationNode` s, defaults to None """ - self._validate_make_dump_path(validate_path=output_path) + self._validate_make_dump_path(validate_path=output_path) self._dump_node_yaml(process_node=workflow_node, output_path=output_path) called_links = workflow_node.base.links.get_outgoing(link_type=(LinkType.CALL_CALC, LinkType.CALL_WORK)).all() + called_links = sorted(called_links, key=lambda link_triple: link_triple.node.ctime) - sorted_called_links = sorted(called_links, key=lambda link_triple: link_triple.node.ctime) - - for index, link_triple in enumerate(sorted_called_links, start=1): + for index, link_triple in enumerate(called_links, start=1): child_node = link_triple.node child_label = self._generate_child_node_label(index=index, link_triple=link_triple) child_output_path = output_path.resolve() / child_label - # Recursive function call for `WorkFlowNode`` + # Recursive function call for `WorkFlowNode` if isinstance(child_node, WorkflowNode): self._dump_workflow( workflow_node=child_node, @@ -242,7 +242,7 @@ def _dump_calculation( self, calculation_node: CalculationNode, output_path: Path, - io_dump_paths: list | None = None, + io_dump_paths: List[str | Path] | None = None, ) -> None: """Dump the contents of a `CalculationNode` to a specified output path. @@ -253,7 +253,6 @@ def _dump_calculation( """ self._validate_make_dump_path(validate_path=output_path) - self._dump_node_yaml(process_node=calculation_node, output_path=output_path) io_dump_mapping = self._generate_calculation_io_mapping(io_dump_paths=io_dump_paths) @@ -285,7 +284,7 @@ def _dump_calculation( ) def _dump_calculation_io(self, parent_path: Path, link_triples: LinkManager | List[LinkTriple]): - """Small helper function to dump linked input/output nodes of ``CalculationNode``s. + """Small helper function to dump linked input/output nodes of a `CalculationNode`. :param parent_path: Parent directory for dumping the linked node contents. :param link_triples: List of link triples. @@ -334,7 +333,7 @@ def _validate_make_dump_path(self, validate_path: Path, safeguard_file: str = '. else: raise Exception( f"Path `{validate_path}` already exists and doesn't contain safeguard file {safeguard_file}." - f'Not removing for safety reasons.' + f' Not removing for safety reasons.' ) # Not included in if-else as to avoid having to repeat the `mkdir` call. @@ -343,7 +342,7 @@ def _validate_make_dump_path(self, validate_path: Path, safeguard_file: str = '. return validate_path.resolve() - def _generate_calculation_io_mapping(self, io_dump_paths: Optional[List[Any]] = None) -> SimpleNamespace: + def _generate_calculation_io_mapping(self, io_dump_paths: List[str | Path] | None = None) -> SimpleNamespace: """Helper function to generate mapping for entities dumped for each `CalculationNode`. 
This is to avoid exposing AiiDA terminology, like `repository` to the user, while keeping track of which diff --git a/tests/cmdline/commands/test_process.py b/tests/cmdline/commands/test_process.py index 8553e6a613..73c9ac7084 100644 --- a/tests/cmdline/commands/test_process.py +++ b/tests/cmdline/commands/test_process.py @@ -344,12 +344,6 @@ def test_process_dump(self, run_cli_command, tmp_path, generate_workchain_multip test_path = tmp_path / 'cli-dump' node = generate_workchain_multiply_add() - # Running without identifiers should not except and not print anything - options = [] - result = run_cli_command(cmd_process.process_dump, options) - assert result.exception is None, result.output - assert len(result.output_lines) == 0 - # Giving a single identifier should print a non empty string message options = [str(node.pk), '-p', str(test_path)] result = run_cli_command(cmd_process.process_dump, options) diff --git a/tests/conftest.py b/tests/conftest.py index 9d069e14e0..55bf01a185 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -185,8 +185,7 @@ def _generate_calculation_node( calculation_node.set_exit_status(exit_status) if repository is not None: - # Or use from_tree? - calculation_node.base.repository.put_object_from_filelike(repository[0], repository[1]) + calculation_node.base.repository.put_object_from_tree(repository) # For storing, need to first store the input nodes, then the CalculationNode, then the output nodes if inputs is not None: diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py index ffc281f164..cb77f2a5cb 100644 --- a/tests/tools/dumping/test_processes.py +++ b/tests/tools/dumping/test_processes.py @@ -8,14 +8,9 @@ ########################################################################### """Tests for the dumping of ProcessNode data to disk.""" -# For testing the dumping, one either needs to cd into the tmp_path, or pass the tmp_path as argument, otherwise, the -# files are dumped into the src -> CWD from where the script is run. -# However, when one passes tmp_dir as output_path, no automatic, default path is created - from __future__ import annotations import io -import shutil from pathlib import Path import pytest @@ -41,25 +36,13 @@ node_metadata_file = '.aiida_node_metadata.yaml' -# Move this somewhere else? -def clean_tmp_path(tmp_path: Path): - """Recursively delete files and directories in a path, e.g. a temporary path used by pytest.""" - - for item in tmp_path.iterdir(): - if item.is_dir(): - shutil.rmtree(item) - else: - item.unlink() - - # Helper functions to generate the actual `WorkflowNode`s and `CalculationNode`s used for testing @pytest.fixture -def generate_calculation_node_io(generate_calculation_node): +def generate_calculation_node_io(generate_calculation_node, tmp_path): def _generate_calculation_node_io(entry_point: str | None = None, attach_outputs: bool = True): import numpy as np from aiida.orm import ArrayData, FolderData, SinglefileData - calculation_repository = (io.StringIO(filecontent), filename) singlefiledata_input = SinglefileData.from_string(content=filecontent, filename=filename) # ? 
Use instance for folderdata folderdata = FolderData() @@ -84,8 +67,11 @@ def _generate_calculation_node_io(entry_point: str | None = None, attach_outputs else: calculation_outputs = None + # Actually write repository file and then read it in when generating calculation_node + (tmp_path / filename).write_text(filecontent) + calculation_node = generate_calculation_node( - repository=calculation_repository, + repository=tmp_path, inputs=calculation_node_inputs, outputs=calculation_outputs, entry_point=entry_point, @@ -207,7 +193,6 @@ def test_dump_multiply_add(tmp_path, generate_workchain_multiply_add): assert all([output_file.is_file() for output_file in output_files]) # Flat dumping - dump_parent_path = tmp_path / 'wc-dump-test-multiply-add-flat' process_dumper = ProcessDumper(flat=True) process_dumper.dump(process_node=wc_node, output_path=dump_parent_path) @@ -234,8 +219,8 @@ def test_dump_multiply_add(tmp_path, generate_workchain_multiply_add): # Tests for dump_calculation method def test_dump_calculation_node(tmp_path, generate_calculation_node_io): # Checking the actual content should be handled by `test_copy_tree` - # Normal dumping -> node_inputs and not flat; no paths provided + # Normal dumping -> node_inputs and not flat; no paths provided dump_parent_path = tmp_path / 'cj-dump-test-io' process_dumper = ProcessDumper(include_outputs=True) calculation_node = generate_calculation_node_io() @@ -265,14 +250,13 @@ def test_dump_calculation_node(tmp_path, generate_calculation_node_io): def test_dump_calculation_flat(tmp_path, generate_calculation_node_io): # Flat dumping -> no paths provided -> Default paths should not be existent. # Internal FolderData structure retained. - dump_parent_path = tmp_path / 'cj-dump-test-custom' process_dumper = ProcessDumper(flat=True) calculation_node = generate_calculation_node_io() process_dumper._dump_calculation(calculation_node=calculation_node, output_path=dump_parent_path) # Here, the same file will be written by inputs and node_outputs and node_inputs - # So it should only be present in the parent dump directory + # So it should only be present once in the parent dump directory assert not (dump_parent_path / inputs_relpath).is_dir() assert not (dump_parent_path / node_inputs_relpath).is_dir() assert not (dump_parent_path / outputs_relpath).is_dir() @@ -321,8 +305,6 @@ def test_dump_calculation_add(tmp_path, generate_calculation_node_add): def test_validate_make_dump_path(chdir_tmp_path, tmp_path): chdir_tmp_path - test_dir = Path('test-dir') - test_dir_abs = tmp_path / test_dir safeguard_file = node_metadata_file # Path must be provided @@ -331,45 +313,34 @@ def test_validate_make_dump_path(chdir_tmp_path, tmp_path): process_dumper._validate_make_dump_path() # Check if path created if non-existent + test_dir = tmp_path / Path('test-dir') + test_dir.mkdir() output_path = process_dumper._validate_make_dump_path(validate_path=test_dir) - assert output_path == test_dir_abs - - clean_tmp_path(tmp_path=tmp_path) + assert output_path == test_dir # Empty path is fine -> No error and full path returned - test_dir_abs.mkdir() output_path = process_dumper._validate_make_dump_path(validate_path=test_dir) - assert output_path == test_dir_abs - - clean_tmp_path(tmp_path=tmp_path) + assert output_path == test_dir # Fails if directory not empty, safeguard file existent, and overwrite set to False - test_dir_abs.mkdir() - (test_dir_abs / filename).touch() - (test_dir_abs / node_metadata_file).touch() + (test_dir / filename).touch() + (test_dir / 
safeguard_file).touch() with pytest.raises(FileExistsError): output_path = process_dumper._validate_make_dump_path(validate_path=test_dir) - assert (test_dir_abs / filename).is_file() - - clean_tmp_path(tmp_path=tmp_path) + assert (test_dir / filename).is_file() + # Works if directory not empty, but overwrite=True and safeguard_file (e.g. `.aiida_node_metadata.yaml`) contained process_dumper = ProcessDumper(overwrite=True) + output_path = process_dumper._validate_make_dump_path(validate_path=test_dir, safeguard_file=safeguard_file) + assert output_path == test_dir + assert not (test_dir / safeguard_file).is_file() + # Fails if directory not empty and overwrite set to True, but safeguard_file not found (for safety reasons) # Could define new Exception for this... - test_dir_abs.mkdir() - (test_dir_abs / filename).touch() + (test_dir / filename).touch() with pytest.raises(Exception): output_path = process_dumper._validate_make_dump_path(validate_path=test_dir) - assert (test_dir_abs / filename).is_file() - - clean_tmp_path(tmp_path=tmp_path) - - # Works if directory not empty, but overwrite=True and safeguard_file (e.g. `.aiida_node_metadata.yaml`) contained - test_dir_abs.mkdir() - (test_dir_abs / safeguard_file).touch() - output_path = process_dumper._validate_make_dump_path(validate_path=test_dir, safeguard_file=safeguard_file) - assert output_path == test_dir_abs - assert not (test_dir_abs / safeguard_file).is_file() + assert (test_dir / filename).is_file() def test_generate_default_dump_path( From 8cf53afa2e40d4254e681087c7d7310c2146645c Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Tue, 21 May 2024 19:46:11 +0200 Subject: [PATCH 28/30] Remove underscores for metadata.yaml properties --- src/aiida/tools/dumping/processes.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 6543353b70..7f8d9fcf15 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -6,7 +6,7 @@ # For further information on the license, see the LICENSE.txt file # # For further information please visit http://www.aiida.net # ########################################################################### -"""Functionality for dumping of WorkChains and CalcJobs.""" +"""Functionality for dumping of ProcessNodes.""" from __future__ import annotations @@ -392,7 +392,7 @@ def _dump_node_yaml( :param output_filename: The name of the output YAML file. Defaults to `.aiida_node_metadata.yaml`. 
""" - _node_properties = [ + node_properties = [ 'label', 'description', 'pk', @@ -404,15 +404,15 @@ def _dump_node_yaml( 'is_finished_ok', ] - _user_properties = ('first_name', 'last_name', 'email', 'institution') + user_properties = ('first_name', 'last_name', 'email', 'institution') - _computer_properties = ('label', 'hostname', 'scheduler_type', 'transport_type') + computer_properties = ('label', 'hostname', 'scheduler_type', 'transport_type') node_dict = {} metadata_dict = {} # Add actual node `@property`s to dictionary - for metadata_property in _node_properties: + for metadata_property in node_properties: metadata_dict[metadata_property] = getattr(process_node, metadata_property) node_dict['Node data'] = metadata_dict @@ -421,7 +421,7 @@ def _dump_node_yaml( try: node_dbuser = process_node.user user_dict = {} - for user_property in _user_properties: + for user_property in user_properties: user_dict[user_property] = getattr(node_dbuser, user_property) node_dict['User data'] = user_dict except AttributeError: @@ -431,7 +431,7 @@ def _dump_node_yaml( try: node_dbcomputer = process_node.computer computer_dict = {} - for computer_property in _computer_properties: + for computer_property in computer_properties: computer_dict[computer_property] = getattr(node_dbcomputer, computer_property) node_dict['Computer data'] = computer_dict except AttributeError: From 74facc2aadc72ab15eec6e5ac9edeb0e84c38fca Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Wed, 22 May 2024 10:23:46 +0200 Subject: [PATCH 29/30] Update documentation --- docs/source/howto/data.rst | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/source/howto/data.rst b/docs/source/howto/data.rst index 712c092b46..8905a3ea2c 100644 --- a/docs/source/howto/data.rst +++ b/docs/source/howto/data.rst @@ -98,7 +98,7 @@ For our beloved ``MultiplyAddWorkChain``, we obtain the following: .. code-block:: shell $ verdi process dump -p dump-multiply_add - Success: Raw files for WorkChainNode dumped successfully in `dump-multiply_add`. + Success: Raw files for WorkChainNode dumped into folder `dump-multiply_add`. .. code-block:: shell @@ -108,17 +108,17 @@ For our beloved ``MultiplyAddWorkChain``, we obtain the following: ├── .aiida_node_metadata.yaml ├── 01-multiply │ ├── .aiida_node_metadata.yaml - │ └── raw_inputs + │ └── inputs │ └── source_file └── 02-ArithmeticAddCalculation ├── .aiida_node_metadata.yaml - ├── raw_inputs + ├── inputs │ ├── .aiida │ │ ├── calcinfo.json │ │ └── job_tmpl.json │ ├── _aiidasubmit.sh │ └── aiida.in - └── raw_outputs + └── outputs ├── _scheduler-stderr.txt ├── _scheduler-stdout.txt └── aiida.out @@ -126,22 +126,22 @@ For our beloved ``MultiplyAddWorkChain``, we obtain the following: The ``README`` file provides a description of the directory structure, as well as useful information about the top-level process. Further, numbered subdirectories are created for each step of the workflow, resulting in the ``01-multiply`` and ``02-ArithmeticAddCalculation`` folders. The raw calculation input and output files ``aiida.in`` and ``aiida.out`` -of the ``ArithmeticAddCalculation`` are placed in ``raw_inputs`` and ``raw_outputs``. In addition, these also contain +of the ``ArithmeticAddCalculation`` are placed in ``inputs`` and ``outputs``. In addition, these also contain the submission script ``_aiidasubmit.sh``, as well as the scheduler stdout and stderr, ``_scheduler-stdout.txt`` and -``_scheduler-stderr.txt``, respectively. 
Lastly, the source code of the ``multiply```` ``calcfunction`` presenting the +``_scheduler-stderr.txt``, respectively. Lastly, the source code of the ``multiply`` ``calcfunction`` presenting the first step of the workflow is contained in the ``source_file``. Upon having a closer look at the directory, we also find the hidden ``.aiida_node_metadata.yaml`` files, which are created for every ``ProcessNode`` and contain additional information about the ``Node``, the ``User``, and the ``Computer``, as well as the ``.aiida`` subdirectory with machine-readable AiiDA-internal data in JSON format. -Since subprocesses are explored recursively, arbitrarily complex, nested workflows can be dumped. As already seen above, -the ``-p`` flag allows to specify a custom dumping path. If none is provided, it is automatically generated from the -``process_label`` (or ``process_type``) and the ``pk``. In addition, the command provides the ``-o`` flag to overwrite -existing directories, the ``-a`` flag to dump further, non file-based AiiDA nodes (in hidden, ``.aiida_nodes`` -subdirectories), the ``-f`` flag to dump all files for each ``CalculationNode`` of the workflow in a flat directory -structure, and the ``--include-inputs`` (``--exclude-inputs``) flag to also dump additional node inputs of each -``CalculationNode`` of the workflow. For a full list of available options, call :code:`verdi process dump --help`. +Since child processes are explored recursively, arbitrarily complex, nested workflows can be dumped. As already seen +above, the ``-p`` flag allows to specify a custom dumping path. If none is provided, it is automatically generated from +the ``process_label`` (or ``process_type``) and the ``pk``. In addition, the command provides the ``-o`` flag to +overwrite existing directories, the ``-f`` flag to dump all files for each ``CalculationNode`` of the workflow in a flat +directory structure, and the ``--include-inputs/--exclude-inputs`` (``--include-outputs/--exclude-outputs``) flags to +also dump additional node inputs (outputs) of each ``CalculationNode`` of the workflow into ``node_inputs`` +(``node_outputs``) subdirectories. For a full list of available options, call :code:`verdi process dump --help`. .. _how-to:data:import:provenance: From 497b14c27cf9c7531bf47d47cbc292139572b2bd Mon Sep 17 00:00:00 2001 From: Julian Geiger Date: Thu, 23 May 2024 21:38:52 +0200 Subject: [PATCH 30/30] README as `.md` and wrap `cmd_process` outputs --- docs/source/howto/data.rst | 14 +++++++------- src/aiida/tools/dumping/processes.py | 16 +++++++++------- tests/tools/dumping/test_processes.py | 6 +++--- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/docs/source/howto/data.rst b/docs/source/howto/data.rst index 8905a3ea2c..33c9c33aba 100644 --- a/docs/source/howto/data.rst +++ b/docs/source/howto/data.rst @@ -104,7 +104,7 @@ For our beloved ``MultiplyAddWorkChain``, we obtain the following: $ tree -a dump-multiply_add dump-multiply_add - ├── README + ├── README.md ├── .aiida_node_metadata.yaml ├── 01-multiply │ ├── .aiida_node_metadata.yaml @@ -123,12 +123,12 @@ For our beloved ``MultiplyAddWorkChain``, we obtain the following: ├── _scheduler-stdout.txt └── aiida.out -The ``README`` file provides a description of the directory structure, as well as useful information about the top-level -process. Further, numbered subdirectories are created for each step of the workflow, resulting in the ``01-multiply`` -and ``02-ArithmeticAddCalculation`` folders. 
The raw calculation input and output files ``aiida.in`` and ``aiida.out`` -of the ``ArithmeticAddCalculation`` are placed in ``inputs`` and ``outputs``. In addition, these also contain -the submission script ``_aiidasubmit.sh``, as well as the scheduler stdout and stderr, ``_scheduler-stdout.txt`` and -``_scheduler-stderr.txt``, respectively. Lastly, the source code of the ``multiply`` ``calcfunction`` presenting the +The ``README.md`` file provides a description of the directory structure, as well as useful information about the +top-level process. Further, numbered subdirectories are created for each step of the workflow, resulting in the +``01-multiply`` and ``02-ArithmeticAddCalculation`` folders. The raw calculation input and output files ``aiida.in`` and +``aiida.out`` of the ``ArithmeticAddCalculation`` are placed in ``inputs`` and ``outputs``. In addition, these also +contain the submission script ``_aiidasubmit.sh``, as well as the scheduler stdout and stderr, ``_scheduler-stdout.txt`` +and ``_scheduler-stderr.txt``, respectively. Lastly, the source code of the ``multiply`` ``calcfunction`` presenting the first step of the workflow is contained in the ``source_file``. Upon having a closer look at the directory, we also find the hidden ``.aiida_node_metadata.yaml`` files, which are diff --git a/src/aiida/tools/dumping/processes.py b/src/aiida/tools/dumping/processes.py index 7f8d9fcf15..3d970c421c 100644 --- a/src/aiida/tools/dumping/processes.py +++ b/src/aiida/tools/dumping/processes.py @@ -71,7 +71,7 @@ def _generate_default_dump_path(process_node: ProcessNode) -> Path: @staticmethod def _generate_readme(process_node: ProcessNode, output_path: Path) -> None: - """Generate README file in main dumping directory. + """Generate README.md file in main dumping directory. :param process_node: `CalculationNode` or `WorkflowNode`. :param output_path: Output path for dumping. @@ -88,15 +88,17 @@ def _generate_readme(process_node: ProcessNode, output_path: Path) -> None: get_workchain_report, ) + pk = process_node.pk + _readme_string = textwrap.dedent( f"""\ This directory contains the files involved in the calculation/workflow - `{process_node.process_label} <{process_node.pk}>` run with AiiDA. + `{process_node.process_label} <{pk}>` run with AiiDA. Child calculations/workflows (also called `CalcJob`s/`CalcFunction`s and `WorkChain`s/`WorkFunction`s in AiiDA jargon) run by the parent workflow are contained in the directory tree as sub-folders and are sorted by their creation time. The directory tree thus mirrors the logical execution of the workflow, which can also be queried - by running `verdi process status {process_node.pk}` on the command line. + by running `verdi process status {pk}` on the command line. By default, input and output files of each calculation can be found in the corresponding "inputs" and "outputs" directories (the former also contains the hidden ".aiida" folder with machine-readable job execution settings). 
@@ -109,7 +111,7 @@ def _generate_readme(process_node: ProcessNode, output_path: Path) -> None:

         # `verdi process status`
         process_status = format_call_graph(calc_node=process_node, max_depth=None, call_link_label=True)
-        _readme_string += f'\n\n\nOutput of `verdi process status {process_node.pk}:`\n\n{process_status}'
+        _readme_string += f'\n\n\nOutput of `verdi process status {pk}`:\n\n```shell\n{process_status}\n```'

         # `verdi process report`
         # Copied over from `cmd_process`
@@ -122,13 +124,13 @@ def _generate_readme(process_node: ProcessNode, output_path: Path) -> None:
         else:
             process_report = f'Nothing to show for node type {process_node.__class__}'

-        _readme_string += f'\n\n\nOutput of `verdi process report {process_node.pk}`:\n\n{process_report}'
+        _readme_string += f'\n\n\nOutput of `verdi process report {pk}`:\n\n```shell\n{process_report}\n```'

         # `verdi process show`?
         process_show = get_node_info(node=process_node)
-        _readme_string += f'\n\n\nOutput of `verdi process show {process_node.pk}`:\n\n{process_show}'
+        _readme_string += f'\n\n\nOutput of `verdi process show {pk}`:\n\n```shell\n{process_show}\n```'

-        (output_path / 'README').write_text(_readme_string)
+        (output_path / 'README.md').write_text(_readme_string)
diff --git a/tests/tools/dumping/test_processes.py b/tests/tools/dumping/test_processes.py
index cb77f2a5cb..371dcb80a9 100644
--- a/tests/tools/dumping/test_processes.py
+++ b/tests/tools/dumping/test_processes.py
@@ -122,7 +122,7 @@ def test_dump(generate_calculation_node_io, generate_workchain_node_io, tmp_path
         return_path = process_dumper.dump(process_node=wc_node, output_path=dump_parent_path)

     assert dump_parent_path.is_dir()
-    assert (dump_parent_path / 'README').is_file()
+    assert (dump_parent_path / 'README.md').is_file()
     assert return_path == dump_parent_path


@@ -454,9 +454,9 @@ def test_generate_parent_readme(tmp_path, generate_workchain_multiply_add):

     process_dumper._generate_readme(process_node=wc_node, output_path=tmp_path)

-    assert (tmp_path / 'README').is_file()
+    assert (tmp_path / 'README.md').is_file()

-    with open(tmp_path / 'README', 'r') as dumped_file:
+    with open(tmp_path / 'README.md', 'r') as dumped_file:
         contents = dumped_file.read()
         assert 'This directory contains' in contents
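
For reference, a minimal usage sketch of the Python API that this patch series converges on, driven directly from a ``verdi shell`` or script rather than the CLI. The PK is hypothetical; the keyword arguments correspond to the ``ProcessDumper.__init__`` signature shown in the diffs above.

.. code-block:: python

    from pathlib import Path

    from aiida import load_profile, orm
    from aiida.tools.dumping.processes import ProcessDumper

    load_profile()

    # Hypothetical PK of a terminated workflow, e.g. a MultiplyAddWorkChain
    workflow_node = orm.load_node(1234)

    # Defaults per the patched `__init__`: include_inputs=True, include_outputs=False,
    # include_attributes=True, include_extras=True, overwrite=False, flat=False
    process_dumper = ProcessDumper(include_outputs=True, overwrite=True)

    # `dump` validates/creates the target directory and returns the resolved path
    dump_path = process_dumper.dump(process_node=workflow_node, output_path=Path('dump-multiply_add'))
    print(f'Dumped to: {dump_path}')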
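
The ``io_dump_paths`` argument of ``dump`` customizes the four per-calculation subdirectory names that ``_generate_calculation_io_mapping`` otherwise defaults to. A sketch, assuming the same ordering as the ``custom_dump_paths`` used in the tests (repository, retrieved, linked inputs, linked outputs); the PK and directory names are hypothetical:

.. code-block:: python

    from pathlib import Path

    from aiida import load_profile, orm
    from aiida.tools.dumping.processes import ProcessDumper

    load_profile()

    calculation_node = orm.load_node(5678)  # hypothetical PK of a finished CalcJob

    # Assumed order per the tests: repository -> 'inputs_', retrieved -> 'outputs_',
    # linked input nodes -> 'node_inputs_', linked output nodes -> 'node_outputs_'
    ProcessDumper(include_outputs=True).dump(
        process_node=calculation_node,
        output_path=Path('cj-dump-custom'),
        io_dump_paths=['inputs_', 'outputs_', 'node_inputs_', 'node_outputs_'],
    )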
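
Since ``_dump_node_yaml`` serializes plain dictionaries under the top-level keys ``Node data``, ``User data``, and ``Computer data``, the hidden metadata file can be read back with standard YAML tooling. A minimal sketch, assuming the file is written with standard ``yaml.dump`` and using a hypothetical dump directory name:

.. code-block:: python

    from pathlib import Path

    import yaml

    metadata_file = Path('dump-multiply_add') / '.aiida_node_metadata.yaml'
    metadata = yaml.safe_load(metadata_file.read_text())

    # 'Node data' holds the properties listed in `node_properties`,
    # e.g. 'pk', 'uuid', 'process_type', 'is_finished_ok'
    print(metadata['Node data']['process_type'])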