From d339b6d091928fa4c7f2eec323fa8b2d12698cb1 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 4 Mar 2024 13:09:29 +0100 Subject: [PATCH 01/59] Add initial command and run_command file --- dlt/cli/_dlt.py | 11 +++++++++++ dlt/cli/run_command.py | 6 ++++++ 2 files changed, 17 insertions(+) create mode 100644 dlt/cli/run_command.py diff --git a/dlt/cli/_dlt.py b/dlt/cli/_dlt.py index 2332c0286c..f04ff5fd01 100644 --- a/dlt/cli/_dlt.py +++ b/dlt/cli/_dlt.py @@ -563,6 +563,17 @@ def main() -> int: subparsers.add_parser("telemetry", help="Shows telemetry status") + # Run pipeline + run_cmd = subparsers.add_parser("run", help="Run pipelines in a given directory") + + run_cmd.add_argument( + "module", + metavar="N", + type=str, + nargs="+", + help="Path to module or python file with pipelines", + ) + args = parser.parse_args() if Venv.is_virtual_env() and not Venv.is_venv_activated(): diff --git a/dlt/cli/run_command.py b/dlt/cli/run_command.py new file mode 100644 index 0000000000..0974f6e325 --- /dev/null +++ b/dlt/cli/run_command.py @@ -0,0 +1,6 @@ +from dlt.cli.utils import track_command + + +@track_command(command="run") +def run_pipeline(): + pass From 0aecb8b34f85e190cfed31fa712a14b97129752c Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Tue, 12 Mar 2024 17:43:09 +0100 Subject: [PATCH 02/59] MVP draft for dlt run --- dlt/cli/_dlt.py | 31 ++++++++- dlt/cli/run_command.py | 152 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 177 insertions(+), 6 deletions(-) diff --git a/dlt/cli/_dlt.py b/dlt/cli/_dlt.py index f04ff5fd01..e252657efa 100644 --- a/dlt/cli/_dlt.py +++ b/dlt/cli/_dlt.py @@ -21,6 +21,7 @@ DEFAULT_VERIFIED_SOURCES_REPO, ) from dlt.cli.pipeline_command import pipeline_command, DLT_PIPELINE_COMMAND_DOCS_URL +from dlt.cli.run_command import run_pipeline_command from dlt.cli.telemetry_command import ( DLT_TELEMETRY_DOCS_URL, change_telemetry_status_command, @@ -563,15 +564,37 @@ def main() -> int: subparsers.add_parser("telemetry", help="Shows telemetry status") - # Run pipeline + # CLI pipeline runner run_cmd = subparsers.add_parser("run", help="Run pipelines in a given directory") run_cmd.add_argument( "module", - metavar="N", + help="Path to module or python file with pipelines", + ) + + run_cmd.add_argument( + "pipeline", type=str, + nargs="?", + help="Pipeline name", + ) + + run_cmd.add_argument( + "source", + type=str, + nargs="?", + help="Source or resource name", + ) + + run_cmd.add_argument( + "--args", + "-a", nargs="+", - help="Path to module or python file with pipelines", + default=[], + help=( + "Arguments passed to pipeline.run, example: " + "--args write_disposition=merge loader_file_format=jsonl" + ), ) args = parser.parse_args() @@ -586,6 +609,8 @@ def main() -> int: if args.command == "schema": return schema_command_wrapper(args.file, args.format, args.remove_defaults) + elif args.command == "run": + return run_pipeline_command(args.module, args.pipeline, args.source, args.args) elif args.command == "pipeline": if args.list_pipelines: return pipeline_command_wrapper("list", "-", args.pipelines_dir, args.verbosity) diff --git a/dlt/cli/run_command.py b/dlt/cli/run_command.py index 0974f6e325..b3a165810c 100644 --- a/dlt/cli/run_command.py +++ b/dlt/cli/run_command.py @@ -1,6 +1,152 @@ -from dlt.cli.utils import track_command +from itertools import chain +import os +import inspect as it +import typing as t + +from collections import defaultdict +from importlib import util as im +from types import ModuleType +import dlt -@track_command(command="run") -def run_pipeline(): +from dlt.cli import echo as fmt +from dlt.cli.utils import track_command +from dlt.sources import DltResource, DltSource +from typing_extensions import TypedDict + +original_run = dlt.Pipeline.run +def noop(*args, **kwards): pass + +dlt.Pipeline.run = noop + +class PipelineMember(TypedDict): + name: str + instance: t.Union[dlt.Pipeline, DltResource, DltSource] + + +class DltInventory(TypedDict): + pipelines: t.List[PipelineMember] + sources: t.List[PipelineMember] + resources: t.List[PipelineMember] + + +@track_command("run", False) +def run_pipeline_command( + module: str, + pipeline: t.Optional[str] = None, + source: t.Optional[str] = None, + args: t.Optional[str] = None, +): + pick_first_pipeline_and_source = not pipeline and not source + pipeline_module = load_module(module) + + pipeline_members = extract_dlt_members(pipeline_module) + errors = validate_mvp_pipeline(pipeline_members) + if errors: + for err in errors: + fmt.echo(err) + return -1 + + run_options = prepare_run_arguments(args) + dlt.Pipeline.run = original_run + if pick_first_pipeline_and_source: + fmt.echo( + "Neiter of pipeline name or source were specified, " + "we will pick first pipeline and a source to run" + ) + + pipeline_instance = pipeline_members["pipelines"][0]["instance"] + if resources := pipeline_members["resources"]: + data_source = resources[0]["instance"] + else: + data_source = pipeline_members["sources"][0]["instance"] + + pipeline_instance.run(data_source(), **run_options) + else: + pipeline_instance = get_pipeline_by_name(pipeline, pipeline_members) + resource_instance = get_resource_by_name(source, pipeline_members) + pipeline_instance.run(resource_instance(), **run_options) + + +def load_module(module_path: str) -> ModuleType: + if not os.path.exists(module_path): + fmt.echo(f'Module or file "{module_path}" does not exist') + return -1 + + module_name = module_path[:] + if module_name.endswith(".py"): + module_name = module_path[:-3] + + spec = im.spec_from_file_location(module_name, module_path) + module = spec.loader.load_module(module_name) + + return module + + +def extract_dlt_members(module: ModuleType) -> DltInventory: + variables: DltInventory = defaultdict(list) + for name, value in it.getmembers(module): + # We would like to skip private variables and other modules + if not it.ismodule(value) and not name.startswith("_"): + if isinstance(value, dlt.Pipeline): + variables["pipelines"].append( + { + "name": value.pipeline_name, + "instance": value, + } + ) + + if isinstance(value, DltSource): + variables["sources"].append( + { + "name": value.name, + "instance": value, + } + ) + + if isinstance(value, DltResource): + variables["resources"].append( + { + "name": value.name, + "instance": value, + } + ) + + return variables + + +def validate_mvp_pipeline(pipeline_memebers: DltInventory) -> t.List[str]: + errors = [] + if not pipeline_memebers.get("pipelines"): + errors.append("Could not find any pipeline in the given module") + + if not pipeline_memebers.get("resources") and not pipeline_memebers.get("sources"): + errors.append("Could not find any source or resource in the given module") + + return errors + + +def prepare_run_arguments(arglist: t.List[str]) -> t.Dict[str, str]: + run_options = {} + for arg in arglist: + arg_name, value = arg.split("=") + run_options[arg_name] = value + + return run_options + + +def get_pipeline_by_name(pipeline_name: str, members: DltInventory) -> t.Optional[dlt.Pipeline]: + for pipeline_item in members["pipelines"]: + if pipeline_item["name"] == pipeline_name: + return pipeline_item["instace"] + + return None + + +def get_resource_by_name( + resource_name: str, members: DltInventory +) -> t.Optional[t.Union[DltResource, DltSource]]: + for source_item in chain(members["resources"], members["sources"]): + if source_item["name"] == resource_name: + return source_item["instace"] From 6c3695556e28b0ba202cad69df8608b57cefb3c8 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Wed, 13 Mar 2024 19:30:52 +0100 Subject: [PATCH 03/59] Initial implementation of runner usinf reflection and code generation --- dlt/cli/run_command.py | 136 +++------------------- dlt/common/cli/__init__.py | 0 dlt/common/cli/runner.py | 193 +++++++++++++++++++++++++++++++ dlt/common/reflection/utils.py | 7 +- dlt/reflection/script_visitor.py | 5 +- 5 files changed, 216 insertions(+), 125 deletions(-) create mode 100644 dlt/common/cli/__init__.py create mode 100644 dlt/common/cli/runner.py diff --git a/dlt/cli/run_command.py b/dlt/cli/run_command.py index b3a165810c..3a61e1263e 100644 --- a/dlt/cli/run_command.py +++ b/dlt/cli/run_command.py @@ -1,24 +1,13 @@ -from itertools import chain -import os -import inspect as it import typing as t -from collections import defaultdict -from importlib import util as im -from types import ModuleType - import dlt from dlt.cli import echo as fmt from dlt.cli.utils import track_command +from dlt.common.cli.runner import RunnerInventory, DltRunnerEnvironment from dlt.sources import DltResource, DltSource from typing_extensions import TypedDict -original_run = dlt.Pipeline.run -def noop(*args, **kwards): - pass - -dlt.Pipeline.run = noop class PipelineMember(TypedDict): name: str @@ -39,114 +28,17 @@ def run_pipeline_command( args: t.Optional[str] = None, ): pick_first_pipeline_and_source = not pipeline and not source - pipeline_module = load_module(module) - - pipeline_members = extract_dlt_members(pipeline_module) - errors = validate_mvp_pipeline(pipeline_members) - if errors: - for err in errors: - fmt.echo(err) + inventory = RunnerInventory( + module, + pipeline_name=pipeline, + source_name=source, + args=args, + run_first_pipeline_with_source=pick_first_pipeline_and_source, + ) + + try: + dlt_environment = DltRunnerEnvironment(inventory=inventory) + dlt_environment.run() + except RuntimeError as ex: + fmt.echo(str(ex)) return -1 - - run_options = prepare_run_arguments(args) - dlt.Pipeline.run = original_run - if pick_first_pipeline_and_source: - fmt.echo( - "Neiter of pipeline name or source were specified, " - "we will pick first pipeline and a source to run" - ) - - pipeline_instance = pipeline_members["pipelines"][0]["instance"] - if resources := pipeline_members["resources"]: - data_source = resources[0]["instance"] - else: - data_source = pipeline_members["sources"][0]["instance"] - - pipeline_instance.run(data_source(), **run_options) - else: - pipeline_instance = get_pipeline_by_name(pipeline, pipeline_members) - resource_instance = get_resource_by_name(source, pipeline_members) - pipeline_instance.run(resource_instance(), **run_options) - - -def load_module(module_path: str) -> ModuleType: - if not os.path.exists(module_path): - fmt.echo(f'Module or file "{module_path}" does not exist') - return -1 - - module_name = module_path[:] - if module_name.endswith(".py"): - module_name = module_path[:-3] - - spec = im.spec_from_file_location(module_name, module_path) - module = spec.loader.load_module(module_name) - - return module - - -def extract_dlt_members(module: ModuleType) -> DltInventory: - variables: DltInventory = defaultdict(list) - for name, value in it.getmembers(module): - # We would like to skip private variables and other modules - if not it.ismodule(value) and not name.startswith("_"): - if isinstance(value, dlt.Pipeline): - variables["pipelines"].append( - { - "name": value.pipeline_name, - "instance": value, - } - ) - - if isinstance(value, DltSource): - variables["sources"].append( - { - "name": value.name, - "instance": value, - } - ) - - if isinstance(value, DltResource): - variables["resources"].append( - { - "name": value.name, - "instance": value, - } - ) - - return variables - - -def validate_mvp_pipeline(pipeline_memebers: DltInventory) -> t.List[str]: - errors = [] - if not pipeline_memebers.get("pipelines"): - errors.append("Could not find any pipeline in the given module") - - if not pipeline_memebers.get("resources") and not pipeline_memebers.get("sources"): - errors.append("Could not find any source or resource in the given module") - - return errors - - -def prepare_run_arguments(arglist: t.List[str]) -> t.Dict[str, str]: - run_options = {} - for arg in arglist: - arg_name, value = arg.split("=") - run_options[arg_name] = value - - return run_options - - -def get_pipeline_by_name(pipeline_name: str, members: DltInventory) -> t.Optional[dlt.Pipeline]: - for pipeline_item in members["pipelines"]: - if pipeline_item["name"] == pipeline_name: - return pipeline_item["instace"] - - return None - - -def get_resource_by_name( - resource_name: str, members: DltInventory -) -> t.Optional[t.Union[DltResource, DltSource]]: - for source_item in chain(members["resources"], members["sources"]): - if source_item["name"] == resource_name: - return source_item["instace"] diff --git a/dlt/common/cli/__init__.py b/dlt/common/cli/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dlt/common/cli/runner.py b/dlt/common/cli/runner.py new file mode 100644 index 0000000000..4c88a4bd59 --- /dev/null +++ b/dlt/common/cli/runner.py @@ -0,0 +1,193 @@ +import ast +from collections import defaultdict +import inspect +import os +import typing as t + +from dataclasses import dataclass +from importlib import machinery as im +from importlib import util as iu +from types import ModuleType + +import dlt + +from dlt.cli import echo as fmt +from dlt.common.configuration.providers.environ import EnvironProvider +from dlt.common.configuration.providers.toml import ConfigTomlProvider, SecretsTomlProvider +from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext +from dlt.sources import DltResource, DltSource +from dlt.cli.utils import parse_init_script +from dlt.common.reflection.utils import evaluate_node_literal +from dlt.reflection import names as rn +from dlt.reflection.script_visitor import PipelineScriptVisitor + + +@dataclass +class RunnerInventory: + """This class contains parameters passed to command + Also it provides two convenience methods to load script contents + and to get module name from provided `script_path`. + """ + + script_path: str + pipeline_name: t.Optional[str] = None + source_name: t.Optional[str] = None + run_first_pipeline_with_source: t.Optional[bool] = False + args: t.List[str] = None + + @property + def script_contents(self) -> str: + """Loads script contents""" + with open(self.script_path) as fp: + return fp.read() + + @property + def module_name(self) -> str: + """Strips extension with path and returns filename as modulename""" + module_name = self.script_path.split(os.sep)[-1] + if module_name.endswith(".py"): + module_name = module_name[:-3] + + return module_name + + @property + def run_arguments(self) -> t.Dict[str, str]: + run_options = {} + for arg in self.args or []: + arg_name, value = arg.split("=") + run_options[arg_name] = value + + return run_options + + +class PipelineRunner: + def __init__(self, inventory: RunnerInventory, visitor: PipelineScriptVisitor) -> None: + self.inventory = inventory + self.visitor = visitor + self.raise_if_not_runnable() + self.pipeline_source = self.strip_pipeline_runs() + self.module = self.load_module() + self.run_options = self.inventory.run_arguments + self.workdir = os.path.dirname(os.path.abspath(self.inventory.script_path)) + + def run(self): + config_path = f"{self.workdir}/.dlt" + ConfigProvidersContext.initial_providers = [ + EnvironProvider(), + SecretsTomlProvider(project_dir=config_path, add_global_config=False), + ConfigTomlProvider(project_dir=config_path, add_global_config=False), + ] + + pick_first = not self.inventory.pipeline_name and not self.inventory.source_name + if pick_first: + fmt.echo( + "Neiter of pipeline name or source were specified, " + "we will pick first pipeline and a source to run" + ) + + pipeline_name = self.inventory.pipeline_name + resource_name = self.inventory.source_name + pipelines = list(self.pipelines.values()) + resources = list(self.visitor.known_sources_resources.keys()) + if pick_first: + resource = getattr(self.module, resources[0]) + pipeline_instance = pipelines[0] + else: + resource = getattr(self.module, resource_name) + pipeline_instance = self.pipelines[pipeline_name] + setattr(self.module, f"pipeline_{pipeline_name}", pipeline_instance) + + pipeline_instance.run(resource(), **self.run_options) + + @property + def run_nodes(self) -> t.List[ast.AST]: + """Extract pipeline.run nodes""" + pipeline_runs = self.visitor.known_calls_with_nodes.get(rn.RUN) + run_nodes = [run_node for _args, run_node in pipeline_runs] + return run_nodes + + @property + def sources(self) -> t.List[str]: + """Returns source and resource names""" + return self.visitor.known_sources_resources.keys() + + @property + def pipelines(self) -> t.Dict[str, dlt.Pipeline]: + pipeline_arguments: t.List[inspect.BoundArguments] = ( + self.visitor.known_calls.get("pipeline") or [] + ) + pipeline_items = defaultdict(dict) + for item in pipeline_arguments: + pipeline_options = {} + for arg_name, bound_value in item.arguments.items(): + if bound_value is not None: + if arg_name == "kwargs": + pipeline_options.update(bound_value) + continue + + if hasattr(bound_value, "value"): + value = bound_value.value + else: + value = bound_value + pipeline_options[arg_name] = value + + pipeline = dlt.pipeline(**pipeline_options) + pipeline.working_dir = os.path.dirname(os.path.abspath(self.inventory.script_path)) + pipeline_items[pipeline_options["pipeline_name"]] = pipeline + + return pipeline_items + + def strip_pipeline_runs(self) -> str: + """Removes all pipeline.run nodes and return patched source code""" + # Copy original source + script_lines: t.List[str] = self.visitor.source_lines[:] + stub = '""" run method replaced """' + + def restore_indent(line: str) -> str: + indent = "" + for ch in line: + if ch == " ": + indent += " " + return indent + + for node in self.run_nodes: + # if it is a one liner + if node.lineno == node.end_lineno: + script_lines[node.lineno] = None + else: + line_of_code = script_lines[node.lineno - 1] + script_lines[node.lineno - 1] = restore_indent(line_of_code) + stub + start = node.lineno + 1 + while start <= node.end_lineno: + script_lines[start - 1] = None + start += 1 + + result = [line.rstrip() for line in script_lines if line] + return "\n".join(result) + + def load_module(self) -> ModuleType: + spec = im.ModuleSpec(name=self.inventory.module_name, loader=None) + module = iu.module_from_spec(spec) + exec(self.pipeline_source, module.__dict__) + return module + + def raise_if_not_runnable(self) -> None: + if not self.visitor.known_calls.get("pipeline"): + raise RuntimeError("Could not find any pipeline in the given module") + + if not self.visitor.known_sources_resources: + raise RuntimeError("Could not find any source or resource in the given module") + + +class DltRunnerEnvironment: + def __init__(self, inventory: RunnerInventory) -> None: + self.inventory = inventory + visitor: PipelineScriptVisitor = parse_init_script( + "run", + self.inventory.script_contents, + self.inventory.module_name, + ) + self.runner = PipelineRunner(inventory, visitor) + + def run(self) -> None: + self.runner.run() diff --git a/dlt/common/reflection/utils.py b/dlt/common/reflection/utils.py index 9bd3cb6775..3ce8ab8ba6 100644 --- a/dlt/common/reflection/utils.py +++ b/dlt/common/reflection/utils.py @@ -78,8 +78,11 @@ def creates_func_def_name_node(func_def: ast.FunctionDef, source_lines: Sequence def rewrite_python_script( source_script_lines: List[str], transformed_nodes: List[Tuple[ast.AST, ast.AST]] ) -> List[str]: - """Replaces all the nodes present in `transformed_nodes` in the `script_lines`. The `transformed_nodes` is a tuple where the first element - is must be a node with full location information created out of `script_lines`""" + """Replaces all the nodes present in `transformed_nodes` in the `script_lines`. + + The `transformed_nodes` is a tuple where the first element must be a node with + full location information created out of `script_lines` + """ script_lines: List[str] = [] last_line = -1 last_offset = -1 diff --git a/dlt/reflection/script_visitor.py b/dlt/reflection/script_visitor.py index 52b19fe031..90129395b1 100644 --- a/dlt/reflection/script_visitor.py +++ b/dlt/reflection/script_visitor.py @@ -2,7 +2,8 @@ import ast import astunparse from ast import NodeVisitor -from typing import Any, Dict, List +from collections import defaultdict +from typing import Any, Dict, List, Tuple from dlt.common.reflection.utils import find_outer_func_def @@ -19,6 +20,7 @@ def __init__(self, source: str): # self.source_aliases: Dict[str, str] = {} self.is_destination_imported: bool = False self.known_calls: Dict[str, List[inspect.BoundArguments]] = {} + self.known_calls_with_nodes: Dict[str, List[Tuple[inspect.BoundArguments, ast.AST]]] = defaultdict(list) self.known_sources: Dict[str, ast.FunctionDef] = {} self.known_source_calls: Dict[str, List[ast.Call]] = {} self.known_resources: Dict[str, ast.FunctionDef] = {} @@ -104,6 +106,7 @@ def visit_Call(self, node: ast.Call) -> Any: # print(f"ALIAS: {alias_name} of {self.func_aliases.get(alias_name)} with {bound_args}") fun_calls = self.known_calls.setdefault(fn, []) fun_calls.append(bound_args) + self.known_calls_with_nodes[fn].append((bound_args, node)) except TypeError: # skip the signature pass From b7fa2386b9c2a102b65af8696c735da197bf410e Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Thu, 14 Mar 2024 13:44:26 +0100 Subject: [PATCH 04/59] Add more runtime information print outs --- dlt/common/cli/runner.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/dlt/common/cli/runner.py b/dlt/common/cli/runner.py index 4c88a4bd59..a7354b177f 100644 --- a/dlt/common/cli/runner.py +++ b/dlt/common/cli/runner.py @@ -3,7 +3,7 @@ import inspect import os import typing as t - +import importlib from dataclasses import dataclass from importlib import machinery as im from importlib import util as iu @@ -71,18 +71,14 @@ def __init__(self, inventory: RunnerInventory, visitor: PipelineScriptVisitor) - self.workdir = os.path.dirname(os.path.abspath(self.inventory.script_path)) def run(self): - config_path = f"{self.workdir}/.dlt" - ConfigProvidersContext.initial_providers = [ - EnvironProvider(), - SecretsTomlProvider(project_dir=config_path, add_global_config=False), - ConfigTomlProvider(project_dir=config_path, add_global_config=False), - ] - pick_first = not self.inventory.pipeline_name and not self.inventory.source_name if pick_first: fmt.echo( - "Neiter of pipeline name or source were specified, " - "we will pick first pipeline and a source to run" + fmt.style( + "Pipeline name and source not specified, " + "we will pick first pipeline and a source to run", + fg="blue", + ) ) pipeline_name = self.inventory.pipeline_name @@ -95,9 +91,22 @@ def run(self): else: resource = getattr(self.module, resource_name) pipeline_instance = self.pipelines[pipeline_name] - setattr(self.module, f"pipeline_{pipeline_name}", pipeline_instance) - pipeline_instance.run(resource(), **self.run_options) + setattr(self.module, f"pipeline_{pipeline_name}", pipeline_instance) + + fmt.echo( + fmt.style("Pipeline workdir", fg="black", bg="blue") + + f": {pipeline_instance.working_dir}" + ) + + fmt.echo(f"Runtime options for pipeline: {pipeline_instance.pipeline_name}\n") + for key, val in self.run_options.items(): + fmt.echo(f" {fmt.style(key, fg='green')}={val}") + + pipeline_instance.activate() + load_info = pipeline_instance.run(resource(), **self.run_options) + fmt.echo("") + fmt.echo(load_info) @property def run_nodes(self) -> t.List[ast.AST]: @@ -132,7 +141,7 @@ def pipelines(self) -> t.Dict[str, dlt.Pipeline]: pipeline_options[arg_name] = value pipeline = dlt.pipeline(**pipeline_options) - pipeline.working_dir = os.path.dirname(os.path.abspath(self.inventory.script_path)) + pipeline.working_dir = self.workdir pipeline_items[pipeline_options["pipeline_name"]] = pipeline return pipeline_items From f5218d910da752efb9c1fc9718d318811f5049c8 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 15 Mar 2024 12:23:47 +0100 Subject: [PATCH 05/59] Refactor cli runner implementation --- dlt/cli/echo.py | 4 + dlt/cli/run_command.py | 11 +- dlt/common/cli/runner.py | 202 ----------------------- dlt/common/cli/runner/__init__.py | 10 ++ dlt/common/cli/runner/errors.py | 8 + dlt/common/cli/runner/inquirer.py | 74 +++++++++ dlt/common/cli/runner/pipeline_script.py | 112 +++++++++++++ dlt/common/cli/runner/runner.py | 40 +++++ dlt/common/cli/runner/source_patcher.py | 47 ++++++ dlt/common/cli/runner/types.py | 20 +++ 10 files changed, 321 insertions(+), 207 deletions(-) delete mode 100644 dlt/common/cli/runner.py create mode 100644 dlt/common/cli/runner/__init__.py create mode 100644 dlt/common/cli/runner/errors.py create mode 100644 dlt/common/cli/runner/inquirer.py create mode 100644 dlt/common/cli/runner/pipeline_script.py create mode 100644 dlt/common/cli/runner/runner.py create mode 100644 dlt/common/cli/runner/source_patcher.py create mode 100644 dlt/common/cli/runner/types.py diff --git a/dlt/cli/echo.py b/dlt/cli/echo.py index bd9cf24f64..68befa8079 100644 --- a/dlt/cli/echo.py +++ b/dlt/cli/echo.py @@ -37,6 +37,10 @@ def error(msg: str) -> None: click.secho("ERROR: " + msg, fg="red") +def info(msg: str) -> None: + click.secho("INFO: " + msg, fg="blue") + + def warning(msg: str) -> None: click.secho("WARNING: " + msg, fg="yellow") diff --git a/dlt/cli/run_command.py b/dlt/cli/run_command.py index 3a61e1263e..055030d6c4 100644 --- a/dlt/cli/run_command.py +++ b/dlt/cli/run_command.py @@ -4,7 +4,8 @@ from dlt.cli import echo as fmt from dlt.cli.utils import track_command -from dlt.common.cli.runner import RunnerInventory, DltRunnerEnvironment +from dlt.common.cli.runner.runner import PipelineRunner +from dlt.common.cli.runner.types import RunnerInventory from dlt.sources import DltResource, DltSource from typing_extensions import TypedDict @@ -27,18 +28,18 @@ def run_pipeline_command( source: t.Optional[str] = None, args: t.Optional[str] = None, ): - pick_first_pipeline_and_source = not pipeline and not source inventory = RunnerInventory( module, pipeline_name=pipeline, source_name=source, args=args, - run_first_pipeline_with_source=pick_first_pipeline_and_source, ) try: - dlt_environment = DltRunnerEnvironment(inventory=inventory) - dlt_environment.run() + with PipelineRunner(inventory=inventory) as runner: + load_info = runner.run() + fmt.echo("") + fmt.echo(load_info) except RuntimeError as ex: fmt.echo(str(ex)) return -1 diff --git a/dlt/common/cli/runner.py b/dlt/common/cli/runner.py deleted file mode 100644 index a7354b177f..0000000000 --- a/dlt/common/cli/runner.py +++ /dev/null @@ -1,202 +0,0 @@ -import ast -from collections import defaultdict -import inspect -import os -import typing as t -import importlib -from dataclasses import dataclass -from importlib import machinery as im -from importlib import util as iu -from types import ModuleType - -import dlt - -from dlt.cli import echo as fmt -from dlt.common.configuration.providers.environ import EnvironProvider -from dlt.common.configuration.providers.toml import ConfigTomlProvider, SecretsTomlProvider -from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext -from dlt.sources import DltResource, DltSource -from dlt.cli.utils import parse_init_script -from dlt.common.reflection.utils import evaluate_node_literal -from dlt.reflection import names as rn -from dlt.reflection.script_visitor import PipelineScriptVisitor - - -@dataclass -class RunnerInventory: - """This class contains parameters passed to command - Also it provides two convenience methods to load script contents - and to get module name from provided `script_path`. - """ - - script_path: str - pipeline_name: t.Optional[str] = None - source_name: t.Optional[str] = None - run_first_pipeline_with_source: t.Optional[bool] = False - args: t.List[str] = None - - @property - def script_contents(self) -> str: - """Loads script contents""" - with open(self.script_path) as fp: - return fp.read() - - @property - def module_name(self) -> str: - """Strips extension with path and returns filename as modulename""" - module_name = self.script_path.split(os.sep)[-1] - if module_name.endswith(".py"): - module_name = module_name[:-3] - - return module_name - - @property - def run_arguments(self) -> t.Dict[str, str]: - run_options = {} - for arg in self.args or []: - arg_name, value = arg.split("=") - run_options[arg_name] = value - - return run_options - - -class PipelineRunner: - def __init__(self, inventory: RunnerInventory, visitor: PipelineScriptVisitor) -> None: - self.inventory = inventory - self.visitor = visitor - self.raise_if_not_runnable() - self.pipeline_source = self.strip_pipeline_runs() - self.module = self.load_module() - self.run_options = self.inventory.run_arguments - self.workdir = os.path.dirname(os.path.abspath(self.inventory.script_path)) - - def run(self): - pick_first = not self.inventory.pipeline_name and not self.inventory.source_name - if pick_first: - fmt.echo( - fmt.style( - "Pipeline name and source not specified, " - "we will pick first pipeline and a source to run", - fg="blue", - ) - ) - - pipeline_name = self.inventory.pipeline_name - resource_name = self.inventory.source_name - pipelines = list(self.pipelines.values()) - resources = list(self.visitor.known_sources_resources.keys()) - if pick_first: - resource = getattr(self.module, resources[0]) - pipeline_instance = pipelines[0] - else: - resource = getattr(self.module, resource_name) - pipeline_instance = self.pipelines[pipeline_name] - - setattr(self.module, f"pipeline_{pipeline_name}", pipeline_instance) - - fmt.echo( - fmt.style("Pipeline workdir", fg="black", bg="blue") - + f": {pipeline_instance.working_dir}" - ) - - fmt.echo(f"Runtime options for pipeline: {pipeline_instance.pipeline_name}\n") - for key, val in self.run_options.items(): - fmt.echo(f" {fmt.style(key, fg='green')}={val}") - - pipeline_instance.activate() - load_info = pipeline_instance.run(resource(), **self.run_options) - fmt.echo("") - fmt.echo(load_info) - - @property - def run_nodes(self) -> t.List[ast.AST]: - """Extract pipeline.run nodes""" - pipeline_runs = self.visitor.known_calls_with_nodes.get(rn.RUN) - run_nodes = [run_node for _args, run_node in pipeline_runs] - return run_nodes - - @property - def sources(self) -> t.List[str]: - """Returns source and resource names""" - return self.visitor.known_sources_resources.keys() - - @property - def pipelines(self) -> t.Dict[str, dlt.Pipeline]: - pipeline_arguments: t.List[inspect.BoundArguments] = ( - self.visitor.known_calls.get("pipeline") or [] - ) - pipeline_items = defaultdict(dict) - for item in pipeline_arguments: - pipeline_options = {} - for arg_name, bound_value in item.arguments.items(): - if bound_value is not None: - if arg_name == "kwargs": - pipeline_options.update(bound_value) - continue - - if hasattr(bound_value, "value"): - value = bound_value.value - else: - value = bound_value - pipeline_options[arg_name] = value - - pipeline = dlt.pipeline(**pipeline_options) - pipeline.working_dir = self.workdir - pipeline_items[pipeline_options["pipeline_name"]] = pipeline - - return pipeline_items - - def strip_pipeline_runs(self) -> str: - """Removes all pipeline.run nodes and return patched source code""" - # Copy original source - script_lines: t.List[str] = self.visitor.source_lines[:] - stub = '""" run method replaced """' - - def restore_indent(line: str) -> str: - indent = "" - for ch in line: - if ch == " ": - indent += " " - return indent - - for node in self.run_nodes: - # if it is a one liner - if node.lineno == node.end_lineno: - script_lines[node.lineno] = None - else: - line_of_code = script_lines[node.lineno - 1] - script_lines[node.lineno - 1] = restore_indent(line_of_code) + stub - start = node.lineno + 1 - while start <= node.end_lineno: - script_lines[start - 1] = None - start += 1 - - result = [line.rstrip() for line in script_lines if line] - return "\n".join(result) - - def load_module(self) -> ModuleType: - spec = im.ModuleSpec(name=self.inventory.module_name, loader=None) - module = iu.module_from_spec(spec) - exec(self.pipeline_source, module.__dict__) - return module - - def raise_if_not_runnable(self) -> None: - if not self.visitor.known_calls.get("pipeline"): - raise RuntimeError("Could not find any pipeline in the given module") - - if not self.visitor.known_sources_resources: - raise RuntimeError("Could not find any source or resource in the given module") - - -class DltRunnerEnvironment: - def __init__(self, inventory: RunnerInventory) -> None: - self.inventory = inventory - visitor: PipelineScriptVisitor = parse_init_script( - "run", - self.inventory.script_contents, - self.inventory.module_name, - ) - self.runner = PipelineRunner(inventory, visitor) - - def run(self) -> None: - self.runner.run() diff --git a/dlt/common/cli/runner/__init__.py b/dlt/common/cli/runner/__init__.py new file mode 100644 index 0000000000..bab52a205f --- /dev/null +++ b/dlt/common/cli/runner/__init__.py @@ -0,0 +1,10 @@ +from dlt.common.cli.runner.pipeline_script import PipelineScript +from dlt.common.cli.runner.runner import PipelineRunner +from dlt.common.cli.runner.types import PipelineMembers, RunnerInventory + +__all__ = ( + "PipelineRunner", + "RunnerInventory", + "PipelineMembers", + "PipelineScript", +) diff --git a/dlt/common/cli/runner/errors.py b/dlt/common/cli/runner/errors.py new file mode 100644 index 0000000000..5748ec0423 --- /dev/null +++ b/dlt/common/cli/runner/errors.py @@ -0,0 +1,8 @@ +class RunnerError(Exception): + def __init__(self, message: str) -> None: + self.message = message + super().__init__(message) + + +class FriendlyExit(Exception): + pass diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py new file mode 100644 index 0000000000..8060d57b52 --- /dev/null +++ b/dlt/common/cli/runner/inquirer.py @@ -0,0 +1,74 @@ +import typing as t + +import click +import dlt + +from dlt.cli import echo as fmt +from dlt.common.cli.runner.errors import FriendlyExit +from dlt.common.cli.runner.types import PipelineMembers +from dlt.sources import DltResource, DltSource + + +select_message = """Please select your %s: +%s +""" + + +def make_select_options( + select_what: str, items: t.Dict[str, t.Any] +) -> t.Tuple[str, t.List[int], t.List[str]]: + options = [] + option_values = [] + choice_message = "" + for idx, name in enumerate(items.keys()): + choice_message += f"{idx} - {name}\n" + options.append(str(idx)) + option_values.append(name) + + choice_message += "n - exit\n" + options.append("n") + return select_message % (select_what, choice_message), options, option_values + + +class Inquirer: + """This class handles user input to allow users to select pipeline and re/sources""" + + def __init__(self, pipeline_members: PipelineMembers) -> None: + self.pipelines = pipeline_members["pipelines"] + self.sources = pipeline_members["sources"] + + def ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: + """Shows prompts to select pipeline and re/source + + Returns: + (DltResource, DltSource): a pair of pipeline and re/source + """ + # save first pipeline and source + pipeline_name, _ = next(iter(self.pipelines.items())) + source_name, _ = next(iter(self.pipelines.items())) + if len(self.pipelines) > 1: + message, options, values = make_select_options("pipeline", self.pipelines) + choice = fmt.prompt(message, options, default="n") + if choice == "n": + raise FriendlyExit() + else: + pipeline_name = values[int(choice)] + + if len(self.sources) > 1: + message, options, values = make_select_options("re/source", self.sources) + choice = fmt.prompt(message, options, default="n") + if choice == "n": + raise FriendlyExit() + else: + source_name = values[int(choice)] + + pipeline = self.pipelines[pipeline_name] + resource = self.sources[source_name] + fmt.echo("Pipeline: " + fmt.style(pipeline_name, fg="blue", underline=True)) + + if isinstance(resource, DltResource): + label = "Resource" + else: + label = "Source" + fmt.echo(f"{label}: " + fmt.style(source_name, fg="blue", underline=True)) + return pipeline, self.sources[source_name] diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py new file mode 100644 index 0000000000..10ab81bc05 --- /dev/null +++ b/dlt/common/cli/runner/pipeline_script.py @@ -0,0 +1,112 @@ +import os +import importlib +import inspect +import tempfile +import typing as t +import sys + +from collections import defaultdict +from types import ModuleType + +import dlt + +from dlt.cli.utils import parse_init_script +from dlt.common.cli.runner.source_patcher import SourcePatcher +from dlt.common.cli.runner.types import PipelineMembers, RunnerInventory +from dlt.sources import DltResource, DltSource + + +COMMAND_NAME: t.Final[str] = "run" + + +class PipelineScript: + """Handles pipeline source code and prepares it to run + + Things it does + + 1. Reads the source code, + 2. Stubs all pipeline.run calls, + 2. Prepares module name + """ + + def __init__(self, inventory: RunnerInventory) -> None: + self.inventory = inventory + """This means that user didn't specify pipeline name or re/source name + + And we need to check if there is 1 pipeline and 1 re/source to run right + away if there are multiple re/sources then we need to provide a CLI prompt + """ + self.workdir = os.path.dirname(os.path.abspath(inventory.script_path)) + """Directory in which pipeline script lives""" + + # Now we need to patch and store pipeline code + visitor = parse_init_script( + COMMAND_NAME, + self.script_contents, + self.module_name, + ) + patcher = SourcePatcher(visitor) + self.source_code = patcher.patch() + + def load_module(self, script_path: str) -> ModuleType: + """Loads pipeline module from a given location""" + spec = importlib.util.spec_from_file_location(self.module_name, script_path) + module = importlib.util.module_from_spec(spec) + sys.modules[self.module_name] = module + spec.loader.exec_module(module) + return module + + @property + def pipeline_module(self) -> ModuleType: + with tempfile.NamedTemporaryFile( + mode="w+", + dir=self.workdir, + prefix="pipeline_", + suffix=".py", + ) as tm: + tm.write(self.source_code) + tm.flush() + self.module = self.load_module(tm.name) + + return self.module + + @property + def script_contents(self) -> str: + """Loads script contents""" + with open(self.inventory.script_path) as fp: + return fp.read() + + @property + def module_name(self) -> str: + """Strips extension with path and returns filename as modulename""" + module_name = self.inventory.script_path.split(os.sep)[-1] + if module_name.endswith(".py"): + module_name = module_name[:-3] + + return module_name + + @property + def pipeline_members(self) -> PipelineMembers: + """Inspect the module and return pipelines with re/sources""" + members: PipelineMembers = defaultdict(dict) + for name, value in inspect.getmembers(self.pipeline_module): + # skipe modules and private stuff + if inspect.ismodule(value) or name.startswith("_"): + continue + + if isinstance(value, dlt.Pipeline): + members["pipelines"][value.pipeline_name] = value + + if isinstance(value, (DltResource, DltSource)): + members["sources"][value.name] = value + + return members + + @property + def run_arguments(self) -> t.Dict[str, str]: + run_options = {} + for arg in self.inventory.args or []: + arg_name, value = arg.split("=") + run_options[arg_name] = value + + return run_options diff --git a/dlt/common/cli/runner/runner.py b/dlt/common/cli/runner/runner.py new file mode 100644 index 0000000000..96afcb7eb1 --- /dev/null +++ b/dlt/common/cli/runner/runner.py @@ -0,0 +1,40 @@ +import typing as t + +from dlt.cli import echo as fmt +from dlt.common.cli.runner.inquirer import Inquirer +from dlt.common.pipeline import LoadInfo +from dlt.common.cli.runner.errors import FriendlyExit, RunnerError +from dlt.common.cli.runner.pipeline_script import PipelineScript +from dlt.common.cli.runner.types import PipelineMembers, RunnerInventory + + +class PipelineRunner: + def __init__(self, inventory: RunnerInventory) -> None: + self.inventory = inventory + self.script = PipelineScript(inventory) + self.yolo = self.inventory.pipeline_name is None and self.inventory.source_name is None + + def __enter__(self) -> t.Self: + self.check_if_runnable(self.script.pipeline_members) + return self + + def __exit__(self, exc_type, exc_value, traceback): + pass + + def run(self) -> LoadInfo: + try: + inquirer = Inquirer(self.script.pipeline_members) + pipeline, resource = inquirer.ask() + except FriendlyExit: + fmt.info("Stopping...") + return + + load_info = pipeline.run(resource(), **self.script.run_arguments) + return load_info + + def check_if_runnable(self, pipeline_members: PipelineMembers) -> None: + if not pipeline_members["pipelines"]: + raise RunnerError(f"No pipelines found in {self.inventory.script_path}") + + if not pipeline_members["sources"]: + raise RunnerError(f"Could not find any source or resource {self.inventory.script_path}") diff --git a/dlt/common/cli/runner/source_patcher.py b/dlt/common/cli/runner/source_patcher.py new file mode 100644 index 0000000000..bfae7a0c82 --- /dev/null +++ b/dlt/common/cli/runner/source_patcher.py @@ -0,0 +1,47 @@ +"""This module provides a utility to patch source code of pipeline scripts +to remove all `pipeline.run` calls to avoid any undesired side effects when +running the pipeline with given re/sources +""" +import typing as t + +from dlt.reflection import names as rn +from dlt.reflection.script_visitor import PipelineScriptVisitor + + +class SourcePatcher: + def __init__(self, visitor: PipelineScriptVisitor) -> None: + self.visitor = visitor + + def patch(self) -> str: + """Removes all pipeline.run nodes and return patched source code""" + # Extract pipeline.run nodes + pipeline_runs = self.visitor.known_calls_with_nodes.get(rn.RUN) + run_nodes = [run_node for _args, run_node in pipeline_runs] + + # Copy original source + script_lines: t.List[str] = self.visitor.source_lines[:] + stub = '""" run method replaced """' + + for node in run_nodes: + # if it is a one liner + if node.lineno == node.end_lineno: + script_lines[node.lineno] = None + else: + line_of_code = script_lines[node.lineno - 1] + script_lines[node.lineno - 1] = self.restore_indent(line_of_code) + stub + start = node.lineno + 1 + while start <= node.end_lineno: + script_lines[start - 1] = None + start += 1 + + result = [line.rstrip() for line in script_lines if line] + return "\n".join(result) + + def restore_indent(self, line: str) -> str: + indent = "" + for ch in line: + if ch != " ": + break + + indent += " " + return indent diff --git a/dlt/common/cli/runner/types.py b/dlt/common/cli/runner/types.py new file mode 100644 index 0000000000..e05f5ba087 --- /dev/null +++ b/dlt/common/cli/runner/types.py @@ -0,0 +1,20 @@ +import typing as t + +from dataclasses import dataclass + +import dlt + +from dlt.sources import DltResource, DltSource + + +@dataclass +class RunnerInventory: + script_path: str + pipeline_name: t.Optional[str] = None + source_name: t.Optional[str] = None + args: t.List[str] = None + + +class PipelineMembers(t.TypedDict): + pipelines: t.Dict[str, dlt.Pipeline] + sources: t.Dict[str, t.Union[DltResource, DltSource]] From 9e519004d86c355f114fe4e96f3036b7a44ebf44 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 15 Mar 2024 13:20:26 +0100 Subject: [PATCH 06/59] Rename RunnerInventory to RunnerParams --- dlt/common/cli/runner/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/common/cli/runner/types.py b/dlt/common/cli/runner/types.py index e05f5ba087..931e0ea3e6 100644 --- a/dlt/common/cli/runner/types.py +++ b/dlt/common/cli/runner/types.py @@ -8,7 +8,7 @@ @dataclass -class RunnerInventory: +class RunnerParams: script_path: str pipeline_name: t.Optional[str] = None source_name: t.Optional[str] = None From 756ed577868941064b9815af059600a85c368d12 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 15 Mar 2024 13:20:51 +0100 Subject: [PATCH 07/59] Show warning if current directory is different from the one of pipeline script --- dlt/common/cli/runner/runner.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/dlt/common/cli/runner/runner.py b/dlt/common/cli/runner/runner.py index 96afcb7eb1..2f471aa660 100644 --- a/dlt/common/cli/runner/runner.py +++ b/dlt/common/cli/runner/runner.py @@ -1,3 +1,4 @@ +import os import typing as t from dlt.cli import echo as fmt @@ -5,14 +6,14 @@ from dlt.common.pipeline import LoadInfo from dlt.common.cli.runner.errors import FriendlyExit, RunnerError from dlt.common.cli.runner.pipeline_script import PipelineScript -from dlt.common.cli.runner.types import PipelineMembers, RunnerInventory +from dlt.common.cli.runner.types import PipelineMembers, RunnerParams class PipelineRunner: - def __init__(self, inventory: RunnerInventory) -> None: - self.inventory = inventory - self.script = PipelineScript(inventory) - self.yolo = self.inventory.pipeline_name is None and self.inventory.source_name is None + def __init__(self, params: RunnerParams, current_dir: str) -> None: + self.params = params + self.current_dir = current_dir + self.script = PipelineScript(params) def __enter__(self) -> t.Self: self.check_if_runnable(self.script.pipeline_members) @@ -34,7 +35,14 @@ def run(self) -> LoadInfo: def check_if_runnable(self, pipeline_members: PipelineMembers) -> None: if not pipeline_members["pipelines"]: - raise RunnerError(f"No pipelines found in {self.inventory.script_path}") + raise RunnerError(f"No pipelines found in {self.params.script_path}") if not pipeline_members["sources"]: - raise RunnerError(f"Could not find any source or resource {self.inventory.script_path}") + raise RunnerError(f"Could not find any source or resource {self.params.script_path}") + + def check_workdir(self): + if self.current_dir != self.script.workdir: + fmt.warning( + "Current working directory is different from the pipeline script" + f" {self.script.workdir}" + ) From 01f941d7b251ddb4c5d36b6c3beafec7897d01fd Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 15 Mar 2024 13:21:24 +0100 Subject: [PATCH 08/59] Cleanup runner code --- dlt/cli/run_command.py | 22 ++++------------------ dlt/common/cli/runner/__init__.py | 4 ++-- dlt/common/cli/runner/pipeline_script.py | 14 +++++++------- 3 files changed, 13 insertions(+), 27 deletions(-) diff --git a/dlt/cli/run_command.py b/dlt/cli/run_command.py index 055030d6c4..5f9ce243ad 100644 --- a/dlt/cli/run_command.py +++ b/dlt/cli/run_command.py @@ -1,24 +1,10 @@ +import os import typing as t -import dlt - from dlt.cli import echo as fmt from dlt.cli.utils import track_command from dlt.common.cli.runner.runner import PipelineRunner -from dlt.common.cli.runner.types import RunnerInventory -from dlt.sources import DltResource, DltSource -from typing_extensions import TypedDict - - -class PipelineMember(TypedDict): - name: str - instance: t.Union[dlt.Pipeline, DltResource, DltSource] - - -class DltInventory(TypedDict): - pipelines: t.List[PipelineMember] - sources: t.List[PipelineMember] - resources: t.List[PipelineMember] +from dlt.common.cli.runner.types import RunnerParams @track_command("run", False) @@ -28,7 +14,7 @@ def run_pipeline_command( source: t.Optional[str] = None, args: t.Optional[str] = None, ): - inventory = RunnerInventory( + params = RunnerParams( module, pipeline_name=pipeline, source_name=source, @@ -36,7 +22,7 @@ def run_pipeline_command( ) try: - with PipelineRunner(inventory=inventory) as runner: + with PipelineRunner(params=params, current_dir=os.getcwd()) as runner: load_info = runner.run() fmt.echo("") fmt.echo(load_info) diff --git a/dlt/common/cli/runner/__init__.py b/dlt/common/cli/runner/__init__.py index bab52a205f..eafb741856 100644 --- a/dlt/common/cli/runner/__init__.py +++ b/dlt/common/cli/runner/__init__.py @@ -1,10 +1,10 @@ from dlt.common.cli.runner.pipeline_script import PipelineScript from dlt.common.cli.runner.runner import PipelineRunner -from dlt.common.cli.runner.types import PipelineMembers, RunnerInventory +from dlt.common.cli.runner.types import PipelineMembers, RunnerParams __all__ = ( "PipelineRunner", - "RunnerInventory", + "RunnerParams", "PipelineMembers", "PipelineScript", ) diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index 10ab81bc05..d09c9406d0 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -12,7 +12,7 @@ from dlt.cli.utils import parse_init_script from dlt.common.cli.runner.source_patcher import SourcePatcher -from dlt.common.cli.runner.types import PipelineMembers, RunnerInventory +from dlt.common.cli.runner.types import PipelineMembers, RunnerParams from dlt.sources import DltResource, DltSource @@ -29,14 +29,14 @@ class PipelineScript: 2. Prepares module name """ - def __init__(self, inventory: RunnerInventory) -> None: - self.inventory = inventory + def __init__(self, params: RunnerParams) -> None: + self.params = params """This means that user didn't specify pipeline name or re/source name And we need to check if there is 1 pipeline and 1 re/source to run right away if there are multiple re/sources then we need to provide a CLI prompt """ - self.workdir = os.path.dirname(os.path.abspath(inventory.script_path)) + self.workdir = os.path.dirname(os.path.abspath(params.script_path)) """Directory in which pipeline script lives""" # Now we need to patch and store pipeline code @@ -73,13 +73,13 @@ def pipeline_module(self) -> ModuleType: @property def script_contents(self) -> str: """Loads script contents""" - with open(self.inventory.script_path) as fp: + with open(self.params.script_path) as fp: return fp.read() @property def module_name(self) -> str: """Strips extension with path and returns filename as modulename""" - module_name = self.inventory.script_path.split(os.sep)[-1] + module_name = self.params.script_path.split(os.sep)[-1] if module_name.endswith(".py"): module_name = module_name[:-3] @@ -105,7 +105,7 @@ def pipeline_members(self) -> PipelineMembers: @property def run_arguments(self) -> t.Dict[str, str]: run_options = {} - for arg in self.inventory.args or []: + for arg in self.params.args or []: arg_name, value = arg.split("=") run_options[arg_name] = value From 6b1855ab38ebf1425feea57eb5d15bc884a83a96 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 15 Mar 2024 13:22:42 +0100 Subject: [PATCH 09/59] Adjust pipeline run argument builder --- dlt/common/cli/runner/pipeline_script.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index d09c9406d0..4db4a780dc 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -105,7 +105,10 @@ def pipeline_members(self) -> PipelineMembers: @property def run_arguments(self) -> t.Dict[str, str]: run_options = {} - for arg in self.params.args or []: + if not self.params.args: + return run_options + + for arg in self.params.args: arg_name, value = arg.split("=") run_options[arg_name] = value From bc564e3ee25737fbdc6222b6598fedc095ef1b06 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 15 Mar 2024 17:37:20 +0100 Subject: [PATCH 10/59] Add echo.error_style helper --- dlt/cli/echo.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dlt/cli/echo.py b/dlt/cli/echo.py index 68befa8079..1961d7b561 100644 --- a/dlt/cli/echo.py +++ b/dlt/cli/echo.py @@ -33,6 +33,10 @@ def warning_style(msg: str) -> str: return click.style(msg, fg="yellow", reset=True) +def error_style(msg: str) -> str: + return click.style(msg, fg="red", reset=True) + + def error(msg: str) -> None: click.secho("ERROR: " + msg, fg="red") From 4f550315dd7d51595cb32d58314354feb3de326f Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 15 Mar 2024 17:37:57 +0100 Subject: [PATCH 11/59] Move preflight checks into inquiry helper --- dlt/cli/_dlt.py | 2 +- dlt/cli/run_command.py | 16 ++++++-- dlt/common/cli/runner/errors.py | 4 ++ dlt/common/cli/runner/inquirer.py | 64 ++++++++++++++++++++++++++----- dlt/common/cli/runner/runner.py | 37 ++++-------------- dlt/common/cli/runner/types.py | 3 ++ 6 files changed, 84 insertions(+), 42 deletions(-) diff --git a/dlt/cli/_dlt.py b/dlt/cli/_dlt.py index e252657efa..7990b3b3ba 100644 --- a/dlt/cli/_dlt.py +++ b/dlt/cli/_dlt.py @@ -610,7 +610,7 @@ def main() -> int: if args.command == "schema": return schema_command_wrapper(args.file, args.format, args.remove_defaults) elif args.command == "run": - return run_pipeline_command(args.module, args.pipeline, args.source, args.args) + return run_pipeline_command(args.module, args.pipeline, args.source, None, args.args) elif args.command == "pipeline": if args.list_pipelines: return pipeline_command_wrapper("list", "-", args.pipelines_dir, args.verbosity) diff --git a/dlt/cli/run_command.py b/dlt/cli/run_command.py index 5f9ce243ad..0c5d448f70 100644 --- a/dlt/cli/run_command.py +++ b/dlt/cli/run_command.py @@ -3,8 +3,10 @@ from dlt.cli import echo as fmt from dlt.cli.utils import track_command +from dlt.common.cli.runner.errors import FriendlyExit, PreflightError, RunnerError from dlt.common.cli.runner.runner import PipelineRunner from dlt.common.cli.runner.types import RunnerParams +from dlt.pipeline.exceptions import PipelineStepFailed @track_command("run", False) @@ -12,20 +14,28 @@ def run_pipeline_command( module: str, pipeline: t.Optional[str] = None, source: t.Optional[str] = None, + config_path: t.Optional[str] = None, args: t.Optional[str] = None, ): params = RunnerParams( module, + current_dir=os.getcwd(), pipeline_name=pipeline, source_name=source, + config_path=config_path, args=args, ) try: - with PipelineRunner(params=params, current_dir=os.getcwd()) as runner: + with PipelineRunner(params=params) as runner: load_info = runner.run() fmt.echo("") fmt.echo(load_info) - except RuntimeError as ex: - fmt.echo(str(ex)) + except PipelineStepFailed: + raise + except RunnerError as ex: + fmt.echo(ex.message) return -1 + except (FriendlyExit, PreflightError): + fmt.info("Stopping...") + return 0 diff --git a/dlt/common/cli/runner/errors.py b/dlt/common/cli/runner/errors.py index 5748ec0423..1816a3c7aa 100644 --- a/dlt/common/cli/runner/errors.py +++ b/dlt/common/cli/runner/errors.py @@ -6,3 +6,7 @@ def __init__(self, message: str) -> None: class FriendlyExit(Exception): pass + + +class PreflightError(Exception): + pass diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index 8060d57b52..f353a4b9fa 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -1,14 +1,15 @@ +import os import typing as t - -import click +import textwrap as tw import dlt from dlt.cli import echo as fmt -from dlt.common.cli.runner.errors import FriendlyExit -from dlt.common.cli.runner.types import PipelineMembers +from dlt.common.cli.runner.errors import FriendlyExit, PreflightError, RunnerError +from dlt.common.cli.runner.types import PipelineMembers, RunnerParams from dlt.sources import DltResource, DltSource +dot_bold = ".dlt" select_message = """Please select your %s: %s """ @@ -31,13 +32,16 @@ def make_select_options( class Inquirer: - """This class handles user input to allow users to select pipeline and re/sources""" + """This class does pre flight checks required to run the pipeline. + Also handles user input to allow users to select pipeline and re/sources. + """ - def __init__(self, pipeline_members: PipelineMembers) -> None: - self.pipelines = pipeline_members["pipelines"] - self.sources = pipeline_members["sources"] + def __init__(self, params: RunnerParams, pipeline_members: PipelineMembers) -> None: + self.params = params + self.pipelines = pipeline_members.get("pipelines") + self.sources = pipeline_members.get("sources") - def ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: + def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: """Shows prompts to select pipeline and re/source Returns: @@ -72,3 +76,45 @@ def ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: label = "Source" fmt.echo(f"{label}: " + fmt.style(source_name, fg="blue", underline=True)) return pipeline, self.sources[source_name] + + def preflight_checks(self): + if self.params.current_dir != self.params.pipeline_workdir: + fmt.warning( + "Current working directory is different from the pipeline script" + + self.params.pipeline_workdir + ) + fmt.echo(f"Current workdir: {fmt.style(self.params.current_dir, fg='blue')}") + fmt.echo(f"Pipeline workdir: {fmt.style(self.params.pipeline_workdir, fg='blue')}") + + has_cwd_config = self.has_dlt_config(self.params.current_dir) + has_pipeline_config = self.has_dlt_config(self.params.pipeline_workdir) + if has_cwd_config and has_pipeline_config: + message = tw.dedent( + f""" + Found {dot_bold} in current directory and pipeline directory if you intended to + use {self.params.pipeline_workdir}/{dot_bold}, please change your current directory. + + Using {dot_bold} in current directory {self.params.current_dir}/{dot_bold}. + """, + ) + fmt.echo(fmt.warning_style(message)) + elif not has_cwd_config and has_pipeline_config: + fmt.error( + f"{dot_bold} is missing in current directory but exists in pipeline script's" + " directory" + ) + fmt.info( + f"Please change your current directory to {self.params.pipeline_workdir} and" + " try again" + ) + raise PreflightError() + + def check_if_runnable(self) -> None: + if not self.pipelines: + raise RunnerError(f"No pipelines found in {self.params.script_path}") + + if not self.sources: + raise RunnerError(f"Could not find any source or resource {self.params.script_path}") + + def has_dlt_config(self, path: str) -> bool: + return os.path.exists(os.path.join(path, ".dlt")) diff --git a/dlt/common/cli/runner/runner.py b/dlt/common/cli/runner/runner.py index 2f471aa660..d9f6a7d131 100644 --- a/dlt/common/cli/runner/runner.py +++ b/dlt/common/cli/runner/runner.py @@ -1,48 +1,27 @@ -import os import typing as t -from dlt.cli import echo as fmt from dlt.common.cli.runner.inquirer import Inquirer from dlt.common.pipeline import LoadInfo -from dlt.common.cli.runner.errors import FriendlyExit, RunnerError from dlt.common.cli.runner.pipeline_script import PipelineScript -from dlt.common.cli.runner.types import PipelineMembers, RunnerParams +from dlt.common.cli.runner.types import RunnerParams class PipelineRunner: - def __init__(self, params: RunnerParams, current_dir: str) -> None: + def __init__(self, params: RunnerParams) -> None: self.params = params - self.current_dir = current_dir self.script = PipelineScript(params) + self.params.pipeline_workdir = self.script.workdir + self.inquirer = Inquirer(self.params, self.script.pipeline_members) def __enter__(self) -> t.Self: - self.check_if_runnable(self.script.pipeline_members) + self.inquirer.preflight_checks() + self.inquirer.check_if_runnable() + self.pipeline, self.resource = self.inquirer.maybe_ask() return self def __exit__(self, exc_type, exc_value, traceback): pass def run(self) -> LoadInfo: - try: - inquirer = Inquirer(self.script.pipeline_members) - pipeline, resource = inquirer.ask() - except FriendlyExit: - fmt.info("Stopping...") - return - - load_info = pipeline.run(resource(), **self.script.run_arguments) + load_info = self.pipeline.run(self.resource(), **self.script.run_arguments) return load_info - - def check_if_runnable(self, pipeline_members: PipelineMembers) -> None: - if not pipeline_members["pipelines"]: - raise RunnerError(f"No pipelines found in {self.params.script_path}") - - if not pipeline_members["sources"]: - raise RunnerError(f"Could not find any source or resource {self.params.script_path}") - - def check_workdir(self): - if self.current_dir != self.script.workdir: - fmt.warning( - "Current working directory is different from the pipeline script" - f" {self.script.workdir}" - ) diff --git a/dlt/common/cli/runner/types.py b/dlt/common/cli/runner/types.py index 931e0ea3e6..f4410d649c 100644 --- a/dlt/common/cli/runner/types.py +++ b/dlt/common/cli/runner/types.py @@ -10,8 +10,11 @@ @dataclass class RunnerParams: script_path: str + current_dir: str + pipeline_workdir: t.Optional[str] = None pipeline_name: t.Optional[str] = None source_name: t.Optional[str] = None + config_path: t.Optional[str] = None args: t.List[str] = None From 584f4f28a8b82038f193af1b4526b3e0a0422f10 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 15 Mar 2024 18:35:31 +0100 Subject: [PATCH 12/59] Inspect module.__dict__ to extract objects instead of inspect module --- dlt/common/cli/runner/pipeline_script.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index 4db4a780dc..95c4c41ea6 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -1,6 +1,5 @@ import os import importlib -import inspect import tempfile import typing as t import sys @@ -89,16 +88,18 @@ def module_name(self) -> str: def pipeline_members(self) -> PipelineMembers: """Inspect the module and return pipelines with re/sources""" members: PipelineMembers = defaultdict(dict) - for name, value in inspect.getmembers(self.pipeline_module): - # skipe modules and private stuff - if inspect.ismodule(value) or name.startswith("_"): + for name, value in self.pipeline_module.__dict__.items(): + # skip modules and private stuff + if isinstance(value, ModuleType) or name.startswith("_"): continue + # take pipelines by their names if isinstance(value, dlt.Pipeline): members["pipelines"][value.pipeline_name] = value + # take source and resources by their variable name if isinstance(value, (DltResource, DltSource)): - members["sources"][value.name] = value + members["sources"][name] = value return members From 6ff3106850e716b449b75f13831db892c7a94828 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 15 Mar 2024 18:36:00 +0100 Subject: [PATCH 13/59] Adjust warning message --- dlt/common/cli/runner/inquirer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index f353a4b9fa..76b5d4ee36 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -80,8 +80,8 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: def preflight_checks(self): if self.params.current_dir != self.params.pipeline_workdir: fmt.warning( - "Current working directory is different from the pipeline script" - + self.params.pipeline_workdir + "Current working directory is different from the " + f"pipeline script {self.params.pipeline_workdir}\n" ) fmt.echo(f"Current workdir: {fmt.style(self.params.current_dir, fg='blue')}") fmt.echo(f"Pipeline workdir: {fmt.style(self.params.pipeline_workdir, fg='blue')}") From 4cc4f30129369a216c5508ffe87d5935291618f6 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 15 Mar 2024 18:36:17 +0100 Subject: [PATCH 14/59] Check if selected resource has been called --- dlt/cli/run_command.py | 1 + dlt/common/cli/runner/runner.py | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dlt/cli/run_command.py b/dlt/cli/run_command.py index 0c5d448f70..e07851e33a 100644 --- a/dlt/cli/run_command.py +++ b/dlt/cli/run_command.py @@ -30,6 +30,7 @@ def run_pipeline_command( with PipelineRunner(params=params) as runner: load_info = runner.run() fmt.echo("") + # FIXME: provide more useful information fmt.echo(load_info) except PipelineStepFailed: raise diff --git a/dlt/common/cli/runner/runner.py b/dlt/common/cli/runner/runner.py index d9f6a7d131..6d879a04eb 100644 --- a/dlt/common/cli/runner/runner.py +++ b/dlt/common/cli/runner/runner.py @@ -4,6 +4,7 @@ from dlt.common.pipeline import LoadInfo from dlt.common.cli.runner.pipeline_script import PipelineScript from dlt.common.cli.runner.types import RunnerParams +from dlt.extract.resource import DltResource class PipelineRunner: @@ -23,5 +24,9 @@ def __exit__(self, exc_type, exc_value, traceback): pass def run(self) -> LoadInfo: - load_info = self.pipeline.run(self.resource(), **self.script.run_arguments) + data_resource = self.resource + if isinstance(self.resource, DltResource) and not self.resource._args_bound: + data_resource = self.resource() + + load_info = self.pipeline.run(data_resource, **self.script.run_arguments) return load_info From 73c7db4c5ea7a5f577ec18410eb7827d3553d0b5 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 15 Mar 2024 18:45:43 +0100 Subject: [PATCH 15/59] Update docstrings --- dlt/common/cli/runner/pipeline_script.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index 95c4c41ea6..82601baef6 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -25,7 +25,10 @@ class PipelineScript: 1. Reads the source code, 2. Stubs all pipeline.run calls, - 2. Prepares module name + 3. Creates a temporary module in the location of pipeline file, + 4. Imports rewritten module, + 5. Prepares module name, + 6. Exposes ready to use module instance. """ def __init__(self, params: RunnerParams) -> None: From 9cf559fad4c603271f88b77e24c054e7ecc60a68 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 15 Mar 2024 18:53:39 +0100 Subject: [PATCH 16/59] Adjust docstrings and prompt messages --- dlt/common/cli/runner/inquirer.py | 8 ++++---- dlt/common/cli/runner/pipeline_script.py | 8 ++++---- dlt/common/cli/runner/source_patcher.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index 76b5d4ee36..da4199e009 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -33,7 +33,7 @@ def make_select_options( class Inquirer: """This class does pre flight checks required to run the pipeline. - Also handles user input to allow users to select pipeline and re/sources. + Also handles user input to allow users to select pipeline, resources and sources. """ def __init__(self, params: RunnerParams, pipeline_members: PipelineMembers) -> None: @@ -42,10 +42,10 @@ def __init__(self, params: RunnerParams, pipeline_members: PipelineMembers) -> N self.sources = pipeline_members.get("sources") def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: - """Shows prompts to select pipeline and re/source + """Shows prompts to select pipeline, resources and sources Returns: - (DltResource, DltSource): a pair of pipeline and re/source + (DltResource, DltSource): a pair of pipeline and resources and sources """ # save first pipeline and source pipeline_name, _ = next(iter(self.pipelines.items())) @@ -59,7 +59,7 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: pipeline_name = values[int(choice)] if len(self.sources) > 1: - message, options, values = make_select_options("re/source", self.sources) + message, options, values = make_select_options("resource or source", self.sources) choice = fmt.prompt(message, options, default="n") if choice == "n": raise FriendlyExit() diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index 82601baef6..0487629a08 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -33,10 +33,10 @@ class PipelineScript: def __init__(self, params: RunnerParams) -> None: self.params = params - """This means that user didn't specify pipeline name or re/source name + """This means that user didn't specify pipeline name or resource and source name - And we need to check if there is 1 pipeline and 1 re/source to run right - away if there are multiple re/sources then we need to provide a CLI prompt + And we need to check if there is 1 pipeline and 1 resource or source to run right + away if there are multiple resources and sources then we need to provide a CLI prompt """ self.workdir = os.path.dirname(os.path.abspath(params.script_path)) """Directory in which pipeline script lives""" @@ -89,7 +89,7 @@ def module_name(self) -> str: @property def pipeline_members(self) -> PipelineMembers: - """Inspect the module and return pipelines with re/sources""" + """Inspect the module and return pipelines with resources and sources""" members: PipelineMembers = defaultdict(dict) for name, value in self.pipeline_module.__dict__.items(): # skip modules and private stuff diff --git a/dlt/common/cli/runner/source_patcher.py b/dlt/common/cli/runner/source_patcher.py index bfae7a0c82..03465a3dcf 100644 --- a/dlt/common/cli/runner/source_patcher.py +++ b/dlt/common/cli/runner/source_patcher.py @@ -1,6 +1,6 @@ """This module provides a utility to patch source code of pipeline scripts to remove all `pipeline.run` calls to avoid any undesired side effects when -running the pipeline with given re/sources +running the pipeline with given resources and sources """ import typing as t From e456f267b5d9e421718573ccb488525e15676521 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 15 Mar 2024 18:55:38 +0100 Subject: [PATCH 17/59] Rename a variable --- dlt/common/cli/runner/inquirer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index da4199e009..06d4b161bc 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -9,7 +9,7 @@ from dlt.sources import DltResource, DltSource -dot_bold = ".dlt" +dot_dlt = ".dlt" select_message = """Please select your %s: %s """ @@ -91,16 +91,16 @@ def preflight_checks(self): if has_cwd_config and has_pipeline_config: message = tw.dedent( f""" - Found {dot_bold} in current directory and pipeline directory if you intended to - use {self.params.pipeline_workdir}/{dot_bold}, please change your current directory. + Found {dot_dlt} in current directory and pipeline directory if you intended to + use {self.params.pipeline_workdir}/{dot_dlt}, please change your current directory. - Using {dot_bold} in current directory {self.params.current_dir}/{dot_bold}. + Using {dot_dlt} in current directory {self.params.current_dir}/{dot_dlt}. """, ) fmt.echo(fmt.warning_style(message)) elif not has_cwd_config and has_pipeline_config: fmt.error( - f"{dot_bold} is missing in current directory but exists in pipeline script's" + f"{dot_dlt} is missing in current directory but exists in pipeline script's" " directory" ) fmt.info( From ff464e39aa22a5749180e278daf76313540c597c Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 18 Mar 2024 09:25:46 +0100 Subject: [PATCH 18/59] Fix mypy errors --- dlt/cli/run_command.py | 7 ++++--- dlt/common/cli/runner/inquirer.py | 4 ++-- dlt/common/cli/runner/pipeline_script.py | 4 ++-- dlt/common/cli/runner/runner.py | 14 ++++++++++---- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/dlt/cli/run_command.py b/dlt/cli/run_command.py index e07851e33a..7c3120e131 100644 --- a/dlt/cli/run_command.py +++ b/dlt/cli/run_command.py @@ -15,8 +15,8 @@ def run_pipeline_command( pipeline: t.Optional[str] = None, source: t.Optional[str] = None, config_path: t.Optional[str] = None, - args: t.Optional[str] = None, -): + args: t.Optional[t.List[str]] = None, +) -> int: params = RunnerParams( module, current_dir=os.getcwd(), @@ -39,4 +39,5 @@ def run_pipeline_command( return -1 except (FriendlyExit, PreflightError): fmt.info("Stopping...") - return 0 + + return 0 diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index 06d4b161bc..30a5506f33 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -17,7 +17,7 @@ def make_select_options( select_what: str, items: t.Dict[str, t.Any] -) -> t.Tuple[str, t.List[int], t.List[str]]: +) -> t.Tuple[str, t.List[str], t.List[str]]: options = [] option_values = [] choice_message = "" @@ -77,7 +77,7 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: fmt.echo(f"{label}: " + fmt.style(source_name, fg="blue", underline=True)) return pipeline, self.sources[source_name] - def preflight_checks(self): + def preflight_checks(self) -> None: if self.params.current_dir != self.params.pipeline_workdir: fmt.warning( "Current working directory is different from the " diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index 0487629a08..0b1a23e4f3 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -90,7 +90,7 @@ def module_name(self) -> str: @property def pipeline_members(self) -> PipelineMembers: """Inspect the module and return pipelines with resources and sources""" - members: PipelineMembers = defaultdict(dict) + members: PipelineMembers = defaultdict(dict) # type: ignore[assignment] for name, value in self.pipeline_module.__dict__.items(): # skip modules and private stuff if isinstance(value, ModuleType) or name.startswith("_"): @@ -108,7 +108,7 @@ def pipeline_members(self) -> PipelineMembers: @property def run_arguments(self) -> t.Dict[str, str]: - run_options = {} + run_options: t.Dict[str, str] = {} if not self.params.args: return run_options diff --git a/dlt/common/cli/runner/runner.py b/dlt/common/cli/runner/runner.py index 6d879a04eb..4375675ffc 100644 --- a/dlt/common/cli/runner/runner.py +++ b/dlt/common/cli/runner/runner.py @@ -1,5 +1,6 @@ import typing as t - +from typing_extensions import Self +from types import TracebackType from dlt.common.cli.runner.inquirer import Inquirer from dlt.common.pipeline import LoadInfo from dlt.common.cli.runner.pipeline_script import PipelineScript @@ -14,13 +15,18 @@ def __init__(self, params: RunnerParams) -> None: self.params.pipeline_workdir = self.script.workdir self.inquirer = Inquirer(self.params, self.script.pipeline_members) - def __enter__(self) -> t.Self: + def __enter__(self) -> Self: self.inquirer.preflight_checks() self.inquirer.check_if_runnable() self.pipeline, self.resource = self.inquirer.maybe_ask() return self - def __exit__(self, exc_type, exc_value, traceback): + def __exit__( + self, + exc_type: t.Optional[t.Type[BaseException]] = None, + exc_value: t.Optional[BaseException] = None, + traceback: t.Optional[TracebackType] = None, + ) -> None: pass def run(self) -> LoadInfo: @@ -28,5 +34,5 @@ def run(self) -> LoadInfo: if isinstance(self.resource, DltResource) and not self.resource._args_bound: data_resource = self.resource() - load_info = self.pipeline.run(data_resource, **self.script.run_arguments) + load_info = self.pipeline.run(data_resource, **self.script.run_arguments) # type: ignore[arg-type] return load_info From 66311e81f9d872c6ec306ed42a7098cb8bd254d7 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 18 Mar 2024 09:31:49 +0100 Subject: [PATCH 19/59] Fix linting issues --- dlt/common/cli/runner/pipeline_script.py | 2 +- dlt/common/cli/runner/runner.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index 0b1a23e4f3..2610d7692d 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -75,7 +75,7 @@ def pipeline_module(self) -> ModuleType: @property def script_contents(self) -> str: """Loads script contents""" - with open(self.params.script_path) as fp: + with open(self.params.script_path, encoding="utf-8") as fp: return fp.read() @property diff --git a/dlt/common/cli/runner/runner.py b/dlt/common/cli/runner/runner.py index 4375675ffc..5112c7a1f4 100644 --- a/dlt/common/cli/runner/runner.py +++ b/dlt/common/cli/runner/runner.py @@ -1,6 +1,8 @@ import typing as t + from typing_extensions import Self from types import TracebackType + from dlt.common.cli.runner.inquirer import Inquirer from dlt.common.pipeline import LoadInfo from dlt.common.cli.runner.pipeline_script import PipelineScript From 3c8c77ac8e1d906a1ca797730368d795f3ecc745 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 18 Mar 2024 10:53:12 +0100 Subject: [PATCH 20/59] Add first test for run command --- tests/cli/test_run_command.py | 31 +++++++++++++++++++++++++++++++ tests/conftest.py | 1 + tests/utils.py | 2 ++ 3 files changed, 34 insertions(+) create mode 100644 tests/cli/test_run_command.py diff --git a/tests/cli/test_run_command.py b/tests/cli/test_run_command.py new file mode 100644 index 0000000000..6bc871c18c --- /dev/null +++ b/tests/cli/test_run_command.py @@ -0,0 +1,31 @@ +import io +import os +import contextlib +import pytest +import logging +from subprocess import CalledProcessError + +import dlt +from dlt.common.runners.venv import Venv +from dlt.common.storages.file_storage import FileStorage + +from dlt.cli import echo, run_command + +from tests.utils import TESTS_ROOT + +DEPLOY_PIPELINE = TESTS_ROOT / "cli/cases/deploy_pipeline" +DUMMY_PIPELINE = DEPLOY_PIPELINE / "dummy_pipeline.py" + + +def test_run_command_requires_working_directory_same_as_pipeline_working_directory(tmp_path): + with io.StringIO() as buf, contextlib.redirect_stdout(buf): + run_command.run_pipeline_command( + str(DUMMY_PIPELINE), + "p", + "example_resource", + None, + ["write_disposition=merge", "loader_file_format=jsonl"], + ) + + output = buf.getvalue() + assert "WARNING: Current working directory is different from the pipeline script" in output diff --git a/tests/conftest.py b/tests/conftest.py index ab22d6ca7a..3600227f3a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,7 @@ import os import dataclasses import logging +from pathlib import Path from typing import List # patch which providers to enable diff --git a/tests/utils.py b/tests/utils.py index 00523486ea..aa5849cf72 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,5 +1,6 @@ import multiprocessing import os +from pathlib import Path import platform import sys from os import environ @@ -29,6 +30,7 @@ from dlt.common.pipeline import PipelineContext, SupportsPipeline TEST_STORAGE_ROOT = "_storage" +TESTS_ROOT = Path(__file__).parent.absolute() # destination constants From 9eb8b2a64dcfd140c90b3768a1cf3d5ab4978239 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Wed, 20 Mar 2024 13:32:23 +0100 Subject: [PATCH 21/59] Remove config path from params --- dlt/cli/run_command.py | 2 -- dlt/common/cli/runner/types.py | 1 - 2 files changed, 3 deletions(-) diff --git a/dlt/cli/run_command.py b/dlt/cli/run_command.py index 7c3120e131..ee3ac4e145 100644 --- a/dlt/cli/run_command.py +++ b/dlt/cli/run_command.py @@ -14,7 +14,6 @@ def run_pipeline_command( module: str, pipeline: t.Optional[str] = None, source: t.Optional[str] = None, - config_path: t.Optional[str] = None, args: t.Optional[t.List[str]] = None, ) -> int: params = RunnerParams( @@ -22,7 +21,6 @@ def run_pipeline_command( current_dir=os.getcwd(), pipeline_name=pipeline, source_name=source, - config_path=config_path, args=args, ) diff --git a/dlt/common/cli/runner/types.py b/dlt/common/cli/runner/types.py index f4410d649c..df008085a0 100644 --- a/dlt/common/cli/runner/types.py +++ b/dlt/common/cli/runner/types.py @@ -14,7 +14,6 @@ class RunnerParams: pipeline_workdir: t.Optional[str] = None pipeline_name: t.Optional[str] = None source_name: t.Optional[str] = None - config_path: t.Optional[str] = None args: t.List[str] = None From b3631f4f770c39d97a9d18af7c7f0ff2e19f8678 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Wed, 20 Mar 2024 14:10:07 +0100 Subject: [PATCH 22/59] Create separate pipeline case for cli runner --- dlt/cli/_dlt.py | 15 ++++++++++++++- tests/cli/cases/cli_runner/.dlt/config.toml | 4 ++++ tests/cli/cases/cli_runner/__init__.py | 0 tests/cli/cases/cli_runner/pipeline.py | 13 +++++++++++++ tests/cli/test_run_command.py | 9 ++++----- 5 files changed, 35 insertions(+), 6 deletions(-) create mode 100644 tests/cli/cases/cli_runner/.dlt/config.toml create mode 100644 tests/cli/cases/cli_runner/__init__.py create mode 100644 tests/cli/cases/cli_runner/pipeline.py diff --git a/dlt/cli/_dlt.py b/dlt/cli/_dlt.py index 7990b3b3ba..e1cb551e42 100644 --- a/dlt/cli/_dlt.py +++ b/dlt/cli/_dlt.py @@ -586,6 +586,14 @@ def main() -> int: help="Source or resource name", ) + # TODO: enable once pipeline.run with full refresh option is available + # run_cmd.add_argument( + # "--full-refresh", + # action="store_true", + # default=False, + # help="When used pipeline will run in full-refresh mode", + # ) + run_cmd.add_argument( "--args", "-a", @@ -610,7 +618,12 @@ def main() -> int: if args.command == "schema": return schema_command_wrapper(args.file, args.format, args.remove_defaults) elif args.command == "run": - return run_pipeline_command(args.module, args.pipeline, args.source, None, args.args) + return run_pipeline_command( + args.module, + args.pipeline, + args.source, + args.args, + ) elif args.command == "pipeline": if args.list_pipelines: return pipeline_command_wrapper("list", "-", args.pipelines_dir, args.verbosity) diff --git a/tests/cli/cases/cli_runner/.dlt/config.toml b/tests/cli/cases/cli_runner/.dlt/config.toml new file mode 100644 index 0000000000..54a4d376fa --- /dev/null +++ b/tests/cli/cases/cli_runner/.dlt/config.toml @@ -0,0 +1,4 @@ +restore_from_destination = false # block restoring from destination + +[runtime] +log_level = "WARNING" # the system log level of dlt diff --git a/tests/cli/cases/cli_runner/__init__.py b/tests/cli/cases/cli_runner/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/cli/cases/cli_runner/pipeline.py b/tests/cli/cases/cli_runner/pipeline.py new file mode 100644 index 0000000000..f9c0a2be37 --- /dev/null +++ b/tests/cli/cases/cli_runner/pipeline.py @@ -0,0 +1,13 @@ +import dlt + + +@dlt.resource +def squares_resource(): + for idx in range(10): + yield {"id": idx, "square": idx * idx} + + +if __name__ == "__main__": + p = dlt.pipeline(pipeline_name="dummy_pipeline", destination="dummy") + load_info = p.run(squares_resource) + print(load_info) diff --git a/tests/cli/test_run_command.py b/tests/cli/test_run_command.py index 6bc871c18c..665a87e8b7 100644 --- a/tests/cli/test_run_command.py +++ b/tests/cli/test_run_command.py @@ -13,17 +13,16 @@ from tests.utils import TESTS_ROOT -DEPLOY_PIPELINE = TESTS_ROOT / "cli/cases/deploy_pipeline" -DUMMY_PIPELINE = DEPLOY_PIPELINE / "dummy_pipeline.py" +RUNNER_PIPELINES = TESTS_ROOT / "cli/cases/cli_runner" +DUMMY_PIPELINE = RUNNER_PIPELINES / "pipeline.py" def test_run_command_requires_working_directory_same_as_pipeline_working_directory(tmp_path): with io.StringIO() as buf, contextlib.redirect_stdout(buf): run_command.run_pipeline_command( str(DUMMY_PIPELINE), - "p", - "example_resource", - None, + "dummy_pipeline", + "squares_resource", ["write_disposition=merge", "loader_file_format=jsonl"], ) From 339fd634fc8f245cff5d2005712fda07924ec982 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Thu, 21 Mar 2024 15:51:05 +0100 Subject: [PATCH 23/59] Patch sys.argv instead of copying --- .../test_streamlit_show_resources.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/helpers/streamlit_tests/test_streamlit_show_resources.py b/tests/helpers/streamlit_tests/test_streamlit_show_resources.py index dd807260fe..02d86e039e 100644 --- a/tests/helpers/streamlit_tests/test_streamlit_show_resources.py +++ b/tests/helpers/streamlit_tests/test_streamlit_show_resources.py @@ -8,6 +8,7 @@ import os import sys from pathlib import Path +from unittest import mock import pytest @@ -152,13 +153,12 @@ def test_render_with_pipeline_with_different_pipeline_dirs(): def dummy_render(pipeline: dlt.Pipeline) -> None: pass - old_args = sys.argv[:] - with pytest.raises(CannotRestorePipelineException): - sys.argv = [*base_args, "/run/dlt"] + with pytest.raises(CannotRestorePipelineException), mock.patch.object( + sys, "argv", [*base_args, "/run/dlt"] + ): render_with_pipeline(dummy_render) - with pytest.raises(CannotRestorePipelineException): - sys.argv = [*base_args, "/tmp/dlt"] + with pytest.raises(CannotRestorePipelineException), mock.patch.object( + sys, "argv", [*base_args, "/tmp/dlt"] + ): render_with_pipeline(dummy_render) - - sys.argv = old_args From a35279b43e741077d9fecbbccc3930375183cb69 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Thu, 21 Mar 2024 16:51:14 +0100 Subject: [PATCH 24/59] Name variables properly and add docstrings to run_pipeline_command --- dlt/cli/_dlt.py | 4 ++-- dlt/cli/run_command.py | 23 +++++++++++++++++------ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/dlt/cli/_dlt.py b/dlt/cli/_dlt.py index e1cb551e42..b8e0801095 100644 --- a/dlt/cli/_dlt.py +++ b/dlt/cli/_dlt.py @@ -573,7 +573,7 @@ def main() -> int: ) run_cmd.add_argument( - "pipeline", + "pipeline_name", type=str, nargs="?", help="Pipeline name", @@ -620,7 +620,7 @@ def main() -> int: elif args.command == "run": return run_pipeline_command( args.module, - args.pipeline, + args.pipeline_name, args.source, args.args, ) diff --git a/dlt/cli/run_command.py b/dlt/cli/run_command.py index ee3ac4e145..b2cca8fb03 100644 --- a/dlt/cli/run_command.py +++ b/dlt/cli/run_command.py @@ -1,5 +1,5 @@ import os -import typing as t +from typing import List, Optional from dlt.cli import echo as fmt from dlt.cli.utils import track_command @@ -12,15 +12,26 @@ @track_command("run", False) def run_pipeline_command( module: str, - pipeline: t.Optional[str] = None, - source: t.Optional[str] = None, - args: t.Optional[t.List[str]] = None, + pipeline_name: Optional[str] = None, + source_name: Optional[str] = None, + args: Optional[List[str]] = None, ) -> int: + """Run the given module if any pipeline and sources or resources exist in it + + Args: + module (str): path to python module or file with dlt artifacts + pipeline_name (Optional[str]): Pipeline name + source_name (Optional[str]): Source or resource name + args (Optiona[List[str]]): List of arguments to `pipeline.run` method + + Returns: + (int): exit code + """ params = RunnerParams( module, current_dir=os.getcwd(), - pipeline_name=pipeline, - source_name=source, + pipeline_name=pipeline_name, + source_name=source_name, args=args, ) From 7cbc087e227ab755cf2cf340499f7bd973e006a6 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Thu, 21 Mar 2024 18:06:11 +0100 Subject: [PATCH 25/59] Extract reusable pieces --- dlt/common/cli/runner/inquirer.py | 37 ++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index 30a5506f33..915865fad5 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -49,22 +49,12 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: """ # save first pipeline and source pipeline_name, _ = next(iter(self.pipelines.items())) - source_name, _ = next(iter(self.pipelines.items())) + source_name, _ = next(iter(self.sources.items())) if len(self.pipelines) > 1: - message, options, values = make_select_options("pipeline", self.pipelines) - choice = fmt.prompt(message, options, default="n") - if choice == "n": - raise FriendlyExit() - else: - pipeline_name = values[int(choice)] + pipeline_name = self.ask_for_pipeline() if len(self.sources) > 1: - message, options, values = make_select_options("resource or source", self.sources) - choice = fmt.prompt(message, options, default="n") - if choice == "n": - raise FriendlyExit() - else: - source_name = values[int(choice)] + source_name = self.ask_for_source() pipeline = self.pipelines[pipeline_name] resource = self.sources[source_name] @@ -77,6 +67,27 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: fmt.echo(f"{label}: " + fmt.style(source_name, fg="blue", underline=True)) return pipeline, self.sources[source_name] + def ask_for_pipeline(self) -> str: + if len(self.pipelines) > 1: + message, options, values = make_select_options("pipeline", self.pipelines) + + choice = fmt.prompt(message, options, default="n") + if choice == "n": + raise FriendlyExit() + + pipeline_name = values[int(choice)] + return pipeline_name + + def ask_for_source(self) -> str: + message, options, values = make_select_options("resource or source", self.sources) + + choice = fmt.prompt(message, options, default="n") + if choice == "n": + raise FriendlyExit() + + source_name = values[int(choice)] + return source_name + def preflight_checks(self) -> None: if self.params.current_dir != self.params.pipeline_workdir: fmt.warning( From 1434c279b63170ecb662546f2a271c9afaf47089 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 10:57:31 +0100 Subject: [PATCH 26/59] Refactor preflight checks --- dlt/common/cli/runner/inquirer.py | 33 ++++++++++++++++++++---- dlt/common/cli/runner/pipeline_script.py | 2 ++ dlt/common/cli/runner/runner.py | 2 -- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index 915865fad5..e1f86112ee 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -40,6 +40,7 @@ def __init__(self, params: RunnerParams, pipeline_members: PipelineMembers) -> N self.params = params self.pipelines = pipeline_members.get("pipelines") self.sources = pipeline_members.get("sources") + self.preflight_checks() def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: """Shows prompts to select pipeline, resources and sources @@ -50,10 +51,14 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: # save first pipeline and source pipeline_name, _ = next(iter(self.pipelines.items())) source_name, _ = next(iter(self.sources.items())) - if len(self.pipelines) > 1: + if self.params.pipeline_name: + pipeline_name = self.params.pipeline_name + elif len(self.pipelines) > 1: pipeline_name = self.ask_for_pipeline() - if len(self.sources) > 1: + if self.params.pipeline_name: + source_name = self.params.pipeline_name + elif len(self.sources) > 1: source_name = self.ask_for_source() pipeline = self.pipelines[pipeline_name] @@ -64,8 +69,9 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: label = "Resource" else: label = "Source" + fmt.echo(f"{label}: " + fmt.style(source_name, fg="blue", underline=True)) - return pipeline, self.sources[source_name] + return pipeline, resource def ask_for_pipeline(self) -> str: if len(self.pipelines) > 1: @@ -89,6 +95,19 @@ def ask_for_source(self) -> str: return source_name def preflight_checks(self) -> None: + if pipeline_name := self.params.pipeline_name: + if pipeline_name not in self.pipelines: + fmt.warning(f"Pipeline {pipeline_name} has not been found in pipeline script") + raise PreflightError() + + if source_name := self.params.source_name: + if source_name not in self.sources: + fmt.warning( + f"Source or resouce with name: {source_name} has not been found in pipeline" + " script" + ) + raise PreflightError() + if self.params.current_dir != self.params.pipeline_workdir: fmt.warning( "Current working directory is different from the " @@ -122,10 +141,14 @@ def preflight_checks(self) -> None: def check_if_runnable(self) -> None: if not self.pipelines: - raise RunnerError(f"No pipelines found in {self.params.script_path}") + raise RunnerError( + f"No pipeline instances found in pipeline script {self.params.script_path}" + ) if not self.sources: - raise RunnerError(f"Could not find any source or resource {self.params.script_path}") + raise RunnerError( + f"No source or resources found in pipeline script {self.params.script_path}" + ) def has_dlt_config(self, path: str) -> bool: return os.path.exists(os.path.join(path, ".dlt")) diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index 2610d7692d..89bed0883c 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -101,8 +101,10 @@ def pipeline_members(self) -> PipelineMembers: members["pipelines"][value.pipeline_name] = value # take source and resources by their variable name + # and also we would like to map by resource and source name if isinstance(value, (DltResource, DltSource)): members["sources"][name] = value + members["sources"][value.name] = value return members diff --git a/dlt/common/cli/runner/runner.py b/dlt/common/cli/runner/runner.py index 5112c7a1f4..3dcd655830 100644 --- a/dlt/common/cli/runner/runner.py +++ b/dlt/common/cli/runner/runner.py @@ -18,8 +18,6 @@ def __init__(self, params: RunnerParams) -> None: self.inquirer = Inquirer(self.params, self.script.pipeline_members) def __enter__(self) -> Self: - self.inquirer.preflight_checks() - self.inquirer.check_if_runnable() self.pipeline, self.resource = self.inquirer.maybe_ask() return self From bb608ef57e125eea19c3c1722ca807f89d67b9d0 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 11:38:52 +0100 Subject: [PATCH 27/59] Add dlt specific resource, source and pipeline alias by variable names --- dlt/common/cli/runner/inquirer.py | 64 ++++++++++++++---------- dlt/common/cli/runner/pipeline_script.py | 15 ++++-- dlt/common/cli/runner/types.py | 1 + 3 files changed, 48 insertions(+), 32 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index e1f86112ee..656debe422 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -40,7 +40,7 @@ def __init__(self, params: RunnerParams, pipeline_members: PipelineMembers) -> N self.params = params self.pipelines = pipeline_members.get("pipelines") self.sources = pipeline_members.get("sources") - self.preflight_checks() + self.aliases = pipeline_members.get("aliases") def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: """Shows prompts to select pipeline, resources and sources @@ -48,21 +48,19 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: Returns: (DltResource, DltSource): a pair of pipeline and resources and sources """ - # save first pipeline and source - pipeline_name, _ = next(iter(self.pipelines.items())) - source_name, _ = next(iter(self.sources.items())) - if self.params.pipeline_name: - pipeline_name = self.params.pipeline_name - elif len(self.pipelines) > 1: - pipeline_name = self.ask_for_pipeline() + self.preflight_checks() + pipeline_name = self.get_pipeline_name() + if pipeline_name in self.pipelines: + pipeline = self.pipelines[pipeline_name] + else: + pipeline = self.aliases[pipeline_name] - if self.params.pipeline_name: - source_name = self.params.pipeline_name - elif len(self.sources) > 1: - source_name = self.ask_for_source() + source_name = self.get_source_name() + if source_name in self.sources: + resource = self.sources[source_name] + else: + resource = self.aliases[source_name] - pipeline = self.pipelines[pipeline_name] - resource = self.sources[source_name] fmt.echo("Pipeline: " + fmt.style(pipeline_name, fg="blue", underline=True)) if isinstance(resource, DltResource): @@ -73,35 +71,47 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: fmt.echo(f"{label}: " + fmt.style(source_name, fg="blue", underline=True)) return pipeline, resource - def ask_for_pipeline(self) -> str: + def get_pipeline_name(self) -> str: + if self.params.pipeline_name: + return self.params.pipeline_name + if len(self.pipelines) > 1: message, options, values = make_select_options("pipeline", self.pipelines) - - choice = fmt.prompt(message, options, default="n") - if choice == "n": - raise FriendlyExit() - + choice = self.ask(message, options, default="n") pipeline_name = values[int(choice)] return pipeline_name - def ask_for_source(self) -> str: - message, options, values = make_select_options("resource or source", self.sources) + pipeline_name, _ = next(iter(self.pipelines.items())) + return pipeline_name + + def get_source_name(self) -> str: + if self.params.source_name: + return self.params.source_name + + if len(self.sources) > 1: + message, options, values = make_select_options("resource or source", self.sources) + choice = self.ask(message, options, default="n") + source_name = values[int(choice)] + return source_name - choice = fmt.prompt(message, options, default="n") + source_name, _ = next(iter(self.sources.items())) + return source_name + + def ask(self, message: str, options: t.List[str], default: t.Optional[str] = None) -> str: + choice = fmt.prompt(message, options, default=default) if choice == "n": raise FriendlyExit() - source_name = values[int(choice)] - return source_name + return choice def preflight_checks(self) -> None: if pipeline_name := self.params.pipeline_name: - if pipeline_name not in self.pipelines: + if pipeline_name not in self.pipelines and pipeline_name not in self.aliases: fmt.warning(f"Pipeline {pipeline_name} has not been found in pipeline script") raise PreflightError() if source_name := self.params.source_name: - if source_name not in self.sources: + if source_name not in self.sources and source_name not in self.aliases: fmt.warning( f"Source or resouce with name: {source_name} has not been found in pipeline" " script" diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index 89bed0883c..57e47fe217 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -89,21 +89,26 @@ def module_name(self) -> str: @property def pipeline_members(self) -> PipelineMembers: - """Inspect the module and return pipelines with resources and sources""" + """Inspect the module and return pipelines with resources and sources + We populate sources, pipelines with relevant instances by their name, + also put every dlt specific instance under "by_alias" collection. + + It is done because users might pass pipeline, source or resource name + by their variable names, so this is an extra step to make sure that we + locate corrent pipeline, source or resource instance before showing an error. + """ members: PipelineMembers = defaultdict(dict) # type: ignore[assignment] for name, value in self.pipeline_module.__dict__.items(): # skip modules and private stuff if isinstance(value, ModuleType) or name.startswith("_"): continue - # take pipelines by their names if isinstance(value, dlt.Pipeline): + members["by_alias"][name] = value members["pipelines"][value.pipeline_name] = value - # take source and resources by their variable name - # and also we would like to map by resource and source name if isinstance(value, (DltResource, DltSource)): - members["sources"][name] = value + members["by_alias"][name] = value members["sources"][value.name] = value return members diff --git a/dlt/common/cli/runner/types.py b/dlt/common/cli/runner/types.py index df008085a0..ee27f98810 100644 --- a/dlt/common/cli/runner/types.py +++ b/dlt/common/cli/runner/types.py @@ -20,3 +20,4 @@ class RunnerParams: class PipelineMembers(t.TypedDict): pipelines: t.Dict[str, dlt.Pipeline] sources: t.Dict[str, t.Union[DltResource, DltSource]] + aliases: t.Dict[str, t.Union[DltResource, DltSource, dlt.Pipeline]] From 9a1845aac4e09494590d45f2b717e477d976865b Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 11:51:01 +0100 Subject: [PATCH 28/59] Show pipeline name and source or resource name in case they were passed by variable name --- dlt/common/cli/runner/inquirer.py | 13 +++++++++++-- dlt/common/cli/runner/pipeline_script.py | 4 ++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index 656debe422..6daed62f20 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -50,25 +50,34 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: """ self.preflight_checks() pipeline_name = self.get_pipeline_name() + pipeline_alias_message = "" if pipeline_name in self.pipelines: pipeline = self.pipelines[pipeline_name] else: pipeline = self.aliases[pipeline_name] + pipeline_alias_message = f" ({pipeline.pipeline_name})" source_name = self.get_source_name() + source_alias_message = "" if source_name in self.sources: resource = self.sources[source_name] else: resource = self.aliases[source_name] + source_alias_message = f" ({resource.name})" - fmt.echo("Pipeline: " + fmt.style(pipeline_name, fg="blue", underline=True)) + fmt.echo( + "Pipeline: " + + fmt.style(pipeline_name + pipeline_alias_message, fg="blue", underline=True) + ) if isinstance(resource, DltResource): label = "Resource" else: label = "Source" - fmt.echo(f"{label}: " + fmt.style(source_name, fg="blue", underline=True)) + fmt.echo( + f"{label}: " + fmt.style(source_name + source_alias_message, fg="blue", underline=True) + ) return pipeline, resource def get_pipeline_name(self) -> str: diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index 57e47fe217..61d096904e 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -104,11 +104,11 @@ def pipeline_members(self) -> PipelineMembers: continue if isinstance(value, dlt.Pipeline): - members["by_alias"][name] = value + members["aliases"][name] = value members["pipelines"][value.pipeline_name] = value if isinstance(value, (DltResource, DltSource)): - members["by_alias"][name] = value + members["aliases"][name] = value members["sources"][value.name] = value return members From 11529b0fd5abc1f2b0c454b90a9dca2540fd4899 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 12:04:14 +0100 Subject: [PATCH 29/59] Fix mypy issues --- dlt/common/cli/runner/inquirer.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index 6daed62f20..69e6ed0068 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -50,24 +50,23 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: """ self.preflight_checks() pipeline_name = self.get_pipeline_name() - pipeline_alias_message = "" + real_pipeline_name = "" if pipeline_name in self.pipelines: pipeline = self.pipelines[pipeline_name] else: - pipeline = self.aliases[pipeline_name] - pipeline_alias_message = f" ({pipeline.pipeline_name})" + pipeline = t.cast(dlt.Pipeline, self.aliases[pipeline_name]) + real_pipeline_name = f" ({pipeline.pipeline_name})" source_name = self.get_source_name() - source_alias_message = "" + real_source_name = "" if source_name in self.sources: resource = self.sources[source_name] else: - resource = self.aliases[source_name] - source_alias_message = f" ({resource.name})" + resource = self.aliases[source_name] # type: ignore + real_source_name = f" ({resource.name})" fmt.echo( - "Pipeline: " - + fmt.style(pipeline_name + pipeline_alias_message, fg="blue", underline=True) + "Pipeline: " + fmt.style(pipeline_name + real_pipeline_name, fg="blue", underline=True) ) if isinstance(resource, DltResource): @@ -76,7 +75,7 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: label = "Source" fmt.echo( - f"{label}: " + fmt.style(source_name + source_alias_message, fg="blue", underline=True) + f"{label}: " + fmt.style(source_name + real_source_name, fg="blue", underline=True) ) return pipeline, resource @@ -87,7 +86,7 @@ def get_pipeline_name(self) -> str: if len(self.pipelines) > 1: message, options, values = make_select_options("pipeline", self.pipelines) choice = self.ask(message, options, default="n") - pipeline_name = values[int(choice)] + pipeline_name = values[choice] return pipeline_name pipeline_name, _ = next(iter(self.pipelines.items())) @@ -100,18 +99,18 @@ def get_source_name(self) -> str: if len(self.sources) > 1: message, options, values = make_select_options("resource or source", self.sources) choice = self.ask(message, options, default="n") - source_name = values[int(choice)] + source_name = values[choice] return source_name source_name, _ = next(iter(self.sources.items())) return source_name - def ask(self, message: str, options: t.List[str], default: t.Optional[str] = None) -> str: + def ask(self, message: str, options: t.List[str], default: t.Optional[str] = None) -> int: choice = fmt.prompt(message, options, default=default) if choice == "n": raise FriendlyExit() - return choice + return int(choice) def preflight_checks(self) -> None: if pipeline_name := self.params.pipeline_name: From 38aef786a068356aed490de4c189fb277c2a6d00 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 12:10:00 +0100 Subject: [PATCH 30/59] Adjust error messages --- dlt/common/cli/runner/inquirer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index 69e6ed0068..895c8d3cbd 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -115,15 +115,17 @@ def ask(self, message: str, options: t.List[str], default: t.Optional[str] = Non def preflight_checks(self) -> None: if pipeline_name := self.params.pipeline_name: if pipeline_name not in self.pipelines and pipeline_name not in self.aliases: - fmt.warning(f"Pipeline {pipeline_name} has not been found in pipeline script") + fmt.error(f"Pipeline {pipeline_name} has not been found in pipeline script") + fmt.error("You can choose one of: " + ", ".join(self.pipelines.keys())) raise PreflightError() if source_name := self.params.source_name: if source_name not in self.sources and source_name not in self.aliases: - fmt.warning( + fmt.error( f"Source or resouce with name: {source_name} has not been found in pipeline" " script" ) + fmt.error("You can choose one of: " + ", ".join(self.sources.keys())) raise PreflightError() if self.params.current_dir != self.params.pipeline_workdir: From 666879da231fa26e26ced9caaf0adc4f929ed88d Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 13:22:11 +0100 Subject: [PATCH 31/59] Add fmt.info_style helper --- dlt/cli/echo.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dlt/cli/echo.py b/dlt/cli/echo.py index 1961d7b561..4b603d679c 100644 --- a/dlt/cli/echo.py +++ b/dlt/cli/echo.py @@ -29,6 +29,10 @@ def bold(msg: str) -> str: return click.style(msg, bold=True, reset=True) +def info_style(msg: str) -> str: + return click.style(msg, fg="red", reset=True) + + def warning_style(msg: str) -> str: return click.style(msg, fg="yellow", reset=True) From 3ef0ed9d99109afdefb99be05c06552483ff2b77 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 13:22:32 +0100 Subject: [PATCH 32/59] Allow only bound resources and sources and simplify code --- dlt/common/cli/runner/inquirer.py | 31 ++++++++---------------- dlt/common/cli/runner/pipeline_script.py | 22 +++++++++-------- dlt/common/cli/runner/runner.py | 7 +----- dlt/common/cli/runner/types.py | 1 - 4 files changed, 23 insertions(+), 38 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index 895c8d3cbd..4932b22825 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -40,7 +40,6 @@ def __init__(self, params: RunnerParams, pipeline_members: PipelineMembers) -> N self.params = params self.pipelines = pipeline_members.get("pipelines") self.sources = pipeline_members.get("sources") - self.aliases = pipeline_members.get("aliases") def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: """Shows prompts to select pipeline, resources and sources @@ -50,30 +49,22 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: """ self.preflight_checks() pipeline_name = self.get_pipeline_name() - real_pipeline_name = "" - if pipeline_name in self.pipelines: - pipeline = self.pipelines[pipeline_name] - else: - pipeline = t.cast(dlt.Pipeline, self.aliases[pipeline_name]) - real_pipeline_name = f" ({pipeline.pipeline_name})" + pipeline = self.pipelines[pipeline_name] + real_pipeline_name = f" ({pipeline.pipeline_name})" source_name = self.get_source_name() - real_source_name = "" - if source_name in self.sources: - resource = self.sources[source_name] + resource = self.sources[source_name] + if isinstance(resource, DltResource): + label = "Resource" else: - resource = self.aliases[source_name] # type: ignore - real_source_name = f" ({resource.name})" + label = "Source" + + real_source_name = f" ({label}: {resource.name})" fmt.echo( "Pipeline: " + fmt.style(pipeline_name + real_pipeline_name, fg="blue", underline=True) ) - if isinstance(resource, DltResource): - label = "Resource" - else: - label = "Source" - fmt.echo( f"{label}: " + fmt.style(source_name + real_source_name, fg="blue", underline=True) ) @@ -141,10 +132,8 @@ def preflight_checks(self) -> None: if has_cwd_config and has_pipeline_config: message = tw.dedent( f""" - Found {dot_dlt} in current directory and pipeline directory if you intended to - use {self.params.pipeline_workdir}/{dot_dlt}, please change your current directory. - - Using {dot_dlt} in current directory {self.params.current_dir}/{dot_dlt}. + Using {dot_dlt} in current directory {self.params.current_dir}/{dot_dlt}, if you intended to use + {self.params.pipeline_workdir}/{dot_dlt}, please change your current directory. """, ) fmt.echo(fmt.warning_style(message)) diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index 61d096904e..1fd72db605 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -9,6 +9,7 @@ import dlt +from dlt.cli import echo as fmt from dlt.cli.utils import parse_init_script from dlt.common.cli.runner.source_patcher import SourcePatcher from dlt.common.cli.runner.types import PipelineMembers, RunnerParams @@ -90,12 +91,9 @@ def module_name(self) -> str: @property def pipeline_members(self) -> PipelineMembers: """Inspect the module and return pipelines with resources and sources - We populate sources, pipelines with relevant instances by their name, - also put every dlt specific instance under "by_alias" collection. + We populate sources, pipelines with relevant instances by their variable name. - It is done because users might pass pipeline, source or resource name - by their variable names, so this is an extra step to make sure that we - locate corrent pipeline, source or resource instance before showing an error. + Resources and sources must be initialized and bound beforehand. """ members: PipelineMembers = defaultdict(dict) # type: ignore[assignment] for name, value in self.pipeline_module.__dict__.items(): @@ -104,12 +102,16 @@ def pipeline_members(self) -> PipelineMembers: continue if isinstance(value, dlt.Pipeline): - members["aliases"][name] = value - members["pipelines"][value.pipeline_name] = value + members["pipelines"][name] = value - if isinstance(value, (DltResource, DltSource)): - members["aliases"][name] = value - members["sources"][value.name] = value + if isinstance(value, DltSource): + members["sources"][name] = value + + if isinstance(value, DltResource): + if value._args_bound: + members["sources"][name] = value + else: + fmt.echo(fmt.info_style(f"Resource: {value.name} is not bound, skipping.")) return members diff --git a/dlt/common/cli/runner/runner.py b/dlt/common/cli/runner/runner.py index 3dcd655830..d6a03253dc 100644 --- a/dlt/common/cli/runner/runner.py +++ b/dlt/common/cli/runner/runner.py @@ -7,7 +7,6 @@ from dlt.common.pipeline import LoadInfo from dlt.common.cli.runner.pipeline_script import PipelineScript from dlt.common.cli.runner.types import RunnerParams -from dlt.extract.resource import DltResource class PipelineRunner: @@ -30,9 +29,5 @@ def __exit__( pass def run(self) -> LoadInfo: - data_resource = self.resource - if isinstance(self.resource, DltResource) and not self.resource._args_bound: - data_resource = self.resource() - - load_info = self.pipeline.run(data_resource, **self.script.run_arguments) # type: ignore[arg-type] + load_info = self.pipeline.run(self.resource, **self.script.run_arguments) # type: ignore[arg-type] return load_info diff --git a/dlt/common/cli/runner/types.py b/dlt/common/cli/runner/types.py index ee27f98810..df008085a0 100644 --- a/dlt/common/cli/runner/types.py +++ b/dlt/common/cli/runner/types.py @@ -20,4 +20,3 @@ class RunnerParams: class PipelineMembers(t.TypedDict): pipelines: t.Dict[str, dlt.Pipeline] sources: t.Dict[str, t.Union[DltResource, DltSource]] - aliases: t.Dict[str, t.Union[DltResource, DltSource, dlt.Pipeline]] From 68873973478848d5bbc20879031c7e9b02d35a4d Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 13:23:59 +0100 Subject: [PATCH 33/59] Format code --- dlt/cli/echo.py | 2 +- dlt/common/cli/runner/inquirer.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dlt/cli/echo.py b/dlt/cli/echo.py index 4b603d679c..1657ab501d 100644 --- a/dlt/cli/echo.py +++ b/dlt/cli/echo.py @@ -30,7 +30,7 @@ def bold(msg: str) -> str: def info_style(msg: str) -> str: - return click.style(msg, fg="red", reset=True) + return click.style(msg, fg="blue", reset=True) def warning_style(msg: str) -> str: diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index 4932b22825..6dd624df69 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -120,9 +120,11 @@ def preflight_checks(self) -> None: raise PreflightError() if self.params.current_dir != self.params.pipeline_workdir: - fmt.warning( - "Current working directory is different from the " - f"pipeline script {self.params.pipeline_workdir}\n" + fmt.echo( + fmt.warning_style( + "Current working directory is different from the " + f"pipeline script {self.params.pipeline_workdir}\n" + ) ) fmt.echo(f"Current workdir: {fmt.style(self.params.current_dir, fg='blue')}") fmt.echo(f"Pipeline workdir: {fmt.style(self.params.pipeline_workdir, fg='blue')}") From 381a2653a7b5dc28ca3611f7729307dfe34bf3a8 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 13:28:15 +0100 Subject: [PATCH 34/59] Adjust user messages --- dlt/common/cli/runner/inquirer.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index 6dd624df69..f19f881f80 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -50,7 +50,7 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: self.preflight_checks() pipeline_name = self.get_pipeline_name() pipeline = self.pipelines[pipeline_name] - real_pipeline_name = f" ({pipeline.pipeline_name})" + real_pipeline_name = f" (Pipeline: {pipeline.pipeline_name})" source_name = self.get_source_name() resource = self.sources[source_name] @@ -62,7 +62,7 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: real_source_name = f" ({label}: {resource.name})" fmt.echo( - "Pipeline: " + fmt.style(pipeline_name + real_pipeline_name, fg="blue", underline=True) + "\nPipeline: " + fmt.style(pipeline_name + real_pipeline_name, fg="blue", underline=True) ) fmt.echo( @@ -120,23 +120,22 @@ def preflight_checks(self) -> None: raise PreflightError() if self.params.current_dir != self.params.pipeline_workdir: + fmt.echo(f"\nCurrent workdir: {fmt.style(self.params.current_dir, fg='blue')}") + fmt.echo(f"Pipeline workdir: {fmt.style(self.params.pipeline_workdir, fg='blue')}\n") + fmt.echo( fmt.warning_style( "Current working directory is different from the " - f"pipeline script {self.params.pipeline_workdir}\n" + f"pipeline script {self.params.pipeline_workdir}" ) ) - fmt.echo(f"Current workdir: {fmt.style(self.params.current_dir, fg='blue')}") - fmt.echo(f"Pipeline workdir: {fmt.style(self.params.pipeline_workdir, fg='blue')}") has_cwd_config = self.has_dlt_config(self.params.current_dir) has_pipeline_config = self.has_dlt_config(self.params.pipeline_workdir) if has_cwd_config and has_pipeline_config: message = tw.dedent( - f""" - Using {dot_dlt} in current directory {self.params.current_dir}/{dot_dlt}, if you intended to use - {self.params.pipeline_workdir}/{dot_dlt}, please change your current directory. - """, + f"""Using {dot_dlt} in current directory, if you intended to use """ + f"""{self.params.pipeline_workdir}/{dot_dlt}, please change your current directory.""", ) fmt.echo(fmt.warning_style(message)) elif not has_cwd_config and has_pipeline_config: From 06f3cf01afd5def9c322e36623cee7e95335982d Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 13:28:22 +0100 Subject: [PATCH 35/59] Format code --- dlt/common/cli/runner/inquirer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index f19f881f80..37b12a7f3c 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -62,7 +62,8 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: real_source_name = f" ({label}: {resource.name})" fmt.echo( - "\nPipeline: " + fmt.style(pipeline_name + real_pipeline_name, fg="blue", underline=True) + "\nPipeline: " + + fmt.style(pipeline_name + real_pipeline_name, fg="blue", underline=True) ) fmt.echo( From 29b5ec9281b3bb2cc88ce4d2320872b77e0e41e2 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 13:29:34 +0100 Subject: [PATCH 36/59] Cleanup code --- dlt/common/cli/runner/inquirer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index 37b12a7f3c..ac597b7b00 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -106,13 +106,13 @@ def ask(self, message: str, options: t.List[str], default: t.Optional[str] = Non def preflight_checks(self) -> None: if pipeline_name := self.params.pipeline_name: - if pipeline_name not in self.pipelines and pipeline_name not in self.aliases: + if pipeline_name not in self.pipelines: fmt.error(f"Pipeline {pipeline_name} has not been found in pipeline script") fmt.error("You can choose one of: " + ", ".join(self.pipelines.keys())) raise PreflightError() if source_name := self.params.source_name: - if source_name not in self.sources and source_name not in self.aliases: + if source_name not in self.sources: fmt.error( f"Source or resouce with name: {source_name} has not been found in pipeline" " script" From ed1fadce7021275f6e1cb3acc6493c131f623158 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 15:14:22 +0100 Subject: [PATCH 37/59] Strip all load info access for pipeline.run calls --- dlt/common/cli/runner/source_patcher.py | 33 ++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/dlt/common/cli/runner/source_patcher.py b/dlt/common/cli/runner/source_patcher.py index 03465a3dcf..1dc3694755 100644 --- a/dlt/common/cli/runner/source_patcher.py +++ b/dlt/common/cli/runner/source_patcher.py @@ -21,11 +21,15 @@ def patch(self) -> str: # Copy original source script_lines: t.List[str] = self.visitor.source_lines[:] stub = '""" run method replaced """' - + # cases where load_info = pipeline.run(...) + # then load_info being accesses down the line + # we need to strip them as well + load_info_variables: t.List[int] = [] for node in run_nodes: # if it is a one liner if node.lineno == node.end_lineno: - script_lines[node.lineno] = None + line_of_code = script_lines[node.lineno - 1] + script_lines[node.lineno - 1] = None else: line_of_code = script_lines[node.lineno - 1] script_lines[node.lineno - 1] = self.restore_indent(line_of_code) + stub @@ -34,8 +38,29 @@ def patch(self) -> str: script_lines[start - 1] = None start += 1 - result = [line.rstrip() for line in script_lines if line] - return "\n".join(result) + if variable := self.get_load_info_variable_name(line_of_code): + load_info_variables.append(variable) + + # FIXME: properly parse AST tree and eliminate all load_info assignments + result = [] + for line in script_lines: + if line and line.strip(): + for variable in load_info_variables: + if variable in line: + line = None + + if line: + result.append(line) + + code = "\n".join(result) + return code + + def get_load_info_variable_name(self, line: str) -> t.Optional[str]: + if "=" in line: + variable, _ = line.split("=", maxsplit=2) + return variable.strip() + + return None def restore_indent(self, line: str) -> str: indent = "" From 2efc6c264f215fbd661a7215f79de5ad28a8500b Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 15:14:39 +0100 Subject: [PATCH 38/59] Adjust test pipeline --- tests/cli/cases/cli_runner/pipeline.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/cli/cases/cli_runner/pipeline.py b/tests/cli/cases/cli_runner/pipeline.py index f9c0a2be37..f822ca6b4f 100644 --- a/tests/cli/cases/cli_runner/pipeline.py +++ b/tests/cli/cases/cli_runner/pipeline.py @@ -7,7 +7,9 @@ def squares_resource(): yield {"id": idx, "square": idx * idx} -if __name__ == "__main__": - p = dlt.pipeline(pipeline_name="dummy_pipeline", destination="dummy") - load_info = p.run(squares_resource) - print(load_info) +squares_resource_instance = squares_resource() + +squares_pipeline = dlt.pipeline(pipeline_name="numbers_pipeline", destination="duckdb") +load_info = squares_pipeline.run(squares_resource) + +print(load_info) From 7b48ed442c371f3f47d190485a2470049705b52a Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 15:15:23 +0100 Subject: [PATCH 39/59] Adjust messages printed to users --- dlt/common/cli/runner/inquirer.py | 35 ++++++++++++++++-------- dlt/common/cli/runner/pipeline_script.py | 7 +++-- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index ac597b7b00..974b84bdb8 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -107,17 +107,24 @@ def ask(self, message: str, options: t.List[str], default: t.Optional[str] = Non def preflight_checks(self) -> None: if pipeline_name := self.params.pipeline_name: if pipeline_name not in self.pipelines: - fmt.error(f"Pipeline {pipeline_name} has not been found in pipeline script") - fmt.error("You can choose one of: " + ", ".join(self.pipelines.keys())) + fmt.echo( + fmt.error_style( + f"Pipeline {pipeline_name} has not been found in pipeline script" + "You can choose one of: " + + ", ".join(self.pipelines.keys()) + ) + ) raise PreflightError() if source_name := self.params.source_name: if source_name not in self.sources: - fmt.error( - f"Source or resouce with name: {source_name} has not been found in pipeline" - " script" + fmt.echo( + fmt.error_style( + f"Source or resouce with name: {source_name} has not been found in pipeline" + " script. \n You can choose one of: " + + ", ".join(self.sources.keys()) + ) ) - fmt.error("You can choose one of: " + ", ".join(self.sources.keys())) raise PreflightError() if self.params.current_dir != self.params.pipeline_workdir: @@ -140,13 +147,17 @@ def preflight_checks(self) -> None: ) fmt.echo(fmt.warning_style(message)) elif not has_cwd_config and has_pipeline_config: - fmt.error( - f"{dot_dlt} is missing in current directory but exists in pipeline script's" - " directory" + fmt.echo( + fmt.error_style( + f"{dot_dlt} is missing in current directory but exists in pipeline script's" + " directory" + ) ) - fmt.info( - f"Please change your current directory to {self.params.pipeline_workdir} and" - " try again" + fmt.echo( + fmt.info_style( + "Please change your current directory to" + f" {self.params.pipeline_workdir} and try again" + ) ) raise PreflightError() diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index 1fd72db605..9f4b8a07a7 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -4,7 +4,6 @@ import typing as t import sys -from collections import defaultdict from types import ModuleType import dlt @@ -95,7 +94,10 @@ def pipeline_members(self) -> PipelineMembers: Resources and sources must be initialized and bound beforehand. """ - members: PipelineMembers = defaultdict(dict) # type: ignore[assignment] + members: PipelineMembers = { + "pipelines": {}, + "sources": {}, + } for name, value in self.pipeline_module.__dict__.items(): # skip modules and private stuff if isinstance(value, ModuleType) or name.startswith("_"): @@ -112,7 +114,6 @@ def pipeline_members(self) -> PipelineMembers: members["sources"][name] = value else: fmt.echo(fmt.info_style(f"Resource: {value.name} is not bound, skipping.")) - return members @property From 7b5dabde0ff0bdca7425161d0711521b7b17508e Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 15:15:32 +0100 Subject: [PATCH 40/59] Update tests --- tests/cli/test_run_command.py | 40 ++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/tests/cli/test_run_command.py b/tests/cli/test_run_command.py index 665a87e8b7..eae6348405 100644 --- a/tests/cli/test_run_command.py +++ b/tests/cli/test_run_command.py @@ -14,17 +14,47 @@ from tests.utils import TESTS_ROOT RUNNER_PIPELINES = TESTS_ROOT / "cli/cases/cli_runner" -DUMMY_PIPELINE = RUNNER_PIPELINES / "pipeline.py" +TEST_PIPELINE = RUNNER_PIPELINES / "pipeline.py" def test_run_command_requires_working_directory_same_as_pipeline_working_directory(tmp_path): with io.StringIO() as buf, contextlib.redirect_stdout(buf): run_command.run_pipeline_command( - str(DUMMY_PIPELINE), - "dummy_pipeline", - "squares_resource", + str(TEST_PIPELINE), + "squares_pipeline", + "squares_resource_instance", ["write_disposition=merge", "loader_file_format=jsonl"], ) output = buf.getvalue() - assert "WARNING: Current working directory is different from the pipeline script" in output + assert "Current working directory is different from the pipeline script" in output + assert "Please change your current directory to" in output + + +def test_run_command_fails_with_relevant_error_if_pipeline_resource_or_source_not_found(tmp_path): + with io.StringIO() as buf, contextlib.redirect_stdout(buf): + run_command.run_pipeline_command( + str(TEST_PIPELINE), + "pipeline_404", + "squares_resource_instance", + ["write_disposition=merge", "loader_file_format=jsonl"], + ) + + output = buf.getvalue() + assert "Pipeline pipeline_404 has not been found in pipeline script" in output + assert "You can choose one of: squares_pipeline" in output + + with io.StringIO() as buf, contextlib.redirect_stdout(buf): + run_command.run_pipeline_command( + str(TEST_PIPELINE), + "squares_pipeline", + "resource_404", + ["write_disposition=merge", "loader_file_format=jsonl"], + ) + + output = buf.getvalue() + assert ( + "Source or resouce with name: resource_404 has not been found in pipeline script." + in output + ) + assert "You can choose one of: squares_resource_instance" in output From 2c972f06079aec310d4778660a7783cd5a2353e7 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 15:16:33 +0100 Subject: [PATCH 41/59] Fix mypy issues --- dlt/common/cli/runner/source_patcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/common/cli/runner/source_patcher.py b/dlt/common/cli/runner/source_patcher.py index 1dc3694755..60d0c5952a 100644 --- a/dlt/common/cli/runner/source_patcher.py +++ b/dlt/common/cli/runner/source_patcher.py @@ -24,7 +24,7 @@ def patch(self) -> str: # cases where load_info = pipeline.run(...) # then load_info being accesses down the line # we need to strip them as well - load_info_variables: t.List[int] = [] + load_info_variables: t.List[str] = [] for node in run_nodes: # if it is a one liner if node.lineno == node.end_lineno: From 2d2807b4527de87b9801701c11f0242b9188a39c Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 16:49:28 +0100 Subject: [PATCH 42/59] Add more tests --- dlt/common/cli/runner/inquirer.py | 11 +--- dlt/common/cli/runner/runner.py | 2 +- dlt/common/cli/runner/source_patcher.py | 25 ++++++-- tests/cli/cases/cli_runner/pipeline.py | 12 +++- tests/cli/test_run_command.py | 83 +++++++++++++++++++++---- 5 files changed, 107 insertions(+), 26 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index 974b84bdb8..88470ff2b2 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -61,14 +61,9 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: real_source_name = f" ({label}: {resource.name})" - fmt.echo( - "\nPipeline: " - + fmt.style(pipeline_name + real_pipeline_name, fg="blue", underline=True) - ) - - fmt.echo( - f"{label}: " + fmt.style(source_name + real_source_name, fg="blue", underline=True) - ) + fmt.echo("\nPipeline: " + fmt.style(pipeline_name + real_pipeline_name, fg="blue")) + + fmt.echo(f"{label}: " + fmt.style(source_name + real_source_name, fg="blue")) return pipeline, resource def get_pipeline_name(self) -> str: diff --git a/dlt/common/cli/runner/runner.py b/dlt/common/cli/runner/runner.py index d6a03253dc..476ec0393c 100644 --- a/dlt/common/cli/runner/runner.py +++ b/dlt/common/cli/runner/runner.py @@ -3,8 +3,8 @@ from typing_extensions import Self from types import TracebackType -from dlt.common.cli.runner.inquirer import Inquirer from dlt.common.pipeline import LoadInfo +from dlt.common.cli.runner.inquirer import Inquirer from dlt.common.cli.runner.pipeline_script import PipelineScript from dlt.common.cli.runner.types import RunnerParams diff --git a/dlt/common/cli/runner/source_patcher.py b/dlt/common/cli/runner/source_patcher.py index 60d0c5952a..a6a125cb64 100644 --- a/dlt/common/cli/runner/source_patcher.py +++ b/dlt/common/cli/runner/source_patcher.py @@ -43,11 +43,14 @@ def patch(self) -> str: # FIXME: properly parse AST tree and eliminate all load_info assignments result = [] - for line in script_lines: - if line and line.strip(): - for variable in load_info_variables: - if variable in line: - line = None + for line in self.without_empty_lines(script_lines): + if line is None: + continue + + for variable in load_info_variables: + if variable in str(line): + line = None + continue if line: result.append(line) @@ -62,6 +65,18 @@ def get_load_info_variable_name(self, line: str) -> t.Optional[str]: return None + def without_empty_lines(self, script_lines: t.List[str]) -> t.List[str]: + lines = [] + for line in script_lines: + if not line: + continue + if not line.strip(): + continue + + lines.append(line) + + return lines + def restore_indent(self, line: str) -> str: indent = "" for ch in line: diff --git a/tests/cli/cases/cli_runner/pipeline.py b/tests/cli/cases/cli_runner/pipeline.py index f822ca6b4f..ec1b14150d 100644 --- a/tests/cli/cases/cli_runner/pipeline.py +++ b/tests/cli/cases/cli_runner/pipeline.py @@ -1,15 +1,25 @@ import dlt +@dlt.resource +def quads_resource(): + for idx in range(10): + yield {"id": idx, "quad": idx**4} + + @dlt.resource def squares_resource(): for idx in range(10): yield {"id": idx, "square": idx * idx} +quads_resource_instance = quads_resource() squares_resource_instance = squares_resource() +quads_pipeline = dlt.pipeline(pipeline_name="numbers_quadruples_pipeline", destination="duckdb") squares_pipeline = dlt.pipeline(pipeline_name="numbers_pipeline", destination="duckdb") -load_info = squares_pipeline.run(squares_resource) + +load_info = squares_pipeline.run(quads_resource()) +load_info_2 = squares_pipeline.run(squares_resource) print(load_info) diff --git a/tests/cli/test_run_command.py b/tests/cli/test_run_command.py index eae6348405..3db82a2a28 100644 --- a/tests/cli/test_run_command.py +++ b/tests/cli/test_run_command.py @@ -1,23 +1,34 @@ import io import os import contextlib -import pytest -import logging -from subprocess import CalledProcessError -import dlt -from dlt.common.runners.venv import Venv -from dlt.common.storages.file_storage import FileStorage +from unittest import mock + +import pytest -from dlt.cli import echo, run_command +from dlt.cli import run_command +from dlt.cli.utils import parse_init_script +from dlt.common.cli.runner.inquirer import Inquirer +from dlt.common.cli.runner.runner import PipelineRunner +from dlt.common.cli.runner.source_patcher import SourcePatcher +from dlt.common.cli.runner.types import PipelineMembers from tests.utils import TESTS_ROOT RUNNER_PIPELINES = TESTS_ROOT / "cli/cases/cli_runner" TEST_PIPELINE = RUNNER_PIPELINES / "pipeline.py" +TEST_PIPELINE_CONTENTS = open(RUNNER_PIPELINES / "pipeline.py").read().strip() -def test_run_command_requires_working_directory_same_as_pipeline_working_directory(tmp_path): +@pytest.fixture(scope="module") +def ch_pipeline_dir(): + cwd = os.getcwd() + os.chdir(RUNNER_PIPELINES) + yield + os.chdir(cwd) + + +def test_run_command_requires_working_directory_same_as_pipeline_working_directory(): with io.StringIO() as buf, contextlib.redirect_stdout(buf): run_command.run_pipeline_command( str(TEST_PIPELINE), @@ -31,7 +42,7 @@ def test_run_command_requires_working_directory_same_as_pipeline_working_directo assert "Please change your current directory to" in output -def test_run_command_fails_with_relevant_error_if_pipeline_resource_or_source_not_found(tmp_path): +def test_run_command_fails_with_relevant_error_if_pipeline_resource_or_source_not_found(): with io.StringIO() as buf, contextlib.redirect_stdout(buf): run_command.run_pipeline_command( str(TEST_PIPELINE), @@ -42,7 +53,7 @@ def test_run_command_fails_with_relevant_error_if_pipeline_resource_or_source_no output = buf.getvalue() assert "Pipeline pipeline_404 has not been found in pipeline script" in output - assert "You can choose one of: squares_pipeline" in output + assert "You can choose one of: quads_pipeline, squares_pipeline" in output with io.StringIO() as buf, contextlib.redirect_stdout(buf): run_command.run_pipeline_command( @@ -57,4 +68,54 @@ def test_run_command_fails_with_relevant_error_if_pipeline_resource_or_source_no "Source or resouce with name: resource_404 has not been found in pipeline script." in output ) - assert "You can choose one of: squares_resource_instance" in output + assert "You can choose one of: quads_resource_instance, squares_resource_instance" in output + + +def test_run_command_allows_selection_of_pipeline_source_or_resource(ch_pipeline_dir): + with mock.patch("dlt.common.cli.runner.inquirer.Inquirer.ask", return_value=0) as mocked_ask: + run_command.run_pipeline_command( + str(TEST_PIPELINE), + None, + None, + ["write_disposition=append", "loader_file_format=jsonl"], + ) + assert mocked_ask.call_count == 2 + + +expected_patched_code = """ +import dlt + +@dlt.resource + +def quads_resource(): + + for idx in range(10): + + yield {"id": idx, "quad": idx**4} + +@dlt.resource + +def squares_resource(): + + for idx in range(10): + + yield {"id": idx, "square": idx * idx} + +quads_resource_instance = quads_resource() + +squares_resource_instance = squares_resource() + +quads_pipeline = dlt.pipeline(pipeline_name="numbers_quadruples_pipeline", destination="duckdb") + +squares_pipeline = dlt.pipeline(pipeline_name="numbers_pipeline", destination="duckdb") +""".strip() + + +def test_pipeline_code_patcher(): + visitor = parse_init_script( + "run", + TEST_PIPELINE_CONTENTS, + "pipeline", + ) + patcher = SourcePatcher(visitor) + assert patcher.patch().strip() == expected_patched_code From 56e7e257cc83cf8305247437e58bdde0d1b3711a Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Fri, 22 Mar 2024 17:43:19 +0100 Subject: [PATCH 43/59] Allow running of pipeline when .dlt is missing --- dlt/common/cli/runner/inquirer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index 88470ff2b2..f3edd91c04 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -150,11 +150,10 @@ def preflight_checks(self) -> None: ) fmt.echo( fmt.info_style( - "Please change your current directory to" + "If needed please change your current directory to" f" {self.params.pipeline_workdir} and try again" ) ) - raise PreflightError() def check_if_runnable(self) -> None: if not self.pipelines: From b7e3b26dcf633686b815c5309fb40adbcdd95579 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 25 Mar 2024 10:53:20 +0100 Subject: [PATCH 44/59] Adjust pipeline --- tests/cli/cases/cli_runner/pipeline.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/tests/cli/cases/cli_runner/pipeline.py b/tests/cli/cases/cli_runner/pipeline.py index ec1b14150d..fd6419b530 100644 --- a/tests/cli/cases/cli_runner/pipeline.py +++ b/tests/cli/cases/cli_runner/pipeline.py @@ -13,13 +13,27 @@ def squares_resource(): yield {"id": idx, "square": idx * idx} +@dlt.destination(loader_file_format="parquet") +def null_sink(_items, _table) -> None: + pass + + quads_resource_instance = quads_resource() squares_resource_instance = squares_resource() -quads_pipeline = dlt.pipeline(pipeline_name="numbers_quadruples_pipeline", destination="duckdb") -squares_pipeline = dlt.pipeline(pipeline_name="numbers_pipeline", destination="duckdb") +quads_pipeline = dlt.pipeline( + pipeline_name="numbers_quadruples_pipeline", + destination=null_sink, +) + +squares_pipeline = dlt.pipeline( + pipeline_name="numbers_pipeline", + destination="duckdb", +) -load_info = squares_pipeline.run(quads_resource()) -load_info_2 = squares_pipeline.run(squares_resource) +# load_info = squares_pipeline.run(quads_resource()) +# load_info_2 = squares_pipeline.run(squares_resource) -print(load_info) +# print(load_info) +load_info = quads_pipeline.run(quads_resource()) +print(load_info) \ No newline at end of file From 5f3aacf5af046258243e8d0930c8cb5262bc4784 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 25 Mar 2024 10:53:29 +0100 Subject: [PATCH 45/59] Add more tests --- tests/cli/test_run_command.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/cli/test_run_command.py b/tests/cli/test_run_command.py index 3db82a2a28..b57880d74e 100644 --- a/tests/cli/test_run_command.py +++ b/tests/cli/test_run_command.py @@ -4,15 +4,13 @@ from unittest import mock +import dlt import pytest from dlt.cli import run_command from dlt.cli.utils import parse_init_script -from dlt.common.cli.runner.inquirer import Inquirer -from dlt.common.cli.runner.runner import PipelineRunner from dlt.common.cli.runner.source_patcher import SourcePatcher -from dlt.common.cli.runner.types import PipelineMembers from tests.utils import TESTS_ROOT RUNNER_PIPELINES = TESTS_ROOT / "cli/cases/cli_runner" @@ -39,7 +37,7 @@ def test_run_command_requires_working_directory_same_as_pipeline_working_directo output = buf.getvalue() assert "Current working directory is different from the pipeline script" in output - assert "Please change your current directory to" in output + assert "If needed please change your current directory to" in output def test_run_command_fails_with_relevant_error_if_pipeline_resource_or_source_not_found(): @@ -71,7 +69,7 @@ def test_run_command_fails_with_relevant_error_if_pipeline_resource_or_source_no assert "You can choose one of: quads_resource_instance, squares_resource_instance" in output -def test_run_command_allows_selection_of_pipeline_source_or_resource(ch_pipeline_dir): +def test_run_command_allows_selection_of_pipeline_source_or_resource(): with mock.patch("dlt.common.cli.runner.inquirer.Inquirer.ask", return_value=0) as mocked_ask: run_command.run_pipeline_command( str(TEST_PIPELINE), From 2ab71252374d97ec7757f03aec8c6881ca972bad Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 25 Mar 2024 14:44:12 +0100 Subject: [PATCH 46/59] Remove redundant file --- dlt/common/cli/runner/source_patcher.py | 87 ------------------------- 1 file changed, 87 deletions(-) delete mode 100644 dlt/common/cli/runner/source_patcher.py diff --git a/dlt/common/cli/runner/source_patcher.py b/dlt/common/cli/runner/source_patcher.py deleted file mode 100644 index a6a125cb64..0000000000 --- a/dlt/common/cli/runner/source_patcher.py +++ /dev/null @@ -1,87 +0,0 @@ -"""This module provides a utility to patch source code of pipeline scripts -to remove all `pipeline.run` calls to avoid any undesired side effects when -running the pipeline with given resources and sources -""" -import typing as t - -from dlt.reflection import names as rn -from dlt.reflection.script_visitor import PipelineScriptVisitor - - -class SourcePatcher: - def __init__(self, visitor: PipelineScriptVisitor) -> None: - self.visitor = visitor - - def patch(self) -> str: - """Removes all pipeline.run nodes and return patched source code""" - # Extract pipeline.run nodes - pipeline_runs = self.visitor.known_calls_with_nodes.get(rn.RUN) - run_nodes = [run_node for _args, run_node in pipeline_runs] - - # Copy original source - script_lines: t.List[str] = self.visitor.source_lines[:] - stub = '""" run method replaced """' - # cases where load_info = pipeline.run(...) - # then load_info being accesses down the line - # we need to strip them as well - load_info_variables: t.List[str] = [] - for node in run_nodes: - # if it is a one liner - if node.lineno == node.end_lineno: - line_of_code = script_lines[node.lineno - 1] - script_lines[node.lineno - 1] = None - else: - line_of_code = script_lines[node.lineno - 1] - script_lines[node.lineno - 1] = self.restore_indent(line_of_code) + stub - start = node.lineno + 1 - while start <= node.end_lineno: - script_lines[start - 1] = None - start += 1 - - if variable := self.get_load_info_variable_name(line_of_code): - load_info_variables.append(variable) - - # FIXME: properly parse AST tree and eliminate all load_info assignments - result = [] - for line in self.without_empty_lines(script_lines): - if line is None: - continue - - for variable in load_info_variables: - if variable in str(line): - line = None - continue - - if line: - result.append(line) - - code = "\n".join(result) - return code - - def get_load_info_variable_name(self, line: str) -> t.Optional[str]: - if "=" in line: - variable, _ = line.split("=", maxsplit=2) - return variable.strip() - - return None - - def without_empty_lines(self, script_lines: t.List[str]) -> t.List[str]: - lines = [] - for line in script_lines: - if not line: - continue - if not line.strip(): - continue - - lines.append(line) - - return lines - - def restore_indent(self, line: str) -> str: - indent = "" - for ch in line: - if ch != " ": - break - - indent += " " - return indent From b911408f99eafed16b4cfa7981fed518eb59d280 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 25 Mar 2024 14:44:53 +0100 Subject: [PATCH 47/59] Monkey patch pipeline.run while loading the pipeline script --- dlt/common/cli/runner/pipeline_script.py | 51 +++++++++++++++++++----- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index 9f4b8a07a7..db36ae371c 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -4,14 +4,16 @@ import typing as t import sys +from contextlib import contextmanager from types import ModuleType import dlt from dlt.cli import echo as fmt from dlt.cli.utils import parse_init_script -from dlt.common.cli.runner.source_patcher import SourcePatcher +from dlt.common.cli.runner.errors import RunnerError from dlt.common.cli.runner.types import PipelineMembers, RunnerParams +from dlt.common.pipeline import LoadInfo from dlt.sources import DltResource, DltSource @@ -41,22 +43,48 @@ def __init__(self, params: RunnerParams) -> None: self.workdir = os.path.dirname(os.path.abspath(params.script_path)) """Directory in which pipeline script lives""" + self.has_pipeline_auto_runs: bool = False + # Now we need to patch and store pipeline code visitor = parse_init_script( COMMAND_NAME, self.script_contents, self.module_name, ) - patcher = SourcePatcher(visitor) - self.source_code = patcher.patch() + self.source_code = visitor.source def load_module(self, script_path: str) -> ModuleType: """Loads pipeline module from a given location""" - spec = importlib.util.spec_from_file_location(self.module_name, script_path) - module = importlib.util.module_from_spec(spec) - sys.modules[self.module_name] = module - spec.loader.exec_module(module) - return module + with self.expect_no_pipeline_runs(): + spec = importlib.util.spec_from_file_location(self.module_name, script_path) + module = importlib.util.module_from_spec(spec) + sys.modules[self.module_name] = module + spec.loader.exec_module(module) + if self.has_pipeline_auto_runs: + raise RunnerError( + fmt.style( + "Please move all pipeline.run calls inside __main__ or remove them", + fg="red", + ) + ) + + return module + + @contextmanager + def expect_no_pipeline_runs(self): + """Monkey patch pipeline.run during module loading + Restore it once importing is done + """ + old_run = dlt.Pipeline.run + + def noop(*args, **kwargs) -> LoadInfo: + self.has_pipeline_auto_runs = True + + dlt.Pipeline.run = noop + + yield + + dlt.Pipeline.run = old_run @property def pipeline_module(self) -> ModuleType: @@ -113,7 +141,12 @@ def pipeline_members(self) -> PipelineMembers: if value._args_bound: members["sources"][name] = value else: - fmt.echo(fmt.info_style(f"Resource: {value.name} is not bound, skipping.")) + fmt.echo( + fmt.info_style( + f"Resource: {value.name} is not bound, skipping." + ) + ) + return members @property From 6f6577c55692c5f5f3daf7fc8b83f8b4c168c9bc Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 25 Mar 2024 14:45:42 +0100 Subject: [PATCH 48/59] Add test pipeline script with immediate pipeline.run --- tests/cli/cases/cli_runner/pipeline.py | 8 +-- .../cli_runner/pipeline_with_immediate_run.py | 21 +++++++ tests/cli/test_run_command.py | 58 ++++--------------- 3 files changed, 35 insertions(+), 52 deletions(-) create mode 100644 tests/cli/cases/cli_runner/pipeline_with_immediate_run.py diff --git a/tests/cli/cases/cli_runner/pipeline.py b/tests/cli/cases/cli_runner/pipeline.py index fd6419b530..80d777f4f7 100644 --- a/tests/cli/cases/cli_runner/pipeline.py +++ b/tests/cli/cases/cli_runner/pipeline.py @@ -31,9 +31,5 @@ def null_sink(_items, _table) -> None: destination="duckdb", ) -# load_info = squares_pipeline.run(quads_resource()) -# load_info_2 = squares_pipeline.run(squares_resource) - -# print(load_info) -load_info = quads_pipeline.run(quads_resource()) -print(load_info) \ No newline at end of file +if __name__ == "__main__": + load_info = squares_pipeline.run(squares_resource(), schema=dlt.Schema("bobo-schema")) diff --git a/tests/cli/cases/cli_runner/pipeline_with_immediate_run.py b/tests/cli/cases/cli_runner/pipeline_with_immediate_run.py new file mode 100644 index 0000000000..50f3b5b6d3 --- /dev/null +++ b/tests/cli/cases/cli_runner/pipeline_with_immediate_run.py @@ -0,0 +1,21 @@ +import dlt + + +@dlt.resource +def numbers_resource(): + for idx in range(100): + yield {"id": idx, "num": idx} + + +@dlt.destination(loader_file_format="parquet") +def null_sink(_items, _table) -> None: + pass + + +numbers_pipeline = dlt.pipeline( + pipeline_name="numbers_pipeline", + destination=null_sink, +) + +load_info = numbers_pipeline.run() +print(load_info) diff --git a/tests/cli/test_run_command.py b/tests/cli/test_run_command.py index b57880d74e..6177cf220d 100644 --- a/tests/cli/test_run_command.py +++ b/tests/cli/test_run_command.py @@ -4,17 +4,15 @@ from unittest import mock -import dlt import pytest from dlt.cli import run_command -from dlt.cli.utils import parse_init_script -from dlt.common.cli.runner.source_patcher import SourcePatcher from tests.utils import TESTS_ROOT RUNNER_PIPELINES = TESTS_ROOT / "cli/cases/cli_runner" TEST_PIPELINE = RUNNER_PIPELINES / "pipeline.py" +TEST_PIPELINE_WITH_IMMEDIATE_RUN = RUNNER_PIPELINES / "pipeline_with_immediate_run.py" TEST_PIPELINE_CONTENTS = open(RUNNER_PIPELINES / "pipeline.py").read().strip() @@ -36,7 +34,9 @@ def test_run_command_requires_working_directory_same_as_pipeline_working_directo ) output = buf.getvalue() - assert "Current working directory is different from the pipeline script" in output + assert ( + "Current working directory is different from the pipeline script" in output + ) assert "If needed please change your current directory to" in output @@ -66,54 +66,20 @@ def test_run_command_fails_with_relevant_error_if_pipeline_resource_or_source_no "Source or resouce with name: resource_404 has not been found in pipeline script." in output ) - assert "You can choose one of: quads_resource_instance, squares_resource_instance" in output + assert ( + "You can choose one of: quads_resource_instance, squares_resource_instance" + in output + ) def test_run_command_allows_selection_of_pipeline_source_or_resource(): - with mock.patch("dlt.common.cli.runner.inquirer.Inquirer.ask", return_value=0) as mocked_ask: + with mock.patch( + "dlt.common.cli.runner.inquirer.Inquirer.ask", return_value=0 + ) as mocked_ask: run_command.run_pipeline_command( str(TEST_PIPELINE), None, None, - ["write_disposition=append", "loader_file_format=jsonl"], + ["write_disposition=append", "loader_file_format=parquet"], ) assert mocked_ask.call_count == 2 - - -expected_patched_code = """ -import dlt - -@dlt.resource - -def quads_resource(): - - for idx in range(10): - - yield {"id": idx, "quad": idx**4} - -@dlt.resource - -def squares_resource(): - - for idx in range(10): - - yield {"id": idx, "square": idx * idx} - -quads_resource_instance = quads_resource() - -squares_resource_instance = squares_resource() - -quads_pipeline = dlt.pipeline(pipeline_name="numbers_quadruples_pipeline", destination="duckdb") - -squares_pipeline = dlt.pipeline(pipeline_name="numbers_pipeline", destination="duckdb") -""".strip() - - -def test_pipeline_code_patcher(): - visitor = parse_init_script( - "run", - TEST_PIPELINE_CONTENTS, - "pipeline", - ) - patcher = SourcePatcher(visitor) - assert patcher.patch().strip() == expected_patched_code From a5a10c6f8beeb8206a8ea6dc98c99b0d81af0967 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 25 Mar 2024 14:51:00 +0100 Subject: [PATCH 49/59] Show RunnerError using fmt.error_style --- dlt/cli/run_command.py | 2 +- dlt/common/cli/runner/pipeline_script.py | 11 ++--------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/dlt/cli/run_command.py b/dlt/cli/run_command.py index b2cca8fb03..4b884a8a66 100644 --- a/dlt/cli/run_command.py +++ b/dlt/cli/run_command.py @@ -44,7 +44,7 @@ def run_pipeline_command( except PipelineStepFailed: raise except RunnerError as ex: - fmt.echo(ex.message) + fmt.echo(fmt.error_style(ex.message)) return -1 except (FriendlyExit, PreflightError): fmt.info("Stopping...") diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index db36ae371c..d46071fb8f 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -62,10 +62,7 @@ def load_module(self, script_path: str) -> ModuleType: spec.loader.exec_module(module) if self.has_pipeline_auto_runs: raise RunnerError( - fmt.style( - "Please move all pipeline.run calls inside __main__ or remove them", - fg="red", - ) + "Please move all pipeline.run calls inside __main__ or remove them" ) return module @@ -141,11 +138,7 @@ def pipeline_members(self) -> PipelineMembers: if value._args_bound: members["sources"][name] = value else: - fmt.echo( - fmt.info_style( - f"Resource: {value.name} is not bound, skipping." - ) - ) + fmt.echo(fmt.info_style(f"Resource: {value.name} is not bound, skipping.")) return members From 5726391992fa36d6b7d8bbd1ce6a4e883910a129 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 25 Mar 2024 15:01:16 +0100 Subject: [PATCH 50/59] Keep pytest output clean --- dlt/common/cli/runner/pipeline_script.py | 12 +---------- tests/cli/test_run_command.py | 27 +++++++++++++++++------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index d46071fb8f..9ef1943d4a 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -1,6 +1,5 @@ import os import importlib -import tempfile import typing as t import sys @@ -85,16 +84,7 @@ def noop(*args, **kwargs) -> LoadInfo: @property def pipeline_module(self) -> ModuleType: - with tempfile.NamedTemporaryFile( - mode="w+", - dir=self.workdir, - prefix="pipeline_", - suffix=".py", - ) as tm: - tm.write(self.source_code) - tm.flush() - self.module = self.load_module(tm.name) - + self.module = self.load_module(self.params.script_path) return self.module @property diff --git a/tests/cli/test_run_command.py b/tests/cli/test_run_command.py index 6177cf220d..7b252171ad 100644 --- a/tests/cli/test_run_command.py +++ b/tests/cli/test_run_command.py @@ -34,9 +34,7 @@ def test_run_command_requires_working_directory_same_as_pipeline_working_directo ) output = buf.getvalue() - assert ( - "Current working directory is different from the pipeline script" in output - ) + assert "Current working directory is different from the pipeline script" in output assert "If needed please change your current directory to" in output @@ -66,20 +64,33 @@ def test_run_command_fails_with_relevant_error_if_pipeline_resource_or_source_no "Source or resouce with name: resource_404 has not been found in pipeline script." in output ) - assert ( - "You can choose one of: quads_resource_instance, squares_resource_instance" - in output - ) + assert "You can choose one of: quads_resource_instance, squares_resource_instance" in output def test_run_command_allows_selection_of_pipeline_source_or_resource(): with mock.patch( "dlt.common.cli.runner.inquirer.Inquirer.ask", return_value=0 - ) as mocked_ask: + ) as mocked_ask, io.StringIO() as buf, contextlib.redirect_stdout(buf): run_command.run_pipeline_command( str(TEST_PIPELINE), None, None, ["write_disposition=append", "loader_file_format=parquet"], ) + + # expect 2 calls to Inquirer.ask + # first for pipeline selection + # second for reource or source assert mocked_ask.call_count == 2 + + +def test_run_command_exits_if_pipeline_run_calls_exist_at_the_top_level(): + with io.StringIO() as buf, contextlib.redirect_stdout(buf): + run_command.run_pipeline_command( + str(TEST_PIPELINE_WITH_IMMEDIATE_RUN), + None, + None, + ["write_disposition=append", "loader_file_format=parquet"], + ) + output = buf.getvalue() + assert "Please move all pipeline.run calls inside __main__ or remove them" in output From dd3395cd065953b16e0d2aa5fc55d03b2515eb86 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 25 Mar 2024 15:04:51 +0100 Subject: [PATCH 51/59] Fix mypy warning --- dlt/common/cli/runner/pipeline_script.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index 9ef1943d4a..3f70a72c0b 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -67,20 +67,20 @@ def load_module(self, script_path: str) -> ModuleType: return module @contextmanager - def expect_no_pipeline_runs(self): + def expect_no_pipeline_runs(self): # type: ignore[no-untyped-def] """Monkey patch pipeline.run during module loading Restore it once importing is done """ old_run = dlt.Pipeline.run - def noop(*args, **kwargs) -> LoadInfo: + def noop(*args, **kwargs) -> LoadInfo: # type: ignore self.has_pipeline_auto_runs = True - dlt.Pipeline.run = noop + dlt.Pipeline.run = noop # type: ignore yield - dlt.Pipeline.run = old_run + dlt.Pipeline.run = old_run # type: ignore @property def pipeline_module(self) -> ModuleType: From 0385ba30a8020f6853e2ce59d3046aa386b85c78 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 25 Mar 2024 15:09:25 +0100 Subject: [PATCH 52/59] Add more tests --- dlt/cli/run_command.py | 1 - tests/cli/test_run_command.py | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/dlt/cli/run_command.py b/dlt/cli/run_command.py index 4b884a8a66..0575f4e39e 100644 --- a/dlt/cli/run_command.py +++ b/dlt/cli/run_command.py @@ -39,7 +39,6 @@ def run_pipeline_command( with PipelineRunner(params=params) as runner: load_info = runner.run() fmt.echo("") - # FIXME: provide more useful information fmt.echo(load_info) except PipelineStepFailed: raise diff --git a/tests/cli/test_run_command.py b/tests/cli/test_run_command.py index 7b252171ad..1676aa2a3d 100644 --- a/tests/cli/test_run_command.py +++ b/tests/cli/test_run_command.py @@ -84,6 +84,21 @@ def test_run_command_allows_selection_of_pipeline_source_or_resource(): assert mocked_ask.call_count == 2 +def test_run_command_exits_if_exit_choice_selected(): + with mock.patch( + "dlt.common.cli.runner.inquirer.fmt.prompt", return_value="n" + ), io.StringIO() as buf, contextlib.redirect_stdout(buf): + run_command.run_pipeline_command( + str(TEST_PIPELINE), + None, + None, + ["write_disposition=append", "loader_file_format=parquet"], + ) + + output = buf.getvalue() + assert "Stopping..." in output + + def test_run_command_exits_if_pipeline_run_calls_exist_at_the_top_level(): with io.StringIO() as buf, contextlib.redirect_stdout(buf): run_command.run_pipeline_command( From 3ddd9fadf252a9651042b94c6aa3d276f2cb8ed4 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 25 Mar 2024 15:48:15 +0100 Subject: [PATCH 53/59] Test happy path and try to attach to pipeline once runner exits --- dlt/common/cli/runner/inquirer.py | 9 ++++-- dlt/common/cli/runner/pipeline_script.py | 4 +-- tests/cli/cases/cli_runner/pipeline.py | 12 +++---- tests/cli/test_run_command.py | 40 +++++++++++++----------- 4 files changed, 37 insertions(+), 28 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index f3edd91c04..51165f6580 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -50,7 +50,6 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: self.preflight_checks() pipeline_name = self.get_pipeline_name() pipeline = self.pipelines[pipeline_name] - real_pipeline_name = f" (Pipeline: {pipeline.pipeline_name})" source_name = self.get_source_name() resource = self.sources[source_name] @@ -59,7 +58,13 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: else: label = "Source" - real_source_name = f" ({label}: {resource.name})" + real_source_name = "" + if resource.name != source_name: + real_source_name = f" ({resource.name})" + + real_pipeline_name = "" + if pipeline.pipeline_name != pipeline_name: + real_pipeline_name = f"({pipeline.pipeline_name})" fmt.echo("\nPipeline: " + fmt.style(pipeline_name + real_pipeline_name, fg="blue")) diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index 3f70a72c0b..a396d0d600 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -71,7 +71,7 @@ def expect_no_pipeline_runs(self): # type: ignore[no-untyped-def] """Monkey patch pipeline.run during module loading Restore it once importing is done """ - old_run = dlt.Pipeline.run + original_run = dlt.Pipeline.run def noop(*args, **kwargs) -> LoadInfo: # type: ignore self.has_pipeline_auto_runs = True @@ -80,7 +80,7 @@ def noop(*args, **kwargs) -> LoadInfo: # type: ignore yield - dlt.Pipeline.run = old_run # type: ignore + dlt.Pipeline.run = original_run # type: ignore @property def pipeline_module(self) -> ModuleType: diff --git a/tests/cli/cases/cli_runner/pipeline.py b/tests/cli/cases/cli_runner/pipeline.py index 80d777f4f7..8d68b5c61c 100644 --- a/tests/cli/cases/cli_runner/pipeline.py +++ b/tests/cli/cases/cli_runner/pipeline.py @@ -4,13 +4,13 @@ @dlt.resource def quads_resource(): for idx in range(10): - yield {"id": idx, "quad": idx**4} + yield {"id": idx, "num": idx**4} @dlt.resource -def squares_resource(): +def numbers_resource(): for idx in range(10): - yield {"id": idx, "square": idx * idx} + yield {"id": idx, "num": idx + 1} @dlt.destination(loader_file_format="parquet") @@ -19,17 +19,17 @@ def null_sink(_items, _table) -> None: quads_resource_instance = quads_resource() -squares_resource_instance = squares_resource() +numbers_resource_instance = numbers_resource() quads_pipeline = dlt.pipeline( pipeline_name="numbers_quadruples_pipeline", destination=null_sink, ) -squares_pipeline = dlt.pipeline( +numbers_pipeline = dlt.pipeline( pipeline_name="numbers_pipeline", destination="duckdb", ) if __name__ == "__main__": - load_info = squares_pipeline.run(squares_resource(), schema=dlt.Schema("bobo-schema")) + load_info = numbers_pipeline.run(numbers_resource(), schema=dlt.Schema("bobo-schema")) diff --git a/tests/cli/test_run_command.py b/tests/cli/test_run_command.py index 1676aa2a3d..8cc0125a88 100644 --- a/tests/cli/test_run_command.py +++ b/tests/cli/test_run_command.py @@ -1,41 +1,45 @@ import io -import os import contextlib +import shutil from unittest import mock -import pytest +import dlt from dlt.cli import run_command -from tests.utils import TESTS_ROOT +from dlt.common.utils import set_working_dir +from tests.utils import TEST_STORAGE_ROOT, TESTS_ROOT -RUNNER_PIPELINES = TESTS_ROOT / "cli/cases/cli_runner" -TEST_PIPELINE = RUNNER_PIPELINES / "pipeline.py" -TEST_PIPELINE_WITH_IMMEDIATE_RUN = RUNNER_PIPELINES / "pipeline_with_immediate_run.py" -TEST_PIPELINE_CONTENTS = open(RUNNER_PIPELINES / "pipeline.py").read().strip() +CLI_RUNNER_PIPELINES = TESTS_ROOT / "cli/cases/cli_runner" +TEST_PIPELINE = CLI_RUNNER_PIPELINES / "pipeline.py" +TEST_PIPELINE_WITH_IMMEDIATE_RUN = CLI_RUNNER_PIPELINES / "pipeline_with_immediate_run.py" -@pytest.fixture(scope="module") -def ch_pipeline_dir(): - cwd = os.getcwd() - os.chdir(RUNNER_PIPELINES) - yield - os.chdir(cwd) +def test_run_command_happy_path_works_as_expected(): + pipeline_name = "numbers_pipeline" + p = dlt.pipeline(pipeline_name=pipeline_name) + p._wipe_working_folder() + shutil.copytree(CLI_RUNNER_PIPELINES, TEST_STORAGE_ROOT, dirs_exist_ok=True) -def test_run_command_requires_working_directory_same_as_pipeline_working_directory(): - with io.StringIO() as buf, contextlib.redirect_stdout(buf): + with io.StringIO() as buf, contextlib.redirect_stdout(buf), set_working_dir(TEST_STORAGE_ROOT): run_command.run_pipeline_command( str(TEST_PIPELINE), - "squares_pipeline", - "squares_resource_instance", + pipeline_name, + "numbers_resource_instance", ["write_disposition=merge", "loader_file_format=jsonl"], ) output = buf.getvalue() assert "Current working directory is different from the pipeline script" in output - assert "If needed please change your current directory to" in output + assert "Pipeline: numbers_pipeline" in output + assert "Resource: numbers_resource_instance (numbers_resource)" in output + assert "Pipeline numbers_pipeline load step completed" in output + assert "contains no failed jobs" in output + + # Check if we can successfully attach to pipeline + dlt.attach(pipeline_name) def test_run_command_fails_with_relevant_error_if_pipeline_resource_or_source_not_found(): From ca5aa99aafc3cea656291b6a931295811042efed Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 25 Mar 2024 15:59:29 +0100 Subject: [PATCH 54/59] Cleanup tests --- dlt/common/cli/runner/inquirer.py | 2 +- tests/cli/cases/cli_runner/pipeline.py | 2 +- tests/cli/test_run_command.py | 16 +++++++++------- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/dlt/common/cli/runner/inquirer.py b/dlt/common/cli/runner/inquirer.py index 51165f6580..1143f1b677 100644 --- a/dlt/common/cli/runner/inquirer.py +++ b/dlt/common/cli/runner/inquirer.py @@ -64,7 +64,7 @@ def maybe_ask(self) -> t.Tuple[dlt.Pipeline, t.Union[DltResource, DltSource]]: real_pipeline_name = "" if pipeline.pipeline_name != pipeline_name: - real_pipeline_name = f"({pipeline.pipeline_name})" + real_pipeline_name = f" ({pipeline.pipeline_name})" fmt.echo("\nPipeline: " + fmt.style(pipeline_name + real_pipeline_name, fg="blue")) diff --git a/tests/cli/cases/cli_runner/pipeline.py b/tests/cli/cases/cli_runner/pipeline.py index 8d68b5c61c..5473744448 100644 --- a/tests/cli/cases/cli_runner/pipeline.py +++ b/tests/cli/cases/cli_runner/pipeline.py @@ -27,7 +27,7 @@ def null_sink(_items, _table) -> None: ) numbers_pipeline = dlt.pipeline( - pipeline_name="numbers_pipeline", + pipeline_name="my_numbers_pipeline", destination="duckdb", ) diff --git a/tests/cli/test_run_command.py b/tests/cli/test_run_command.py index 8cc0125a88..0a41083fbc 100644 --- a/tests/cli/test_run_command.py +++ b/tests/cli/test_run_command.py @@ -18,7 +18,9 @@ def test_run_command_happy_path_works_as_expected(): + # pipeline variable name pipeline_name = "numbers_pipeline" + real_pipeline_name = "my_numbers_pipeline" p = dlt.pipeline(pipeline_name=pipeline_name) p._wipe_working_folder() shutil.copytree(CLI_RUNNER_PIPELINES, TEST_STORAGE_ROOT, dirs_exist_ok=True) @@ -33,13 +35,13 @@ def test_run_command_happy_path_works_as_expected(): output = buf.getvalue() assert "Current working directory is different from the pipeline script" in output - assert "Pipeline: numbers_pipeline" in output + assert "Pipeline: numbers_pipeline (my_numbers_pipeline)" in output assert "Resource: numbers_resource_instance (numbers_resource)" in output - assert "Pipeline numbers_pipeline load step completed" in output + assert "Pipeline my_numbers_pipeline load step completed" in output assert "contains no failed jobs" in output # Check if we can successfully attach to pipeline - dlt.attach(pipeline_name) + dlt.attach(real_pipeline_name) def test_run_command_fails_with_relevant_error_if_pipeline_resource_or_source_not_found(): @@ -47,18 +49,18 @@ def test_run_command_fails_with_relevant_error_if_pipeline_resource_or_source_no run_command.run_pipeline_command( str(TEST_PIPELINE), "pipeline_404", - "squares_resource_instance", + "numbers_resource_instance", ["write_disposition=merge", "loader_file_format=jsonl"], ) output = buf.getvalue() assert "Pipeline pipeline_404 has not been found in pipeline script" in output - assert "You can choose one of: quads_pipeline, squares_pipeline" in output + assert "You can choose one of: quads_pipeline, numbers_pipeline" in output with io.StringIO() as buf, contextlib.redirect_stdout(buf): run_command.run_pipeline_command( str(TEST_PIPELINE), - "squares_pipeline", + "numbers_pipeline", "resource_404", ["write_disposition=merge", "loader_file_format=jsonl"], ) @@ -68,7 +70,7 @@ def test_run_command_fails_with_relevant_error_if_pipeline_resource_or_source_no "Source or resouce with name: resource_404 has not been found in pipeline script." in output ) - assert "You can choose one of: quads_resource_instance, squares_resource_instance" in output + assert "You can choose one of: quads_resource_instance, numbers_resource_instance" in output def test_run_command_allows_selection_of_pipeline_source_or_resource(): From ce0f5bb26f01665acee341389d5b36ba8ff506df Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 25 Mar 2024 16:47:42 +0100 Subject: [PATCH 55/59] Add help message --- dlt/cli/_dlt.py | 79 ++++++++++++++++++++++++++--------- dlt/common/cli/runner/help.py | 21 ++++++++++ 2 files changed, 80 insertions(+), 20 deletions(-) create mode 100644 dlt/common/cli/runner/help.py diff --git a/dlt/cli/_dlt.py b/dlt/cli/_dlt.py index b8e0801095..7a289a4265 100644 --- a/dlt/cli/_dlt.py +++ b/dlt/cli/_dlt.py @@ -22,6 +22,7 @@ ) from dlt.cli.pipeline_command import pipeline_command, DLT_PIPELINE_COMMAND_DOCS_URL from dlt.cli.run_command import run_pipeline_command +from dlt.common.cli.runner.help import run_args_help from dlt.cli.telemetry_command import ( DLT_TELEMETRY_DOCS_URL, change_telemetry_status_command, @@ -136,7 +137,11 @@ def deploy_command_wrapper( @utils.track_command("pipeline", True, "operation") def pipeline_command_wrapper( - operation: str, pipeline_name: str, pipelines_dir: str, verbosity: int, **command_kwargs: Any + operation: str, + pipeline_name: str, + pipelines_dir: str, + verbosity: int, + **command_kwargs: Any, ) -> int: try: pipeline_command(operation, pipeline_name, pipelines_dir, verbosity, **command_kwargs) @@ -206,7 +211,11 @@ def __init__( help: str = None, # noqa ) -> None: super(TelemetryAction, self).__init__( - option_strings=option_strings, dest=dest, default=default, nargs=0, help=help + option_strings=option_strings, + dest=dest, + default=default, + nargs=0, + help=help, ) def __call__( @@ -231,7 +240,11 @@ def __init__( help: str = None, # noqa ) -> None: super(NonInteractiveAction, self).__init__( - option_strings=option_strings, dest=dest, default=default, nargs=0, help=help + option_strings=option_strings, + dest=dest, + default=default, + nargs=0, + help=help, ) def __call__( @@ -253,7 +266,11 @@ def __init__( help: str = None, # noqa ) -> None: super(DebugAction, self).__init__( - option_strings=option_strings, dest=dest, default=default, nargs=0, help=help + option_strings=option_strings, + dest=dest, + default=default, + nargs=0, + help=help, ) def __call__( @@ -274,7 +291,9 @@ def main() -> int: formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( - "--version", action="version", version="%(prog)s {version}".format(version=__version__) + "--version", + action="version", + version="%(prog)s {version}".format(version=__version__), ) parser.add_argument( "--disable-telemetry", @@ -367,7 +386,9 @@ def main() -> int: "deploy", help="Creates a deployment package for a selected pipeline script" ) deploy_cmd.add_argument( - "pipeline_script_path", metavar="pipeline-script-path", help="Path to a pipeline script" + "pipeline_script_path", + metavar="pipeline-script-path", + help="Path to a pipeline script", ) deploy_sub_parsers = deploy_cmd.add_subparsers(dest="deployment_method") @@ -424,25 +445,37 @@ def main() -> int: ) deploy_cmd.add_argument("--help", "-h", nargs="?", const=True) deploy_cmd.add_argument( - "pipeline_script_path", metavar="pipeline-script-path", nargs=argparse.REMAINDER + "pipeline_script_path", + metavar="pipeline-script-path", + nargs=argparse.REMAINDER, ) schema = subparsers.add_parser("schema", help="Shows, converts and upgrades schemas") schema.add_argument( - "file", help="Schema file name, in yaml or json format, will autodetect based on extension" + "file", + help="Schema file name, in yaml or json format, will autodetect based on extension", ) schema.add_argument( - "--format", choices=["json", "yaml"], default="yaml", help="Display schema in this format" + "--format", + choices=["json", "yaml"], + default="yaml", + help="Display schema in this format", ) schema.add_argument( - "--remove-defaults", action="store_true", help="Does not show default hint values" + "--remove-defaults", + action="store_true", + help="Does not show default hint values", ) pipe_cmd = subparsers.add_parser( "pipeline", help="Operations on pipelines that were ran locally" ) pipe_cmd.add_argument( - "--list-pipelines", "-l", default=False, action="store_true", help="List local pipelines" + "--list-pipelines", + "-l", + default=False, + action="store_true", + help="List local pipelines", ) pipe_cmd.add_argument( "--hot-reload", @@ -465,10 +498,12 @@ def main() -> int: pipe_cmd_sync_parent = argparse.ArgumentParser(add_help=False) pipe_cmd_sync_parent.add_argument( - "--destination", help="Sync from this destination when local pipeline state is missing." + "--destination", + help="Sync from this destination when local pipeline state is missing.", ) pipe_cmd_sync_parent.add_argument( - "--dataset-name", help="Dataset name to sync from when local pipeline state is missing." + "--dataset-name", + help="Dataset name to sync from when local pipeline state is missing.", ) pipeline_subparsers.add_parser( @@ -511,7 +546,9 @@ def main() -> int: help="Display schema in this format", ) pipe_cmd_schema.add_argument( - "--remove-defaults", action="store_true", help="Does not show default hint values" + "--remove-defaults", + action="store_true", + help="Does not show default hint values", ) pipe_cmd_drop = pipeline_subparsers.add_parser( @@ -553,7 +590,8 @@ def main() -> int: ) pipe_cmd_package = pipeline_subparsers.add_parser( - "load-package", help="Displays information on load package, use -v or -vv for more info" + "load-package", + help="Displays information on load package, use -v or -vv for more info", ) pipe_cmd_package.add_argument( "load_id", @@ -599,10 +637,7 @@ def main() -> int: "-a", nargs="+", default=[], - help=( - "Arguments passed to pipeline.run, example: " - "--args write_disposition=merge loader_file_format=jsonl" - ), + help=run_args_help, ) args = parser.parse_args() @@ -645,7 +680,11 @@ def main() -> int: return -1 else: return init_command_wrapper( - args.source, args.destination, args.generic, args.location, args.branch + args.source, + args.destination, + args.generic, + args.location, + args.branch, ) elif args.command == "deploy": try: diff --git a/dlt/common/cli/runner/help.py b/dlt/common/cli/runner/help.py new file mode 100644 index 0000000000..4bf8b112f3 --- /dev/null +++ b/dlt/common/cli/runner/help.py @@ -0,0 +1,21 @@ +from typing_extensions import get_args +from dlt.common.destination.capabilities import TLoaderFileFormat +from dlt.common.schema.typing import TSchemaEvolutionMode, TWriteDisposition + + +supported_formats = "|".join(get_args(TLoaderFileFormat)) +supported_evolution_modes = "|".join(get_args(TSchemaEvolutionMode)) +supported_write_disposition = "|".join(get_args(TWriteDisposition)) + +run_args_help = ( + "Arguments passed to pipeline.run, allowed options " + "destination=string,\n" + "staging=string,\n" + "credentials=string,\n" + "table_name=string,\n" + f"write_disposition={supported_write_disposition},\n" + "dataset_name=string,\n" + "primary_key=string,\n" + f"schema_contract={supported_evolution_modes},\n" + f"loader_file_format={supported_formats},\n" +) From 71c7c4a966f70c216fe8401c61679f790260aa11 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 25 Mar 2024 16:49:59 +0100 Subject: [PATCH 56/59] Add simple validator for pipeline arguments --- dlt/common/cli/runner/pipeline_script.py | 53 ++++++++++++++++++++++++ dlt/common/cli/runner/runner.py | 1 + 2 files changed, 54 insertions(+) diff --git a/dlt/common/cli/runner/pipeline_script.py b/dlt/common/cli/runner/pipeline_script.py index a396d0d600..2009a9b603 100644 --- a/dlt/common/cli/runner/pipeline_script.py +++ b/dlt/common/cli/runner/pipeline_script.py @@ -5,6 +5,7 @@ from contextlib import contextmanager from types import ModuleType +from typing_extensions import get_args import dlt @@ -12,7 +13,9 @@ from dlt.cli.utils import parse_init_script from dlt.common.cli.runner.errors import RunnerError from dlt.common.cli.runner.types import PipelineMembers, RunnerParams +from dlt.common.destination.capabilities import TLoaderFileFormat from dlt.common.pipeline import LoadInfo +from dlt.common.schema.typing import TSchemaEvolutionMode, TWriteDisposition from dlt.sources import DltResource, DltSource @@ -43,6 +46,9 @@ def __init__(self, params: RunnerParams) -> None: """Directory in which pipeline script lives""" self.has_pipeline_auto_runs: bool = False + self.supported_formats = get_args(TLoaderFileFormat) + self.supported_evolution_modes = get_args(TSchemaEvolutionMode) + self.supported_write_disposition = get_args(TWriteDisposition) # Now we need to patch and store pipeline code visitor = parse_init_script( @@ -143,3 +149,50 @@ def run_arguments(self) -> t.Dict[str, str]: run_options[arg_name] = value return run_options + + def validate_arguments(self) -> None: + """Validates and checks""" + supported_args = { + "destination", + "staging", + "credentials", + "table_name", + "write_disposition", + "dataset_name", + "primary_key", + "schema_contract", + "loader_file_format", + } + errors = [] + arguments = self.run_arguments + for arg_name, _ in arguments.items(): + if arg_name not in supported_args: + errors.append(f"Invalid argument {arg_name}") + + if ( + "write_disposition" in arguments + and arguments["write_disposition"] not in self.supported_write_disposition + ): + errors.append( + "Invalid write disposition, select one of" + f" {'|'.join(self.supported_write_disposition)}" + ) + if ( + "loader_file_format" in arguments + and arguments["loader_file_format"] not in self.supported_formats + ): + errors.append( + f"Invalid loader file format, select one of {'|'.join(self.supported_formats)}" + ) + if ( + "schema_contract" in arguments + and arguments["schema_contract"] not in self.supported_evolution_modes + ): + errors.append( + "Invalid schema_contract mode, select one of" + f" {'|'.join(self.supported_evolution_modes)}" + ) + + if errors: + all_errors = "\n".join(errors) + raise RunnerError(f"One or more arguments are invalid:\n{all_errors}") diff --git a/dlt/common/cli/runner/runner.py b/dlt/common/cli/runner/runner.py index 476ec0393c..dc99c41e58 100644 --- a/dlt/common/cli/runner/runner.py +++ b/dlt/common/cli/runner/runner.py @@ -13,6 +13,7 @@ class PipelineRunner: def __init__(self, params: RunnerParams) -> None: self.params = params self.script = PipelineScript(params) + self.script.validate_arguments() self.params.pipeline_workdir = self.script.workdir self.inquirer = Inquirer(self.params, self.script.pipeline_members) From 26b43f37b9f0755c9903d535481b5a0e10ce5c71 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Mon, 25 Mar 2024 17:15:01 +0100 Subject: [PATCH 57/59] Use raw formatter --- dlt/cli/_dlt.py | 6 +++++- dlt/common/cli/runner/help.py | 25 ++++++++++++++----------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/dlt/cli/_dlt.py b/dlt/cli/_dlt.py index 7a289a4265..c0d9918ef3 100644 --- a/dlt/cli/_dlt.py +++ b/dlt/cli/_dlt.py @@ -603,7 +603,11 @@ def main() -> int: subparsers.add_parser("telemetry", help="Shows telemetry status") # CLI pipeline runner - run_cmd = subparsers.add_parser("run", help="Run pipelines in a given directory") + run_cmd = subparsers.add_parser( + "run", + help="Run pipelines in a given directory", + formatter_class=argparse.RawTextHelpFormatter, + ) run_cmd.add_argument( "module", diff --git a/dlt/common/cli/runner/help.py b/dlt/common/cli/runner/help.py index 4bf8b112f3..1847c8d5a0 100644 --- a/dlt/common/cli/runner/help.py +++ b/dlt/common/cli/runner/help.py @@ -1,4 +1,5 @@ from typing_extensions import get_args +from dlt.cli import echo as fmt from dlt.common.destination.capabilities import TLoaderFileFormat from dlt.common.schema.typing import TSchemaEvolutionMode, TWriteDisposition @@ -7,15 +8,17 @@ supported_evolution_modes = "|".join(get_args(TSchemaEvolutionMode)) supported_write_disposition = "|".join(get_args(TWriteDisposition)) -run_args_help = ( - "Arguments passed to pipeline.run, allowed options " - "destination=string,\n" - "staging=string,\n" - "credentials=string,\n" - "table_name=string,\n" - f"write_disposition={supported_write_disposition},\n" - "dataset_name=string,\n" - "primary_key=string,\n" - f"schema_contract={supported_evolution_modes},\n" - f"loader_file_format={supported_formats},\n" +run_args_help = "".join( + ( + "Supported arguments passed to pipeline.run are", + fmt.info_style("\n - destination=string"), + fmt.info_style("\n - staging=string,"), + fmt.info_style("\n - credentials=string,"), + fmt.info_style("\n - table_name=string,"), + fmt.info_style(f"\n - write_disposition={supported_write_disposition},"), + fmt.info_style("\n - dataset_name=string,"), + fmt.info_style("\n - primary_key=string,"), + fmt.info_style(f"\n - schema_contract={supported_evolution_modes},"), + fmt.info_style(f"\n - loader_file_format={supported_formats}"), + ) ) From 9d94febc4874f0c6e35776a24789229b89e7b77f Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Tue, 26 Mar 2024 09:51:11 +0100 Subject: [PATCH 58/59] Check if data persisted after the pipeline run --- tests/cli/test_run_command.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/cli/test_run_command.py b/tests/cli/test_run_command.py index 0a41083fbc..83b4438386 100644 --- a/tests/cli/test_run_command.py +++ b/tests/cli/test_run_command.py @@ -41,7 +41,20 @@ def test_run_command_happy_path_works_as_expected(): assert "contains no failed jobs" in output # Check if we can successfully attach to pipeline - dlt.attach(real_pipeline_name) + pipeline = dlt.attach(real_pipeline_name) + assert pipeline.schema_names == ["my_numbers"] + + trace = pipeline.last_trace + assert trace is not None + assert len(trace.steps) == 4 + + step = trace.steps[0] + assert step.step == "extract" + + with pipeline.sql_client() as c: + with c.execute_query("select count(id) from numbers_resource") as cur: + row = list(cur.fetchall())[0] + assert row[0] == 10 def test_run_command_fails_with_relevant_error_if_pipeline_resource_or_source_not_found(): From a193408fd85cd9b32e58f1474a533cf69b515240 Mon Sep 17 00:00:00 2001 From: Sultan Iman Date: Tue, 26 Mar 2024 10:27:42 +0100 Subject: [PATCH 59/59] Add pipeline.run argument tests --- tests/cli/test_run_command.py | 52 +++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/cli/test_run_command.py b/tests/cli/test_run_command.py index 83b4438386..a379288655 100644 --- a/tests/cli/test_run_command.py +++ b/tests/cli/test_run_command.py @@ -2,9 +2,11 @@ import contextlib import shutil +from typing import List from unittest import mock import dlt +import pytest from dlt.cli import run_command @@ -128,3 +130,53 @@ def test_run_command_exits_if_pipeline_run_calls_exist_at_the_top_level(): ) output = buf.getvalue() assert "Please move all pipeline.run calls inside __main__ or remove them" in output + + +@pytest.mark.parametrize( + "arguments,expected_error_message", + [ + [ + ["write_disposition=merge", "loader_file_format=markdown"], + ( + "Invalid loader file format, select one of" + " jsonl|puae-jsonl|insert_values|sql|parquet|reference|arrow" + ), + ], + [ + ["write_disposition=guess", "loader_file_format=jsonl"], + "Invalid write disposition, select one of skip|append|replace|merge", + ], + [ + ["schema_contract=guess", "loader_file_format=jsonl"], + "Invalid schema_contract mode, select one of evolve|discard_value|freeze|discard_row", + ], + [ + ["yolo=yes", "moon=yes"], + "Invalid argument yolo\nInvalid argument moon", + ], + # Good case + [["write_disposition=append", "loader_file_format=parquet"], ""], + ], +) +def test_run_command_with_invalid_pipeline_run_arguments( + arguments: List[str], expected_error_message: str +): + pipeline_name = "numbers_pipeline" + p = dlt.pipeline(pipeline_name=pipeline_name) + p._wipe_working_folder() + shutil.copytree(CLI_RUNNER_PIPELINES, TEST_STORAGE_ROOT, dirs_exist_ok=True) + + with io.StringIO() as buf, contextlib.redirect_stdout(buf), set_working_dir(TEST_STORAGE_ROOT): + run_command.run_pipeline_command( + str(TEST_PIPELINE), + pipeline_name, + "numbers_resource_instance", + arguments, + ) + + output = buf.getvalue() + if expected_error_message: + assert expected_error_message in output + else: + # Check if we can attach to pipeline + dlt.attach("my_numbers_pipeline")