From 63c18fed82ad2618ae5f95274ceadf13a9fbe03c Mon Sep 17 00:00:00 2001 From: rudolfix Date: Tue, 14 Nov 2023 15:30:22 +0100 Subject: [PATCH 1/6] adds more name hashes to telemetry (#764) --- dlt/pipeline/track.py | 7 +++++-- .../dlt-ecosystem/destinations/snowflake.md | 1 + docs/website/docs/reference/telemetry.md | 5 ++++- tests/pipeline/test_pipeline_trace.py | 20 ++++++++++++++++++- 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/dlt/pipeline/track.py b/dlt/pipeline/track.py index 8d3e9bfb98..ec42bc788f 100644 --- a/dlt/pipeline/track.py +++ b/dlt/pipeline/track.py @@ -3,8 +3,8 @@ from typing import Any import humanize -from dlt.common import pendulum -from dlt.common import logger +from dlt.common import pendulum, logger +from dlt.common.utils import digest128 from dlt.common.runtime.exec_info import github_info from dlt.common.runtime.segment import track as dlthub_telemetry_track from dlt.common.runtime.slack import send_slack_message @@ -88,6 +88,9 @@ def on_end_trace_step(trace: PipelineTrace, step: PipelineStepTrace, pipeline: S "elapsed": (step.finished_at - trace.started_at).total_seconds(), "success": step.step_exception is None, "destination_name": DestinationReference.to_name(pipeline.destination) if pipeline.destination else None, + "pipeline_name_hash": digest128(pipeline.pipeline_name), + "dataset_name_hash": digest128(pipeline.dataset_name) if pipeline.dataset_name else None, + "default_schema_name_hash": digest128(pipeline.default_schema_name) if pipeline.default_schema_name else None, "transaction_id": trace.transaction_id } # disable automatic slack messaging until we can configure messages themselves diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index 5efc31dde8..40e54e426a 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -69,6 +69,7 @@ You can also decrease the suspend time for your warehouse to 1 minute (**Admin** Snowflake destination accepts three authentication types - password authentication - [key pair authentication](https://docs.snowflake.com/en/user-guide/key-pair-auth) +- external authentication The **password authentication** is not any different from other databases like Postgres or Redshift. `dlt` follows the same syntax as [SQLAlchemy dialect](https://docs.snowflake.com/en/developer-guide/python-connector/sqlalchemy#required-parameters). diff --git a/docs/website/docs/reference/telemetry.md b/docs/website/docs/reference/telemetry.md index 359dc8588c..85a851152e 100644 --- a/docs/website/docs/reference/telemetry.md +++ b/docs/website/docs/reference/telemetry.md @@ -51,7 +51,7 @@ Anonymous telemetry is sent when: case of `dlt init` command, we also send the requested destination and data source names. - When `pipeline.run` is called, we send information when [extract, normalize and load](explainers/how-dlt-works.md) steps are completed. The data contains - the destination name (e.g. `duckdb`), destination fingerprint (which is a hash of selected destination configuration fields), elapsed time, and if the step succeeded or not. + the destination name (e.g. `duckdb`), hashes of: dataset name, pipeline name, default schema name, destination fingerprint (which is a hash of selected destination configuration fields), elapsed time, and if the step succeeded or not. 
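As a quick illustration of the bullet above: the name hashes added in `track.py` are 128-bit content digests produced by `digest128` from `dlt.common.utils`. The sketch below uses made-up names to show what ends up in the event properties; the property keys and the `None` fallback mirror the `on_end_trace_step` change in this patch, while the names themselves are placeholders.

```python
from dlt.common.utils import digest128

# placeholder names - in track.py these come from the active pipeline object
pipeline_name = "my_pipeline"
dataset_name = "my_pipeline_dataset"
default_schema_name = "my_source"

# only irreversible content hashes are attached to the event, never the raw names
properties = {
    "pipeline_name_hash": digest128(pipeline_name),
    "dataset_name_hash": digest128(dataset_name) if dataset_name else None,
    "default_schema_name_hash": digest128(default_schema_name) if default_schema_name else None,
}
print(properties)
```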
- When `dbt` and `airflow` helpers are used Here is an example `dlt init` telemetry message: @@ -108,6 +108,9 @@ Example for `load` pipeline run step: "properties": { "destination_name": "duckdb", "destination_fingerprint": "", + "pipeline_name_hash": "OpVShb3cX7qQAmOZSbV8", + "dataset_name_hash": "Hqk0a3Ov5AD55KjSg2rC", + "default_schema_name_hash": "Hqk0a3Ov5AD55KjSg2rC", "elapsed": 2.234885, "event_category": "pipeline", "event_name": "load", diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py index f778e79b01..706644b60e 100644 --- a/tests/pipeline/test_pipeline_trace.py +++ b/tests/pipeline/test_pipeline_trace.py @@ -16,6 +16,7 @@ from dlt.common.schema import Schema from dlt.common.runtime.telemetry import stop_telemetry from dlt.common.typing import DictStrAny, StrStr, DictStrStr, TSecretValue +from dlt.common.utils import digest128 from dlt.pipeline.exceptions import PipelineStepFailed from dlt.pipeline.pipeline import Pipeline @@ -231,12 +232,13 @@ def test_load_none_trace() -> None: def test_trace_telemetry() -> None: with patch("dlt.common.runtime.sentry.before_send", _mock_sentry_before_send), patch("dlt.common.runtime.segment.before_send", _mock_segment_before_send): + # os.environ["FAIL_PROB"] = "1.0" # make it complete immediately start_test_telemetry() SEGMENT_SENT_ITEMS.clear() SENTRY_SENT_ITEMS.clear() # default dummy fails all files - dlt.pipeline().run([1,2,3], table_name="data", destination="dummy") + load_info = dlt.pipeline().run([1,2,3], table_name="data", destination="dummy", dataset_name="data_data") # we should have 4 segment items assert len(SEGMENT_SENT_ITEMS) == 4 expected_steps = ["extract", "normalize", "load", "run"] @@ -244,6 +246,9 @@ def test_trace_telemetry() -> None: assert event["event"] == f"pipeline_{step}" assert event["properties"]["success"] is True assert event["properties"]["destination_name"] == "dummy" + assert event["properties"]["pipeline_name_hash"] == digest128(load_info.pipeline.pipeline_name) + assert event["properties"]["dataset_name_hash"] == digest128(load_info.pipeline.dataset_name) + assert event["properties"]["default_schema_name_hash"] == digest128(load_info.pipeline.default_schema_name) assert isinstance(event["properties"]["elapsed"], float) assert isinstance(event["properties"]["transaction_id"], str) # check extract info @@ -277,6 +282,19 @@ def data(): # we didn't log any errors assert len(SENTRY_SENT_ITEMS) == 0 + # trace without destination and dataset + p = dlt.pipeline(pipeline_name="fresh").drop() + SEGMENT_SENT_ITEMS.clear() + SENTRY_SENT_ITEMS.clear() + p.extract([1, 2, 3], table_name="data") + event = SEGMENT_SENT_ITEMS[0] + assert event["event"] == "pipeline_extract" + assert event["properties"]["success"] is True + assert event["properties"]["destination_name"] is None + assert event["properties"]["pipeline_name_hash"] == digest128("fresh") + assert event["properties"]["dataset_name_hash"] == digest128(p.dataset_name) + assert event["properties"]["default_schema_name_hash"] == digest128(p.default_schema_name) + def test_extract_data_describe() -> None: schema = Schema("test") From f72c331d1f98de289608c502bca3fa74a8c24833 Mon Sep 17 00:00:00 2001 From: anuunchin <88698977+anuunchin@users.noreply.github.com> Date: Thu, 16 Nov 2023 18:40:58 +0100 Subject: [PATCH 2/6] Copy improvements in the SQL Database verified source (#749) * Copy improvements in the SQL Database verified source --- .../verified-sources/sql_database.md | 139 ++++++++++-------- 
.../docs/examples/nested_data/index.md | 6 +- 2 files changed, 79 insertions(+), 66 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md index f219d7c0be..3a93ec1052 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md @@ -1,4 +1,4 @@ -# SQL Database +# 30+ SQL Databases :::info Need help deploying these sources, or figuring out how to run them in your data stack? @@ -9,9 +9,11 @@ or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support SQL databases are management systems (DBMS) that store data in a structured format, commonly used for efficient and reliable data retrieval. -This SQL database `dlt` verified source and -[pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/sql_database_pipeline.py) -loads data using SqlAlchemy to the destination of your choice. +Our SQL Database verified source loads data to your specified destination using SQLAlchemy. + +:::tip +View the pipeline example [here](https://github.com/dlt-hub/verified-sources/blob/master/sources/sql_database_pipeline.py). +::: Sources and resources that can be loaded using this verified source are: @@ -20,18 +22,41 @@ Sources and resources that can be loaded using this verified source are: | sql_database | Retrieves data from an SQL database | | sql_table | Retrieves data from an SQL database table | +### Supported databases + +We support all [SQLAlchemy dialects](https://docs.sqlalchemy.org/en/20/dialects/), which include, but are not limited to, the following database engines: + +* PostgreSQL +* MySQL +* SQLite +* Oracle +* Microsoft SQL Server +* MariaDB +* IBM DB2 and Informix +* Google BigQuery +* Snowflake +* Redshift +* Apache Hive and Presto +* SAP Hana +* CockroachDB +* Firebird +* Teradata Vantage + +:::note +Note that there many unofficial dialects, such as [DuckDB](https://duckdb.org/). +::: + ## Setup Guide ### Grab credentials -This verified source utilizes SQLAlchemy for database connectivity. Let us consider this public -database example: +This verified source utilizes SQLAlchemy for database connectivity. Let's take a look at the following public database example: `connection_url = "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam"` -> This public database doesn't require a password. +The database above doesn't require a password. -Connection URL can be broken down into: +The connection URL can be broken down into: ```python connection_url = "connection_string = f"{drivername}://{username}:{password}@{host}:{port}/{database}" @@ -52,34 +77,34 @@ connection_url = "connection_string = f"{drivername}://{username}:{password}@{ho - E.g., A public database at "mysql-rfam-public.ebi.ac.uk" hosted by EBI. -`port`: The port for the database connection. E.g., "4497", in the above connection URL. +`port`: The port for the database connection. + +- E.g., "4497", in the above connection URL. +`port`: The port for the database connection. + +- E.g., "4497", in the above connection URL. `database`: The specific database on the server. - E.g., Connecting to the "Rfam" database. -### Provide special options in connection string +### Configure connection -Here we use `mysql` and `pymysql` dialect to set up SSL connection to a server. All information -taken from the -[SQLAlchemy docs](https://docs.sqlalchemy.org/en/14/dialects/mysql.html#ssl-connections). 
+Here, we use the `mysql` and `pymysql` dialects to set up an SSL connection to a server, with all information taken from the [SQLAlchemy docs](https://docs.sqlalchemy.org/en/14/dialects/mysql.html#ssl-connections). -1. To force SSL on the client without a client certificate you may pass the following DSN: +1. To enforce SSL on the client without a client certificate you may pass the following DSN: ```toml sources.sql_database.credentials="mysql+pymysql://root:@:3306/mysql?ssl_ca=" ``` -1. You can also pass server public certificate as a file. For servers with a public certificate - (potentially bundled with your pipeline) and disabling host name checks: +1. You can also pass the server's public certificate (potentially bundled with your pipeline) and disable host name checks: ```toml sources.sql_database.credentials="mysql+pymysql://root:@:3306/mysql?ssl_ca=server-ca.pem&ssl_check_hostname=false" ``` -1. For servers requiring a client certificate, provide the client's private key (a secret value). In - Airflow, this is usually saved as a variable and exported to a file before use. Server cert is - omitted in the example below: +1. For servers requiring a client certificate, provide the client's private key (a secret value). In Airflow, this is usually saved as a variable and exported to a file before use. The server certificate is omitted in the example below: ```toml sources.sql_database.credentials="mysql+pymysql://root:@35.203.96.191:3306/mysql?ssl_ca=&ssl_cert=client-cert.pem&ssl_key=client-key.pem" @@ -95,13 +120,14 @@ To get started with your data pipeline, follow these steps: dlt init sql_database duckdb ``` - [This command](../../reference/command-line-interface) will initialize + It will initialize [the pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/sql_database_pipeline.py) - with SQL database as the [source](../../general-usage/source) and - [duckdb](../destinations/duckdb.md) as the [destination](../destinations). + with an SQL database as the [source](../../general-usage/source) and + [DuckDB](../destinations/duckdb.md) as the [destination](../destinations). -1. If you'd like to use a different destination, simply replace `duckdb` with the name of your - preferred [destination](../destinations). + :::tip + If you'd like to use a different destination, simply replace `duckdb` with the name of your preferred [destination](../destinations). + ::: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. @@ -131,7 +157,7 @@ For more information, read the sources.sql_database.credentials="mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam" ``` -1. Alternatively, you can also pass credentials in the pipeline script like this: +1. You can also pass credentials in the pipeline script the following way: ```python credentials = ConnectionStringCredentials( @@ -143,8 +169,7 @@ For more information, read the > [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/sql_database_pipeline.py) > for details. -1. Finally, follow the instructions in [Destinations](../destinations/) to add credentials for your - chosen destination. This will ensure that your data is properly routed to its final destination. +1. Finally, follow the instructions in [Destinations](../destinations/) to add credentials for your chosen destination. This will ensure that your data is properly routed. 
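Building on the credential options above, here is a minimal sketch of passing a connection explicitly in code. It assumes the `ConnectionStringCredentials` class referenced in the pipeline example is importable from `dlt.sources.credentials`, and it reuses the public Rfam connection details only as placeholders; substitute your own driver, user, password, host, port, and database.

```python
from dlt.sources.credentials import ConnectionStringCredentials

# assemble the connection URL from its parts (same template as in "Grab credentials")
drivername, username, password = "mysql+pymysql", "rfamro", ""
host, port, database = "mysql-rfam-public.ebi.ac.uk", 4497, "Rfam"
connection_url = f"{drivername}://{username}:{password}@{host}:{port}/{database}"

# wrap the URL in a credentials object (or pass the raw string)
# when calling sql_database() / sql_table()
credentials = ConnectionStringCredentials(connection_url)
```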
For more information, read the [General Usage: Credentials.](../../general-usage/credentials) @@ -156,20 +181,23 @@ For more information, read the [General Usage: Credentials.](../../general-usage pip install -r requirements.txt ``` -1. Now the verified source can be run by using the command: +1. Run the verified source by entering: ```bash python sql_database_pipeline.py ``` -1. To make sure that everything is loaded as expected, use the command: +1. Make sure that everything is loaded as expected with: ```bash dlt pipeline show ``` - For example, the pipeline_name for the above pipeline example is `rfam`, you may also use any + :::note + The pipeline_name for the above example is `rfam`, you may also use any custom name instead. + ::: + ## Sources and resources @@ -179,7 +207,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage ### Source `sql_database`: This function loads data from an SQL database via SQLAlchemy and auto-creates resources for each -table or from a specified list. +table or from a specified list of tables. ```python @dlt.source @@ -191,13 +219,13 @@ def sql_database( ) -> Iterable[DltResource]: ``` -`credentials`: Database details or a 'sqlalchemy.Engine' instance. +`credentials`: Database details or an 'sqlalchemy.Engine' instance. `schema`: Database schema name (default if unspecified). -`metadata`: Optional, sqlalchemy.MetaData takes precedence over schema. +`metadata`: Optional SQLAlchemy.MetaData; takes precedence over schema. -`table_names`: List of tables to load. Defaults to all if not provided. +`table_names`: List of tables to load; defaults to all if not provided. ### Resource `sql_table` @@ -220,9 +248,9 @@ def sql_table( `table`: Table to load, set in code or default from "config.toml". -`schema`: Optional, name of table schema. +`schema`: Optional name of the table schema. -`metadata`: Optional, sqlalchemy.MetaData takes precedence over schema. +`metadata`: Optional SQLAlchemy.MetaData; takes precedence over schema. `incremental`: Optional, enables incremental loading. @@ -231,8 +259,7 @@ def sql_table( ## Customization ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods from this -verified source. +To create your own pipeline, use source and resource methods from this verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: @@ -244,7 +271,7 @@ verified source. ) ``` -1. You can pass credentials using any of the methods discussed above. +1. Pass your credentials using any of the methods [described above](#add-credentials). 1. To load the entire database, use the `sql_database` source as: @@ -254,9 +281,7 @@ verified source. print(info) ``` - > Use one method from the methods [described above](#add-credentials) to pass credentials. - -1. To load just the "family" table using the `sql_database` source: +1. If you just need the "family" table, use: ```python source = sql_database().with_resources("family") @@ -267,7 +292,7 @@ verified source. 1. To pseudonymize columns and hide personally identifiable information (PII), refer to the [documentation](https://dlthub.com/docs/general-usage/customising-pipelines/pseudonymizing_columns). - For example, to pseudonymize the "rfam_acc" column in the "family" table: + As an example, here's how to pseudonymize the "rfam_acc" column in the "family" table: ```python import hashlib @@ -299,7 +324,7 @@ verified source. print(info) ``` -1. To exclude the columns, for e.g. 
"rfam_id" column from the "family" table before loading: +1. To exclude columns, such as the "rfam_id" column from the "family" table before loading: ```python def remove_columns(doc): @@ -328,10 +353,7 @@ verified source. info = pipeline.run(source, write_disposition="merge") print(info) ``` - - > In this example, we load the "family" table and set the "updated" column for incremental - > loading. In the first run, it loads all the data from January 1, 2022, at midnight (00:00:00) and - > then loads incrementally in subsequent runs using "updated" field. + In this example, we load data from the `family` table, using the `updated` column for incremental loading. In the first run, the process loads all data starting from midnight (00:00:00) on January 1, 2022. Subsequent runs perform incremental loading, guided by the values in the `updated` field. 1. To incrementally load the "family" table using the 'sql_table' resource. @@ -347,20 +369,11 @@ verified source. print(info) ``` - > Loads all data from "family" table from January 1, 2022, at midnight (00:00:00) and then loads - > incrementally in subsequent runs using "updated" field. - - > 💡 Please note that to use merge write disposition a primary key must exist in the source table. - > `dlt` finds and sets up primary keys automatically. + This process initially loads all data from the `family` table starting at midnight on January 1, 2022. For later runs, it uses the `updated` field for incremental loading as well. - > 💡 `apply_hints` is a powerful method that allows to modify the schema of the resource after it - > was created: including the write disposition and primary keys. You are free to select many - > different tables and use `apply_hints` several times to have pipelines where some resources are - > merged, appended or replaced. + :::info + * For merge write disposition, the source table needs a primary key, which `dlt` automatically sets up. + * `apply_hints` is a powerful method that enables schema modifications after resource creation, like adjusting write disposition and primary keys. You can choose from various tables and use `apply_hints` multiple times to create pipelines with merged, appendend, or replaced resources. + ::: -1. Remember, to maintain the same pipeline name and destination dataset name. The pipeline name - retrieves the [state](https://dlthub.com/docs/general-usage/state) from the last run, essential - for incremental data loading. Changing these names might trigger a - [“full_refresh”](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-full-refresh), - disrupting metadata tracking for - [incremental loads](https://dlthub.com/docs/general-usage/incremental-loading). +1. Remember to keep the pipeline name and destination dataset name consistent. The pipeline name is crucial for retrieving the [state](https://dlthub.com/docs/general-usage/state) from the last run, which is essential for incremental loading. Altering these names could initiate a "[full_refresh](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-full-refresh)", interfering with the metadata tracking necessary for [incremental loads](https://dlthub.com/docs/general-usage/incremental-loading). 
diff --git a/docs/website/docs/examples/nested_data/index.md b/docs/website/docs/examples/nested_data/index.md index dd5db5009c..e479aede3f 100644 --- a/docs/website/docs/examples/nested_data/index.md +++ b/docs/website/docs/examples/nested_data/index.md @@ -62,13 +62,13 @@ def mongodb_collection( write_disposition: Optional[str] = dlt.config.value, ) -> Any: # set up mongo client - client = MongoClient(connection_url, uuidRepresentation="standard", tz_aware=True) + client: Any = MongoClient(connection_url, uuidRepresentation="standard", tz_aware=True) mongo_database = client.get_default_database() if not database else client[database] collection_obj = mongo_database[collection] def collection_documents( - client, - collection, + client: Any, + collection: Any, incremental: Optional[dlt.sources.incremental[Any]] = None, ) -> Iterator[TDataItem]: LoaderClass = CollectionLoader From e970741291aa8ae63bd0ad213e36047d704e1552 Mon Sep 17 00:00:00 2001 From: Simon Bumm <41942954+codingcyclist@users.noreply.github.com> Date: Fri, 17 Nov 2023 19:49:03 +0100 Subject: [PATCH 3/6] Autodetector for ISO date strings (#767) * Feat: iso_date auto-detection function * Feat: tests for iso_date auto-detection * Feat: make iso_date autodetection a default behavior * NB: update docs * fixup! Feat: tests for iso_date auto-detection * NB: Fix makefile * Fix: linting * Fix: support dates with reduced precision * fixup! Fix: support dates with reduced precision * Fix: don't make iso-date a default auto-detector --------- Co-authored-by: Skynet --- Makefile | 2 +- dlt/common/schema/detections.py | 19 +++++++++++ dlt/common/schema/typing.py | 2 +- .../schemas/dlt_quickstart.schema.yaml | 1 + .../schemas/dlt_quickstart.schema.yaml | 1 + docs/technical/working_with_schemas.md | 1 + docs/website/docs/general-usage/schema.md | 1 + .../cases/schemas/github/issues.schema.json | 5 +-- .../sheets/google_spreadsheet_v4.schema.json | 5 +-- tests/common/schema/test_detections.py | 34 ++++++++++++++++++- 10 files changed, 64 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index bd522c9ba3..85f67818ac 100644 --- a/Makefile +++ b/Makefile @@ -23,7 +23,7 @@ help: @echo " runs flake and mypy" @echo " test" @echo " tests all the components including destinations" - @echo " test-local" + @echo " test-load-local" @echo " tests all components unsing local destinations: duckdb and postgres" @echo " test-common" @echo " tests common components" diff --git a/dlt/common/schema/detections.py b/dlt/common/schema/detections.py index 574cb44c93..207c934091 100644 --- a/dlt/common/schema/detections.py +++ b/dlt/common/schema/detections.py @@ -36,6 +36,25 @@ def is_iso_timestamp(t: Type[Any], v: Any) -> Optional[TDataType]: return None +def is_iso_date(t: Type[Any], v: Any) -> Optional[TDataType]: + # only strings can be converted + if not issubclass(t, str): + return None + if not v: + return None + # don't cast iso timestamps as dates + if is_iso_timestamp(t,v): + return None + # strict autodetection of iso timestamps + try: + dtv = parse_iso_like_datetime(v) + if isinstance(dtv, datetime.date): + return "date" + except Exception: + pass + return None + + def is_large_integer(t: Type[Any], v: Any) -> Optional[TDataType]: # only ints can be converted if issubclass(t, int): diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 2cc057560c..ac17f0ae9f 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -25,7 +25,7 @@ """Known hints of a column used to declare hint regexes.""" 
TWriteDisposition = Literal["skip", "append", "replace", "merge"] TTableFormat = Literal["iceberg"] -TTypeDetections = Literal["timestamp", "iso_timestamp", "large_integer", "hexbytes_to_text", "wei_to_double"] +TTypeDetections = Literal["timestamp", "iso_timestamp", "iso_date", "large_integer", "hexbytes_to_text", "wei_to_double"] TTypeDetectionFunc = Callable[[Type[Any], Any], Optional[TDataType]] TColumnNames = Union[str, Sequence[str]] """A string representing a column name or a list of""" diff --git a/docs/examples/archive/examples/schemas/dlt_quickstart.schema.yaml b/docs/examples/archive/examples/schemas/dlt_quickstart.schema.yaml index 01cdb7f4ea..436aad7530 100644 --- a/docs/examples/archive/examples/schemas/dlt_quickstart.schema.yaml +++ b/docs/examples/archive/examples/schemas/dlt_quickstart.schema.yaml @@ -109,6 +109,7 @@ settings: detections: - timestamp - iso_timestamp + - iso_date default_hints: not_null: - _dlt_id diff --git a/docs/examples/archive/schemas/dlt_quickstart.schema.yaml b/docs/examples/archive/schemas/dlt_quickstart.schema.yaml index 003868bb96..3994ef7433 100644 --- a/docs/examples/archive/schemas/dlt_quickstart.schema.yaml +++ b/docs/examples/archive/schemas/dlt_quickstart.schema.yaml @@ -84,6 +84,7 @@ normalizers: detections: - timestamp - iso_timestamp + - iso_date names: dlt.common.normalizers.names.snake_case json: module: dlt.common.normalizers.json.relational diff --git a/docs/technical/working_with_schemas.md b/docs/technical/working_with_schemas.md index d8f048c172..d94edb8727 100644 --- a/docs/technical/working_with_schemas.md +++ b/docs/technical/working_with_schemas.md @@ -124,6 +124,7 @@ settings: detections: - timestamp - iso_timestamp + - iso_date ``` ⛔ we may define `all_text` function that will generate string only schemas by telling `dlt` that all types should be coerced to strings. 
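Since `iso_date` is deliberately not a default detector and only contributes when listed under `settings.detections` (as in the schema files above), a quick sanity check of its behaviour may help. The assertions below are taken from the new tests in this patch; `autodetect_sc_type` is the helper the schema uses to apply the configured detectors in order.

```python
from dlt.common.schema.detections import is_iso_date
from dlt.common.schema.utils import autodetect_sc_type

# full and reduced-precision ISO 8601 dates are recognized
assert is_iso_date(str, "1975-05-21") == "date"
assert is_iso_date(str, "1975-05") == "date"
# ISO timestamps and non-ISO formats are left alone
assert is_iso_date(str, "2022-06-01T00:48:35.040Z") is None
assert is_iso_date(str, "1975/05/01") is None

# the detector only applies when enabled in the detections list
assert autodetect_sc_type(["iso_date"], str, "1975-05-21") == "date"
assert autodetect_sc_type(None, str, "1975-05-21") is None
```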
diff --git a/docs/website/docs/general-usage/schema.md b/docs/website/docs/general-usage/schema.md index 3e690634f3..13347b952b 100644 --- a/docs/website/docs/general-usage/schema.md +++ b/docs/website/docs/general-usage/schema.md @@ -182,6 +182,7 @@ settings: detections: - timestamp - iso_timestamp + - iso_date ``` ### Column hint rules diff --git a/tests/common/cases/schemas/github/issues.schema.json b/tests/common/cases/schemas/github/issues.schema.json index 2760a20db0..4c4f5425ae 100644 --- a/tests/common/cases/schemas/github/issues.schema.json +++ b/tests/common/cases/schemas/github/issues.schema.json @@ -1294,7 +1294,8 @@ "settings": { "detections": [ "timestamp", - "iso_timestamp" + "iso_timestamp", + "iso_date" ], "default_hints": { "not_null": [ @@ -1318,4 +1319,4 @@ "module": "dlt.common.normalizers.json.relational" } } -} \ No newline at end of file +} diff --git a/tests/common/cases/schemas/sheets/google_spreadsheet_v4.schema.json b/tests/common/cases/schemas/sheets/google_spreadsheet_v4.schema.json index b74a4a5c51..e3a1803371 100644 --- a/tests/common/cases/schemas/sheets/google_spreadsheet_v4.schema.json +++ b/tests/common/cases/schemas/sheets/google_spreadsheet_v4.schema.json @@ -387,11 +387,12 @@ "normalizers": { "detections": [ "timestamp", - "iso_timestamp" + "iso_timestamp", + "iso_date" ], "names": "dlt.common.normalizers.names.snake_case", "json": { "module": "dlt.common.normalizers.json.relational" } } -} \ No newline at end of file +} diff --git a/tests/common/schema/test_detections.py b/tests/common/schema/test_detections.py index 3a74c6f368..13cb09faec 100644 --- a/tests/common/schema/test_detections.py +++ b/tests/common/schema/test_detections.py @@ -2,7 +2,7 @@ from dlt.common import pendulum, Decimal, Wei from dlt.common.schema.utils import autodetect_sc_type -from dlt.common.schema.detections import is_hexbytes_to_text, is_timestamp, is_iso_timestamp, is_large_integer, is_wei_to_double, _FLOAT_TS_RANGE, _NOW_TS +from dlt.common.schema.detections import is_hexbytes_to_text, is_timestamp, is_iso_timestamp, is_iso_date, is_large_integer, is_wei_to_double, _FLOAT_TS_RANGE, _NOW_TS def test_timestamp_detection() -> None: @@ -34,6 +34,36 @@ def test_iso_timestamp_detection() -> None: assert is_iso_timestamp(float, str(pendulum.now())) is None +def test_iso_date_detection() -> None: + assert is_iso_date(str, str(pendulum.now().date())) == "date" + assert is_iso_date(str, "1975-05-21") == "date" + assert is_iso_date(str, "19750521") == "date" + + # ISO-8601 allows dates with reduced precision + assert is_iso_date(str, "1975-05") == "date" + assert is_iso_date(str, "1975") == "date" + + # dont auto-detect timestamps as dates + assert is_iso_date(str, str(pendulum.now())) is None + assert is_iso_date(str, "1975-05-21T22:00:00Z") is None + assert is_iso_date(str, "2022-06-01T00:48:35.040Z") is None + assert is_iso_date(str, "1975-0521T22:00:00Z") is None + assert is_iso_date(str, "2021-07-24 10:51") is None + + # times are not accepted + assert is_iso_date(str, "22:00:00") is None + # wrong formats + assert is_iso_date(str, "197505") is None + assert is_iso_date(str, "0-05-01") is None + assert is_iso_date(str, "") is None + assert is_iso_date(str, "75") is None + assert is_iso_date(str, "01-12") is None + assert is_iso_date(str, "1975/05/01") is None + + # wrong type + assert is_iso_date(float, str(pendulum.now().date())) is None + + def test_detection_large_integer() -> None: assert is_large_integer(str, "A") is None assert is_large_integer(int, 2**64 // 2) == "wei" 
@@ -56,6 +86,8 @@ def test_detection_function() -> None: assert autodetect_sc_type(None, str, str(pendulum.now())) is None assert autodetect_sc_type(["iso_timestamp"], str, str(pendulum.now())) == "timestamp" assert autodetect_sc_type(["iso_timestamp"], float, str(pendulum.now())) is None + assert autodetect_sc_type(["iso_date"], str, str(pendulum.now().date())) == "date" + assert autodetect_sc_type(["iso_date"], float, str(pendulum.now().date())) is None assert autodetect_sc_type(["timestamp"], str, str(pendulum.now())) is None assert autodetect_sc_type(["timestamp", "iso_timestamp"], float, pendulum.now().timestamp()) == "timestamp" assert autodetect_sc_type(["timestamp", "large_integer"], int, 2**64) == "wei" From b743b98ecf2be4c4baec2b76ad010fb8106cd9e9 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Sat, 18 Nov 2023 18:20:24 +0100 Subject: [PATCH 4/6] pipeline drop pending packages (#771) * adds debug flag to cli to show exceptions * handles empty pipelines in drop command * adds method to drop pending packages + cli * adds docs --- dlt/cli/_dlt.py | 57 ++- dlt/cli/pipeline_command.py | 40 +- dlt/common/storages/file_storage.py | 2 +- dlt/common/storages/load_storage.py | 35 +- dlt/destinations/dummy/configuration.py | 3 + dlt/destinations/dummy/dummy.py | 15 +- dlt/load/load.py | 4 +- dlt/pipeline/exceptions.py | 8 + dlt/pipeline/helpers.py | 5 +- dlt/pipeline/pipeline.py | 18 +- docs/technical/secrets_and_config.md | 436 ------------------ .../credentials/config_providers.md | 2 +- .../general-usage/credentials/config_specs.md | 2 +- .../credentials/configuration.md | 13 +- .../docs/reference/command-line-interface.md | 16 + .../cases/deploy_pipeline/dummy_pipeline.py | 20 + tests/cli/common/test_cli_invoke.py | 43 +- tests/cli/test_pipeline_command.py | 41 +- tests/common/storages/test_loader_storage.py | 48 +- tests/load/pipeline/test_drop.py | 17 +- tests/load/test_dummy_client.py | 10 +- tests/normalize/test_normalize.py | 6 +- tests/pipeline/test_pipeline.py | 26 ++ 23 files changed, 348 insertions(+), 519 deletions(-) delete mode 100644 docs/technical/secrets_and_config.md create mode 100644 tests/cli/cases/deploy_pipeline/dummy_pipeline.py diff --git a/dlt/cli/_dlt.py b/dlt/cli/_dlt.py index f719c30de0..dfda2966b9 100644 --- a/dlt/cli/_dlt.py +++ b/dlt/cli/_dlt.py @@ -25,13 +25,22 @@ pass +DEBUG_FLAG = False + + +def on_exception(ex: Exception, info: str) -> None: + click.secho(str(ex), err=True, fg="red") + fmt.note("Please refer to %s for further assistance" % fmt.bold(info)) + if DEBUG_FLAG: + raise ex + + @utils.track_command("init", False, "source_name", "destination_name") def init_command_wrapper(source_name: str, destination_name: str, use_generic_template: bool, repo_location: str, branch: str) -> int: try: init_command(source_name, destination_name, use_generic_template, repo_location, branch) except Exception as ex: - click.secho(str(ex), err=True, fg="red") - fmt.note("Please refer to %s for further assistance" % fmt.bold(DLT_INIT_DOCS_URL)) + on_exception(ex, DLT_INIT_DOCS_URL) return -1 return 0 @@ -41,8 +50,7 @@ def list_verified_sources_command_wrapper(repo_location: str, branch: str) -> in try: list_verified_sources_command(repo_location, branch) except Exception as ex: - click.secho(str(ex), err=True, fg="red") - fmt.note("Please refer to %s for further assistance" % fmt.bold(DLT_INIT_DOCS_URL)) + on_exception(ex, DLT_INIT_DOCS_URL) return -1 return 0 @@ -66,9 +74,8 @@ def deploy_command_wrapper(pipeline_script_path: str, deployment_method: str, re **kwargs ) 
except (CannotRestorePipelineException, PipelineWasNotRun) as ex: - click.secho(str(ex), err=True, fg="red") fmt.note("You must run the pipeline locally successfully at least once in order to deploy it.") - fmt.note("Please refer to %s for further assistance" % fmt.bold(DLT_DEPLOY_DOCS_URL)) + on_exception(ex, DLT_DEPLOY_DOCS_URL) return -2 except InvalidGitRepositoryError: click.secho( @@ -89,10 +96,8 @@ def deploy_command_wrapper(pipeline_script_path: str, deployment_method: str, re ) return -4 except Exception as ex: - click.secho(str(ex), err=True, fg="red") - fmt.note("Please refer to %s for further assistance" % fmt.bold(DLT_DEPLOY_DOCS_URL)) + on_exception(ex, DLT_DEPLOY_DOCS_URL) return -5 - # TODO: display stack trace if with debug flag return 0 @@ -106,10 +111,10 @@ def pipeline_command_wrapper( except CannotRestorePipelineException as ex: click.secho(str(ex), err=True, fg="red") click.secho("Try command %s to restore the pipeline state from destination" % fmt.bold(f"dlt pipeline {pipeline_name} sync")) - return 1 + return -1 except Exception as ex: - click.secho(str(ex), err=True, fg="red") - return 1 + on_exception(ex, DLT_PIPELINE_COMMAND_DOCS_URL) + return -2 @utils.track_command("schema", False, "operation") @@ -133,8 +138,7 @@ def telemetry_status_command_wrapper() -> int: try: telemetry_status_command() except Exception as ex: - click.secho(str(ex), err=True, fg="red") - fmt.note("Please refer to %s for further assistance" % fmt.bold(DLT_TELEMETRY_DOCS_URL)) + on_exception(ex, DLT_TELEMETRY_DOCS_URL) return -1 return 0 @@ -144,8 +148,7 @@ def telemetry_change_status_command_wrapper(enabled: bool) -> int: try: change_telemetry_status_command(enabled) except Exception as ex: - click.secho(str(ex), err=True, fg="red") - fmt.note("Please refer to %s for further assistance" % fmt.bold(DLT_TELEMETRY_DOCS_URL)) + on_exception(ex, DLT_TELEMETRY_DOCS_URL) return -1 return 0 @@ -186,12 +189,28 @@ def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespac fmt.ALWAYS_CHOOSE_DEFAULT = True +class DebugAction(argparse.Action): + def __init__(self, option_strings: Sequence[str], dest: Any = argparse.SUPPRESS, default: Any = argparse.SUPPRESS, help: str = None) -> None: # noqa + super(DebugAction, self).__init__( + option_strings=option_strings, + dest=dest, + default=default, + nargs=0, + help=help + ) + def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace, values: Any, option_string: str = None) -> None: + global DEBUG_FLAG + # will show stack traces (and maybe more debug things) + DEBUG_FLAG = True + + def main() -> int: parser = argparse.ArgumentParser(description="Creates, adds, inspects and deploys dlt pipelines.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--version', action="version", version='%(prog)s {version}'.format(version=__version__)) parser.add_argument('--disable-telemetry', action=TelemetryAction, help="Disables telemetry before command is executed") parser.add_argument('--enable-telemetry', action=TelemetryAction, help="Enables telemetry before command is executed") parser.add_argument('--non-interactive', action=NonInteractiveAction, help="Non interactive mode. 
Default choices are automatically made for confirmations and prompts.") + parser.add_argument('--debug', action=DebugAction, help="Displays full stack traces on exceptions.") subparsers = parser.add_subparsers(dest="command") init_cmd = subparsers.add_parser("init", help="Creates a pipeline project in the current folder by adding existing verified source or creating a new one from template.") @@ -239,8 +258,6 @@ def main() -> int: pipe_cmd.add_argument("pipeline_name", nargs='?', help="Pipeline name") pipe_cmd.add_argument("--pipelines-dir", help="Pipelines working directory", default=None) pipe_cmd.add_argument("--verbose", "-v", action='count', default=0, help="Provides more information for certain commands.", dest="verbosity") - # pipe_cmd.add_argument("--dataset-name", help="Dataset name used to sync destination when local pipeline state is missing.") - # pipe_cmd.add_argument("--destination", help="Destination name used to sync when local pipeline state is missing.") pipeline_subparsers = pipe_cmd.add_subparsers(dest="operation", required=False) @@ -251,6 +268,7 @@ def main() -> int: pipeline_subparsers.add_parser("info", help="Displays state of the pipeline, use -v or -vv for more info") pipeline_subparsers.add_parser("show", help="Generates and launches Streamlit app with the loading status and dataset explorer") pipeline_subparsers.add_parser("failed-jobs", help="Displays information on all the failed loads in all completed packages, failed jobs and associated error messages") + pipeline_subparsers.add_parser("drop-pending-packages", help="Deletes all extracted and normalized packages including those that are partially loaded.") pipeline_subparsers.add_parser( "sync", help="Drops the local state of the pipeline and resets all the schemas and restores it from destination. 
The destination state, data and schemas are left intact.", @@ -290,6 +308,9 @@ def main() -> int: return pipeline_command_wrapper("list", "-", args.pipelines_dir, args.verbosity) else: command_kwargs = dict(args._get_kwargs()) + if not command_kwargs.get("pipeline_name"): + pipe_cmd.print_usage() + return -1 command_kwargs['operation'] = args.operation or "info" del command_kwargs["command"] del command_kwargs["list_pipelines"] diff --git a/dlt/cli/pipeline_command.py b/dlt/cli/pipeline_command.py index 52a9c8ffdc..b17981c1b1 100644 --- a/dlt/cli/pipeline_command.py +++ b/dlt/cli/pipeline_command.py @@ -1,5 +1,5 @@ import yaml -from typing import Any +from typing import Any, Sequence, Tuple import dlt from dlt.cli.exceptions import CliCommandException @@ -9,8 +9,7 @@ from dlt.common.runners import Venv from dlt.common.runners.stdout import iter_stdout from dlt.common.schema.utils import group_tables_by_resource, remove_defaults -from dlt.common.storages.file_storage import FileStorage -from dlt.common.typing import DictStrAny +from dlt.common.storages import FileStorage, LoadStorage from dlt.pipeline.helpers import DropCommand from dlt.pipeline.exceptions import CannotRestorePipelineException @@ -33,6 +32,8 @@ def pipeline_command(operation: str, pipeline_name: str, pipelines_dir: str, ver return try: + if verbosity > 0: + fmt.echo("Attaching to pipeline %s" % fmt.bold(pipeline_name)) p = dlt.attach(pipeline_name=pipeline_name, pipelines_dir=pipelines_dir) except CannotRestorePipelineException as e: if operation not in {"sync", "drop"}: @@ -52,6 +53,22 @@ def pipeline_command(operation: str, pipeline_name: str, pipelines_dir: str, ver if operation == "sync": return # No need to sync again + def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: + extracted_files = p.list_extracted_resources() + if extracted_files: + fmt.echo("Has %s extracted files ready to be normalized" % fmt.bold(str(len(extracted_files)))) + norm_packages = p.list_normalized_load_packages() + if norm_packages: + fmt.echo("Has %s load packages ready to be loaded with following load ids:" % fmt.bold(str(len(norm_packages)))) + for load_id in norm_packages: + fmt.echo(load_id) + # load first (oldest) package + first_package_info = p.get_load_package_info(norm_packages[0]) + if LoadStorage.is_package_partially_loaded(first_package_info): + fmt.warning("This package is partially loaded. 
Data in the destination may be modified.") + fmt.echo() + return extracted_files, norm_packages + fmt.echo("Found pipeline %s in %s" % (fmt.bold(p.pipeline_name), fmt.bold(p.pipelines_dir))) if operation == "show": @@ -102,15 +119,7 @@ def pipeline_command(operation: str, pipeline_name: str, pipelines_dir: str, ver fmt.echo("%s with %s table(s) and %s resource state slot(s)" % (fmt.bold(resource_name), fmt.bold(str(len(tables))), fmt.bold(str(res_state_slots)))) fmt.echo() fmt.echo("Working dir content:") - extracted_files = p.list_extracted_resources() - if extracted_files: - fmt.echo("Has %s extracted files ready to be normalized" % fmt.bold(str(len(extracted_files)))) - norm_packages = p.list_normalized_load_packages() - if norm_packages: - fmt.echo("Has %s load packages ready to be loaded with following load ids:" % fmt.bold(str(len(norm_packages)))) - for load_id in norm_packages: - fmt.echo(load_id) - fmt.echo() + _display_pending_packages() loaded_packages = p.list_completed_load_packages() if loaded_packages: fmt.echo("Has %s completed load packages with following load ids:" % fmt.bold(str(len(loaded_packages)))) @@ -148,6 +157,13 @@ def pipeline_command(operation: str, pipeline_name: str, pipelines_dir: str, ver else: fmt.echo("No failed jobs found") + if operation == "drop-pending-packages": + extracted_files, norm_packages = _display_pending_packages() + if len(extracted_files) == 0 and len(norm_packages) == 0: + fmt.echo("No pending packages found") + if fmt.confirm("Delete the above packages?", default=False): + p.drop_pending_packages(with_partial_loads=True) + fmt.echo("Pending packages deleted") if operation == "sync": if fmt.confirm("About to drop the local state of the pipeline and reset all the schemas. The destination state, data and schemas are left intact. 
Proceed?", default=False): diff --git a/dlt/common/storages/file_storage.py b/dlt/common/storages/file_storage.py index 006ff4843d..3c5a391200 100644 --- a/dlt/common/storages/file_storage.py +++ b/dlt/common/storages/file_storage.py @@ -125,7 +125,7 @@ def has_folder(self, relative_path: str) -> bool: return os.path.isdir(self.make_full_path(relative_path)) def list_folder_files(self, relative_path: str, to_root: bool = True) -> List[str]: - """List all files in ``relative_path`` folder + """List all files in `relative_path` folder Args: relative_path (str): A path to folder, relative to storage root diff --git a/dlt/common/storages/load_storage.py b/dlt/common/storages/load_storage.py index d034ef239a..d8eee9b8d6 100644 --- a/dlt/common/storages/load_storage.py +++ b/dlt/common/storages/load_storage.py @@ -27,7 +27,7 @@ # folders to manage load jobs in a single load package TJobState = Literal["new_jobs", "failed_jobs", "started_jobs", "completed_jobs"] -WORKING_FOLDERS = set(get_args(TJobState)) +WORKING_FOLDERS: Set[TJobState] = set(get_args(TJobState)) TLoadPackageState = Literal["normalized", "loaded", "aborted"] @@ -193,7 +193,7 @@ def write_temp_job_file(self, load_id: str, table_name: str, table: TTableSchema def load_package_schema(self, load_id: str) -> Schema: # load schema from a load package to be processed - schema_path = join(self.get_package_path(load_id), LoadStorage.SCHEMA_FILE_NAME) + schema_path = join(self.get_normalized_package_path(load_id), LoadStorage.SCHEMA_FILE_NAME) return self._load_schema(schema_path) def load_temp_schema(self, load_id: str) -> Schema: @@ -211,14 +211,16 @@ def save_temp_schema_updates(self, load_id: str, schema_update: TSchemaTables) - json.dump(schema_update, f) def commit_temp_load_package(self, load_id: str) -> None: - self.storage.rename_tree(load_id, self.get_package_path(load_id)) + self.storage.rename_tree(load_id, self.get_normalized_package_path(load_id)) - def list_packages(self) -> Sequence[str]: + def list_normalized_packages(self) -> Sequence[str]: + """Lists all packages that are normalized and will be loaded or are currently loaded""" loads = self.storage.list_folder_dirs(LoadStorage.NORMALIZED_FOLDER, to_root=False) # start from the oldest packages return sorted(loads) def list_completed_packages(self) -> Sequence[str]: + """List packages that are completely loaded""" loads = self.storage.list_folder_dirs(LoadStorage.LOADED_FOLDER, to_root=False) # start from the oldest packages return sorted(loads) @@ -264,7 +266,7 @@ def get_load_package_info(self, load_id: str) -> LoadPackageInfo: # check if package is completed or in process package_created_at: DateTime = None package_state: TLoadPackageState = "normalized" - package_path = self.get_package_path(load_id) + package_path = self.get_normalized_package_path(load_id) applied_update: TSchemaTables = {} if not self.storage.has_folder(package_path): package_path = self.get_completed_package_path(load_id) @@ -291,7 +293,7 @@ def get_load_package_info(self, load_id: str) -> LoadPackageInfo: return LoadPackageInfo(load_id, self.storage.make_full_path(package_path), package_state, schema.name, applied_update, package_created_at, all_jobs) def begin_schema_update(self, load_id: str) -> Optional[TSchemaTables]: - package_path = self.get_package_path(load_id) + package_path = self.get_normalized_package_path(load_id) if not self.storage.has_folder(package_path): raise FileNotFoundError(package_path) schema_update_file = join(package_path, LoadStorage.SCHEMA_UPDATES_FILE_NAME) @@ -303,7 
+305,7 @@ def begin_schema_update(self, load_id: str) -> Optional[TSchemaTables]: def commit_schema_update(self, load_id: str, applied_update: TSchemaTables) -> None: """Marks schema update as processed and stores the update that was applied at the destination""" - load_path = self.get_package_path(load_id) + load_path = self.get_normalized_package_path(load_id) schema_update_file = join(load_path, LoadStorage.SCHEMA_UPDATES_FILE_NAME) processed_schema_update_file = join(load_path, LoadStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME) # delete initial schema update @@ -344,7 +346,7 @@ def complete_job(self, load_id: str, file_name: str) -> str: return self._move_job(load_id, LoadStorage.STARTED_JOBS_FOLDER, LoadStorage.COMPLETED_JOBS_FOLDER, file_name) def complete_load_package(self, load_id: str, aborted: bool) -> None: - load_path = self.get_package_path(load_id) + load_path = self.get_normalized_package_path(load_id) has_failed_jobs = len(self.list_failed_jobs(load_id)) > 0 # delete completed jobs if self.config.delete_completed_jobs and not has_failed_jobs: @@ -367,7 +369,7 @@ def delete_completed_package(self, load_id: str) -> None: def wipe_normalized_packages(self) -> None: self.storage.delete_folder(self.NORMALIZED_FOLDER, recursively=True) - def get_package_path(self, load_id: str) -> str: + def get_normalized_package_path(self, load_id: str) -> str: return join(LoadStorage.NORMALIZED_FOLDER, load_id) def get_completed_package_path(self, load_id: str) -> str: @@ -378,7 +380,7 @@ def job_elapsed_time_seconds(self, file_path: str, now_ts: float = None) -> floa def _save_schema(self, schema: Schema, load_id: str) -> str: dump = json.dumps(schema.to_dict()) - schema_path = join(self.get_package_path(load_id), LoadStorage.SCHEMA_FILE_NAME) + schema_path = join(self.get_normalized_package_path(load_id), LoadStorage.SCHEMA_FILE_NAME) return self.storage.save(schema_path, dump) def _load_schema(self, schema_path: str) -> Schema: @@ -388,14 +390,14 @@ def _load_schema(self, schema_path: str) -> Schema: def _move_job(self, load_id: str, source_folder: TJobState, dest_folder: TJobState, file_name: str, new_file_name: str = None) -> str: # ensure we move file names, not paths assert file_name == FileStorage.get_file_name_from_file_path(file_name) - load_path = self.get_package_path(load_id) + load_path = self.get_normalized_package_path(load_id) dest_path = join(load_path, dest_folder, new_file_name or file_name) self.storage.atomic_rename(join(load_path, source_folder, file_name), dest_path) # print(f"{join(load_path, source_folder, file_name)} -> {dest_path}") return self.storage.make_full_path(dest_path) def _get_job_folder_path(self, load_id: str, folder: TJobState) -> str: - return join(self.get_package_path(load_id), folder) + return join(self.get_normalized_package_path(load_id), folder) def _get_job_file_path(self, load_id: str, folder: TJobState, file_name: str) -> str: return join(self._get_job_folder_path(load_id, folder), file_name) @@ -430,6 +432,15 @@ def build_job_file_name(self, table_name: str, file_id: str, retry_count: int = return fn + f".{format_spec.file_extension}" return fn + @staticmethod + def is_package_partially_loaded(package_info: LoadPackageInfo) -> bool: + """Checks if package is partially loaded - has jobs that are not new.""" + if package_info.state == "normalized": + pending_jobs: Sequence[TJobState] = ["new_jobs"] + else: + pending_jobs = ["completed_jobs", "failed_jobs"] + return sum(len(package_info.jobs[job_state]) for job_state in WORKING_FOLDERS if job_state 
not in pending_jobs) > 0 + @staticmethod def parse_job_file_name(file_name: str) -> ParsedLoadJobFileName: p = Path(file_name) diff --git a/dlt/destinations/dummy/configuration.py b/dlt/destinations/dummy/configuration.py index 79cbe3e41e..1a8072300c 100644 --- a/dlt/destinations/dummy/configuration.py +++ b/dlt/destinations/dummy/configuration.py @@ -20,6 +20,8 @@ class DummyClientConfiguration(DestinationClientConfiguration): fail_prob: float = 0.0 retry_prob: float = 0.0 completed_prob: float = 0.0 + exception_prob: float = 0.0 + """probability of exception when checking job status""" timeout: float = 10.0 fail_in_init: bool = True @@ -35,6 +37,7 @@ def __init__( fail_prob: float = None, retry_prob: float = None, completed_prob: float = None, + exception_prob: float = None, timeout: float = None, fail_in_init: bool = None, ) -> None: diff --git a/dlt/destinations/dummy/dummy.py b/dlt/destinations/dummy/dummy.py index c8cac05d3a..92827405ca 100644 --- a/dlt/destinations/dummy/dummy.py +++ b/dlt/destinations/dummy/dummy.py @@ -24,17 +24,20 @@ def __init__(self, file_name: str, config: DummyClientConfiguration) -> None: self._exception: str = None self.start_time: float = pendulum.now().timestamp() super().__init__(file_name) - # if config.fail_in_init: - s = self.state() - if s == "failed": - raise DestinationTerminalException(self._exception) - if s == "retry": - raise DestinationTransientException(self._exception) + if config.fail_in_init: + s = self.state() + if s == "failed": + raise DestinationTerminalException(self._exception) + if s == "retry": + raise DestinationTransientException(self._exception) def state(self) -> TLoadJobState: # this should poll the server for a job status, here we simulate various outcomes if self._status == "running": + c_r = random.random() + if self.config.exception_prob >= c_r: + raise DestinationTransientException("Dummy job status raised exception") n = pendulum.now().timestamp() if n - self.start_time > self.config.timeout: self._status = "failed" diff --git a/dlt/load/load.py b/dlt/load/load.py index d27274ffb1..beae130789 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -387,7 +387,7 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: logger.info("Running file loading") # get list of loads and order by name ASC to execute schema updates - loads = self.load_storage.list_packages() + loads = self.load_storage.list_normalized_packages() logger.info(f"Found {len(loads)} load packages") if len(loads) == 0: return TRunMetrics(True, 0) @@ -404,7 +404,7 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: with self.collector(f"Load {schema.name} in {load_id}"): self.load_single_package(load_id, schema) - return TRunMetrics(False, len(self.load_storage.list_packages())) + return TRunMetrics(False, len(self.load_storage.list_normalized_packages())) def get_load_info(self, pipeline: SupportsPipeline, started_at: datetime.datetime = None) -> LoadInfo: # TODO: LoadInfo should hold many datasets diff --git a/dlt/pipeline/exceptions.py b/dlt/pipeline/exceptions.py index 4b283a17e7..0289c07158 100644 --- a/dlt/pipeline/exceptions.py +++ b/dlt/pipeline/exceptions.py @@ -55,6 +55,14 @@ def __init__(self, pipeline_name: str, pipelines_dir: str) -> None: ) super().__init__(pipeline_name, msg) +class PipelineNeverRan(PipelineException): + def __init__(self, pipeline_name: str, pipelines_dir: str) -> None: + msg = ( + f" Operation failed because pipeline with name {pipeline_name} in working directory {pipelines_dir} was never run or never synced with 
destination. " + "Use `dlt pipeline sync` to synchronize." + ) + super().__init__(pipeline_name, msg) + class PipelineNotActive(PipelineException): def __init__(self, pipeline_name: str) -> None: diff --git a/dlt/pipeline/helpers.py b/dlt/pipeline/helpers.py index ef4fe70664..ebb85f5e23 100644 --- a/dlt/pipeline/helpers.py +++ b/dlt/pipeline/helpers.py @@ -11,7 +11,7 @@ from dlt.common.destination.reference import WithStagingDataset from dlt.destinations.exceptions import DatabaseUndefinedRelation -from dlt.pipeline.exceptions import PipelineStepFailed, PipelineHasPendingDataException +from dlt.pipeline.exceptions import PipelineNeverRan, PipelineStepFailed, PipelineHasPendingDataException from dlt.pipeline.typing import TPipelineStep from dlt.pipeline import Pipeline @@ -71,6 +71,9 @@ def __init__( if isinstance(state_paths, str): state_paths = [state_paths] + if not pipeline.default_schema_name: + raise PipelineNeverRan(pipeline.pipeline_name, pipeline.pipelines_dir) + self.schema = pipeline.schemas[schema_name or pipeline.default_schema_name].clone() self.schema_tables = self.schema.tables self.drop_tables = not state_only diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index e6e27afec7..b948ad8040 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -595,7 +595,7 @@ def has_data(self) -> bool: @property def has_pending_data(self) -> bool: """Tells if the pipeline contains any extracted files or pending load packages""" - return bool(self.list_normalized_load_packages() or self.list_extracted_resources()) + return len(self.list_normalized_load_packages()) > 0 or len(self.list_extracted_resources()) > 0 @property def schemas(self) -> SchemaStorage: @@ -623,7 +623,7 @@ def list_extracted_resources(self) -> Sequence[str]: def list_normalized_load_packages(self) -> Sequence[str]: """Returns a list of all load packages ids that are or will be loaded.""" - return self._get_load_storage().list_packages() + return self._get_load_storage().list_normalized_packages() def list_completed_load_packages(self) -> Sequence[str]: """Returns a list of all load package ids that are completely loaded""" @@ -637,6 +637,20 @@ def list_failed_jobs_in_package(self, load_id: str) -> Sequence[LoadJobInfo]: """List all failed jobs and associated error messages for a specified `load_id`""" return self._get_load_storage().get_load_package_info(load_id).jobs.get("failed_jobs", []) + def drop_pending_packages(self, with_partial_loads: bool = True) -> None: + """Deletes all extracted and normalized packages, including those that are partially loaded by default""" + # delete normalized packages + load_storage = self._get_load_storage() + for load_id in load_storage.list_normalized_packages(): + package_info = load_storage.get_load_package_info(load_id) + if LoadStorage.is_package_partially_loaded(package_info) and not with_partial_loads: + continue + package_path = load_storage.get_normalized_package_path(load_id) + load_storage.storage.delete_folder(package_path, recursively=True) + # delete extracted files + normalize_storage = self._get_normalize_storage() + normalize_storage.delete_extracted_files(normalize_storage.list_files_to_normalize_sorted()) + @with_schemas_sync def sync_schema(self, schema_name: str = None, credentials: Any = None) -> TSchemaTables: """Synchronizes the schema `schema_name` with the destination. 
If no name is provided, the default schema will be synchronized.""" diff --git a/docs/technical/secrets_and_config.md b/docs/technical/secrets_and_config.md deleted file mode 100644 index 423767293d..0000000000 --- a/docs/technical/secrets_and_config.md +++ /dev/null @@ -1,436 +0,0 @@ -# Secrets and Configs -marks features that are: - -⛔ not implemented, hard to add - -☮️ not implemented, easy to add - -## General Usage and an Example -The way config values and secrets are handled should promote correct behavior - -1. secret values should never be present in the pipeline code -2. pipeline may be reconfigured for production after it is deployed. deployed and local code should be identical -3. still it must be easy and intuitive - -For the source extractor function below (reads selected tab from google sheets) we can pass config values in following ways: - -```python - -import dlt - - -@dlt.source -def google_sheets(spreadsheet_id, tab_names=dlt.config.value, credentials=dlt.secrets.value, only_strings=False): - sheets = build('sheets', 'v4', credentials=Services.from_json(credentials)) - tabs = [] - for tab_name in tab_names: - data = sheets.get(spreadsheet_id, tab_name).execute().values() - tabs.append(dlt.resource(data, name=tab_name)) - return tabs - -# WRONG: provide all values directly - wrong but possible. secret values should never be present in the code! -google_sheets("23029402349032049", ["tab1", "tab2"], credentials={"private_key": ""}).run(destination="bigquery") - -# OPTION A: provide config values directly and secrets via automatic injection mechanism (see later) -# `credentials` value will be injected by the `source` decorator -# `spreadsheet_id` and `tab_names` take values from the arguments below -# `only_strings` will be injected by the source decorator or will get the default value False -google_sheets("23029402349032049", ["tab1", "tab2"]).run(destination="bigquery") - - -# OPTION B: use `dlt.secrets` and `dlt.config` to explicitly take those values from providers from the explicit keys -google_sheets(dlt.config["sheet_id"], dlt.config["my_section.tabs"], dlt.secrets["my_section.gcp_credentials"]).run(destination="bigquery") -``` - -> one of the principles is that configuration, credentials and secret values are may be passed explicitly as arguments to the functions. this makes the injection behavior optional. - -## Injection mechanism -Config and secret values are injected to the function arguments if the function is decorated with `@dlt.source` or `@dlt resource` (also `@with_config` which you can applu to any function - used havily in the dlt core) - -The signature of the function `google_sheets` is **explicitly accepting all the necessary configuration and secrets in its arguments**. During runtime, `dlt` tries to supply (`inject`) the required values via various config providers. The injection rules are: -1. if you call the decorated function, the arguments that are passed explicitly are **never injected** -this makes injection mechanism optional - -2. required arguments (ie. `spreadsheet_id`, `tab_names`) are not injected -3. arguments with default values are injected if present in config providers -4. arguments with the special default value `dlt.secrets.value` and `dlt.config.value` **must be injected** (or expicitly passed). If they are not found by the config providers the code raises exception. The code in the functions always receives those arguments. 
- -additionally `dlt.secrets.value` tells `dlt` that supplied value is a secret and it will be injected only from secure config providers - -## Passing config values and credentials explicitly - -```python -# OPTION B: use `dlt.secrets` and `dlt.config` to explicitly take those values from providers from the explicit keys -google_sheets(dlt.config["sheet_id"], dlt.config["tabs"], dlt.secrets["my_section.gcp_credentials"]).run(destination="bigquery") -``` - -[See example](/docs/examples/credentials/explicit.py) - - -## Typing the source and resource signatures - -You should type your function signatures! The effort is very low and it gives `dlt` much more information on what source/resource expects. -1. You'll never receive invalid type signatures -2. We can generate nice sample config and secret files for your source -3. You can request dictionaries or special values (ie. connection strings, service json) to be passed -4. ☮️ you can specify a set of possible types via `Union` ie. OAUTH or Api Key authorization - -```python -@dlt.source -def google_sheets(spreadsheet_id: str, tab_names: List[str] = dlt.config.value, credentials: GcpServiceAccountCredentials = dlt.secrets.value, only_strings: bool = False): - ... -``` -Now: -1. you are sure that you get a list of strings as `tab_names` -2. you will get actual google credentials (see `CredentialsConfiguration` later) and your users can pass them in many different forms. - -In case of `GcpServiceAccountCredentials` -* you may just pass the `service_json` as string or dictionary (in code and via config providers) -* you may pass a connection string (used in sql alchemy) (in code and via config providers) -* or default credentials will be used - - -## Providers -If function signature has arguments that may be injected, `dlt` looks for the argument values in providers. **The argument name is a key in the lookup**. In case of `google_sheets()` it will look for: `tab_names`, `credentials` and `strings_only`. - -Each provider has its own key naming convention and dlt is able to translate between them. - -Providers form a hierarchy. At the top are environment variables, then `secrets.toml` and `config.toml` files. Providers like google, aws, azure vaults can be inserted after the environment provider. - -For example if `spreadsheet_id` is in environment, dlt does not look into other providers. - -The values passed in the code explitly are the **highest** in provider hierarchy. -The default values of the arguments have the **lowest** priority in the provider hierarchy. - -> **Summary of the hierarchy** -> explicit args > env variables > ...vaults, airflow etc > secrets.toml > config.toml > default arg values - -Secrets are handled only by the providers supporting them. Some of the providers support only secrets (to reduce the number of requests done by `dlt` when searching sections) -1. `secrets.toml` and environment may hold both config and secret values -2. `config.toml` may hold only config values, no secrets -3. various vaults providers hold only secrets, `dlt` skips them when looking for values that are not secrets. - -⛔ Context aware providers will activate in right environments ie. on Airflow or AWS/GCP VMachines - -### Provider key formats. toml vs. environment variable - -Providers may use diffent formats for the keys. `dlt` will translate the standard format where sections and key names are separated by "." into the provider specific formats. - -1. for `toml` names are case sensitive and sections are separated with "." -2. 
for environment variables all names are capitalized and sections are separated with double underscore "__" - -Example: -When `dlt` evaluates the request `dlt.secrets["my_section.gcp_credentials"]` it must find the `private_key` for google credentials. It will look -1. first in env variable `MY_SECTION__GCP_CREDENTIALS__PRIVATE_KEY` and if not found -2. in `secrets.toml` with key `my_section.gcp_credentials.private_key` - - -### Environment provider -Looks for the values in the environment variables - -### Toml provider -Tomls provider uses two `toml` files: `secrets.toml` to store secrets and `config.toml` to store configuration values. The default `.gitignore` file prevents secrets from being added to source control and pushed. The `config.toml` may be freely added. - -**Toml provider always loads those files from `.dlt` folder** which is looked **relative to the current working directory**. Example: -if your working dir is `my_dlt_project` and you have: -``` -my_dlt_project: - | - pipelines/ - |---- .dlt/secrets.toml - |---- google_sheets.py -``` -in it and you run `python pipelines/google_sheets.py` then `dlt` will look for `secrets.toml` in `my_dlt_project/.dlt/secrets.toml` and ignore the existing `my_dlt_project/pipelines/.dlt/secrets.toml` - -if you change your working dir to `pipelines` and run `python google_sheets.py` it will look for `my_dlt_project/pipelines/.dlt/secrets.toml` a (probably) expected. - -*that was common problem on our workshop - but believe me all other layouts are even worse I've tried* - - -## Secret and config values layout. -`dlt` uses an layout of hierarchical sections to organize the config and secret values. This makes configurations and secrets easy to manage and disambiguates values with the same keys by placing them in the different sections - -> if you know how `toml` files are organized -> this is the same concept! - -> a lot of config values are dictionaries themselves (ie. most of the credentials) and you want the values corresponding to one component to be close together. - -> you can have a separate credentials for your destinations and each of source your pipeline uses, if you have many pipelines in single project, you can have a separate sections corresponding to them. - -Here is the simplest default layout for our `google_sheets` example. - -### OPTION A (default layout) - -**secrets.toml** -```toml -[credentials] -client_email = -private_key = -project_id = -``` -**config.toml** -```toml -tab_names=["tab1", "tab2"] -``` - -As you can see the details of gcp credentials are placed under `credentials` which is argument name to source function - -### OPTION B (explicit layout) - -Here user has full control over the layout - -**secrets.toml** -```toml -[my_section] - - [my_section.gcp_credentials] - client_email = - private_key = -``` -**config.toml** -```toml -[my_section] -tabs=["tab1", "tab2"] - - [my_section.gcp_credentials] - project_id = # I prefer to keep my project id in config file and private key in secrets -``` - -### Default layout and default key lookup during injection - -`dlt` arranges the sections into **default layout** that is used by injection mechanism. This layout makes it easy to configure simple cases but also provides a room for more explicit sections and complex cases ie. having several soures with different credentials or even hosting several pipelines in the same project sharing the same config and credentials. 
- -``` -pipeline_name - | - |-sources - |- - |- - |- {all source and resource options and secrets} - |- - |- {all source and resource options and secrets} - |- - |... - - |-extract - |- extract options for resources ie. parallelism settings, maybe retries - |-destination - |- - |- {destination options} - |-credentials - |-{credentials options} - |-schema - |- - |-schema settings: not implemented but I'll let people set nesting level, name convention, normalizer etc. here - |-load - |-normalize -``` - -Lookup rules: - -**Rule 1** All the sections above are optional. You are free to arrange your credentials and config without any additional sections -Example: OPTION A (default layout) - -**Rule 2** The lookup starts with the most specific possible path and if value is not found there, it removes the right-most section and tries again. -Example: In case of option A we have just one credentials. But what if `bigquery` credentials are different from `google sheets`? Then we need to allow some sections to separate them. - -```toml -# google sheet credentials -[credentials] -client_email = -private_key = -project_id = - -# bigquery credentials -[destination.credentials] -client_email = -private_key = -project_id = -``` -Now when `dlt` looks for destination credentials, it will encounter the `destination` section and stop there. -When looking for `sources` credentials it will get directly into `credentials` key (corresponding to function argument) - -> we could also rename the argument in the source function! but then we are **forcing** the user to have two copies of credentials. - -Example: let's be even more explicit and use full section path possible -```toml -# google sheet credentials -[sources.google_sheets.credentials] -client_email = -private_key = -project_id = - -# bigquery credentials -[destination.bigquery.credentials] -client_email = -private_key = -project_id = -``` -Where we add destination and source name to be very explicit. - -**Rule 3** You can use your pipeline name to have separate configurations for each pipeline in your project - -Pipeline created/obtained with `dlt.pipeline()` creates a global and optional namespace with the value of `pipeline_name`. All config values will be looked with pipeline name first and then again without it. - -Example: the pipeline is named `ML_sheets` -```toml -[ML_sheets.credentials] -client_email = -private_key = -project_id = -``` - -or maximum path: -```toml -[ML_sheets.sources.google_sheets.credentials] -client_email = -private_key = -project_id = -``` - -### The `sources` section -Config and secrets for decorated sources and resources are kept in `sources..` section. **All sections are optionsl**. For example if source module is named -`pipedrive` and the function decorated with `@dlt.source` is `deals(api_key: str=...)` then `dlt` will look for api key in: -1. `sources.pipedrive.deals.api_key` -2. `sources.pipedrive.api_key` -3. `sources.api_key` -4. `api_key` - -Step 2 in search path allows all the sources/resources in a module to share the same set of credentials. - -Also look at the following [test](/tests/extract/test_decorators.py) : `test_source_sections` - -## Understanding the exceptions -Now we can finally understand the `ConfigFieldMissingException`. Let's run `chess.py` example without providing the password: - -``` -$ CREDENTIALS="postgres://loader@localhost:5432/dlt_data" python chess.py -... 
-dlt.common.configuration.exceptions.ConfigFieldMissingException: Following fields are missing: ['password'] in configuration with spec PostgresCredentials - for field "password" config providers and keys were tried in following order: - In Environment Variables key CHESS_GAMES__DESTINATION__POSTGRES__CREDENTIALS__PASSWORD was not found. - In Environment Variables key CHESS_GAMES__DESTINATION__CREDENTIALS__PASSWORD was not found. - In Environment Variables key CHESS_GAMES__CREDENTIALS__PASSWORD was not found. - In secrets.toml key chess_games.destination.postgres.credentials.password was not found. - In secrets.toml key chess_games.destination.credentials.password was not found. - In secrets.toml key chess_games.credentials.password was not found. - In Environment Variables key DESTINATION__POSTGRES__CREDENTIALS__PASSWORD was not found. - In Environment Variables key DESTINATION__CREDENTIALS__PASSWORD was not found. - In Environment Variables key CREDENTIALS__PASSWORD was not found. - In secrets.toml key destination.postgres.credentials.password was not found. - In secrets.toml key destination.credentials.password was not found. - In secrets.toml key credentials.password was not found. -Please refer to https://dlthub.com/docs/general-usage/credentials for more information -``` - -It tells you exactly which paths `dlt` looked at, via which config providers and in which order. In the example above -1. First it looked in a big section `chess_games` which is name of the pipeline -2. In each case it starts with full paths and goes to minimum path `credentials.password` -3. First it looks into `environ` then in `secrets.toml`. It displays the exact keys tried. -4. Note that `config.toml` was skipped! It may not contain any secrets. - - -## Working with credentials (and other complex configuration values) - -`GcpServiceAccountCredentials` is an example of a **spec**: a Python `dataclass` that describes the configuration fields, their types and default values. It also allows to parse various native representations of the configuration. Credentials marked with `WithDefaults` mixin are also to instantiate itself from the machine/user default environment ie. googles `default()` or AWS `.aws/credentials`. - -As an example, let's use `ConnectionStringCredentials` which represents a database connection string. - -```python -@dlt.source -def query(sql: str, dsn: ConnectionStringCredentials = dlt.secrets.value): - ... -``` - -The source above executes the `sql` against database defined in `dsn`. `ConnectionStringCredentials` makes sure you get the correct values with correct types and understands the relevant native form of the credentials. 
- - -Example 1: use the dictionary form -```toml -[dsn] -database="dlt_data" -password="loader" -username="loader" -host="localhost" -``` - -Example:2: use the native form -```toml -dsn="postgres://loader:loader@localhost:5432/dlt_data" -``` - -Example 3: use mixed form: the password is missing in explicit dsn and will be taken from the `secrets.toml` -```toml -dsn.password="loader -``` -```python -query("SELECT * FROM customers", "postgres://loader@localhost:5432/dlt_data") -# or -query("SELECT * FROM customers", {"database": "dlt_data", "username": "loader"...}) -``` - -☮️ We will implement more credentials and let people reuse them when writing pipelines: -- to represent oauth credentials -- api key + api secret -- AWS credentials - - -### Working with alternatives of credentials (Union types) -If your source/resource allows for many authentication methods you can support those seamlessly for your user. The user just passes the right credentials and `dlt` will inject the right type into your decorated function. - -Example: - -> read the whole [test](/tests/common/configuration/test_spec_union.py), it shows how to create unions of credentials that derive from the common class so you can handle it seamlessly in your code. - -```python -@dlt.source -def zen_source(credentials: Union[ZenApiKeyCredentials, ZenEmailCredentials, str] = dlt.secrets.value, some_option: bool = False): - # depending on what the user provides in config, ZenApiKeyCredentials or ZenEmailCredentials will be injected in `credentials` argument - # both classes implement `auth` so you can always call it - credentials.auth() - return dlt.resource([credentials], name="credentials") - -# pass native value -os.environ["CREDENTIALS"] = "email:mx:pwd" -assert list(zen_source())[0].email == "mx" - -# pass explicit native value -assert list(zen_source("secret:🔑:secret"))[0].api_secret == "secret" - -# pass explicit dict -assert list(zen_source(credentials={"email": "emx", "password": "pass"}))[0].email == "emx" - -``` -> This applies not only to credentials but to all specs (see next chapter) - -## Writing own specs - -**specs** let you tak full control over the function arguments: -- which values should be injected, the types, default values. -- you can specify optional and final fields -- form hierarchical configurations (specs in specs). -- provide own handlers for `on_error` or `on_resolved` -- provide own native value parsers -- provide own default credentials logic -- adds all Python dataclass goodies to it -- adds all Python `dict` goodies to it (`specs` instances can be created from dicts and serialized from dicts) - -This is used a lot in the `dlt` core and may become useful for complicated sources. - -In fact for each decorated function a spec is synthesized. In case of `google_sheets` following class is created. -```python -@configspec -class GoogleSheetsConfiguration: - tab_names: List[str] = None # manadatory - credentials: GcpServiceAccountCredentials = None # mandatory secret - only_strings: Optional[bool] = False -``` - -> all specs derive from [BaseConfiguration](/dlt/common/configuration/specs//base_configuration.py) - -> all credentials derive from [CredentialsConfiguration](/dlt/common/configuration/specs//base_configuration.py) - -> Read the docstrings in the code above - -## Interesting / Advanced stuff. - -The approach above makes configs and secrets explicit and autogenerates required lookups. 
It lets me for example **generate deployments** and **code templates for pipeline scripts** automatically because I know what are the config parameters and I have total control over users code and final values via the decorator.
diff --git a/docs/website/docs/general-usage/credentials/config_providers.md b/docs/website/docs/general-usage/credentials/config_providers.md index b3da2979a9..1edd6a6e9a 100644 --- a/docs/website/docs/general-usage/credentials/config_providers.md +++ b/docs/website/docs/general-usage/credentials/config_providers.md @@ -1,6 +1,6 @@ --- title: Configuration Providers -description: Configuration dlt Providers +description: Where dlt looks for config/secrets and in which order. keywords: [credentials, secrets.toml, secrets, config, configuration, environment variables, provider] ---
diff --git a/docs/website/docs/general-usage/credentials/config_specs.md b/docs/website/docs/general-usage/credentials/config_specs.md index 328d18d2a0..07e56b3e14 100644 --- a/docs/website/docs/general-usage/credentials/config_specs.md +++ b/docs/website/docs/general-usage/credentials/config_specs.md @@ -1,6 +1,6 @@ --- title: Configuration Specs -description: Overview configuration specs and how to create custom specs +description: How to specify complex custom configurations keywords: [credentials, secrets.toml, secrets, config, configuration, environment variables, specs] ---
diff --git a/docs/website/docs/general-usage/credentials/configuration.md b/docs/website/docs/general-usage/credentials/configuration.md index a92fb6fd0c..4cb3e17468 100644 --- a/docs/website/docs/general-usage/credentials/configuration.md +++ b/docs/website/docs/general-usage/credentials/configuration.md @@ -1,6 +1,6 @@ --- title: Secrets and Configs -description: Overview secrets and configs +description: What are secrets and configs and how sources and destinations read them. keywords: [credentials, secrets.toml, secrets, config, configuration, environment variables] --- @@ -11,7 +11,7 @@ Secrets and configs are two types of sensitive and non-sensitive information use 1. **Configs**: - Configs refer to non-sensitive configuration data. These are settings, parameters, or options that define the behavior of a data pipeline. - - They can include things like file paths, database connection strings, API endpoints, or any other settings that affect the pipeline's behavior. + - They can include things like file paths, database hosts and timeouts, API endpoints, or any other settings that affect the pipeline's behavior. 2. **Secrets**: - Secrets are sensitive information that should be kept confidential, such as passwords, API keys, private keys, and other confidential data. - It's crucial to never hard-code secrets directly into the code, as it can pose a security risk. Instead, they should be stored securely and accessed via a secure mechanism. @@ -210,6 +210,15 @@ You can pass destination credentials and ignore the default lookup: pipeline = dlt.pipeline(destination="postgres", credentials=dlt.secrets["postgres_dsn"]) ``` +:::note +**dlt.config** and **dlt.secrets** can also be used as setters. For example: +```python +dlt.config["sheet_id"] = "23029402349032049" +dlt.secrets["destination.postgres.credentials"] = BaseHook.get_connection('postgres_dsn').extra +``` +This will mock the **toml** provider with the desired values.
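+A minimal read-back sketch (reusing the `sheet_id` key set above, and assuming the setter mocks the provider as described): values set this way are then found by the regular lookup:
+```python
+assert dlt.config["sheet_id"] == "23029402349032049"
+```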
+::: + + ## Injection mechanism Config and secret values are injected to the function arguments if the function is decorated with
diff --git a/docs/website/docs/reference/command-line-interface.md b/docs/website/docs/reference/command-line-interface.md index a2516a41de..d774d5faa6 100644 --- a/docs/website/docs/reference/command-line-interface.md +++ b/docs/website/docs/reference/command-line-interface.md @@ -236,3 +236,19 @@ dlt pipeline --list-pipelines This command lists all the pipelines executed on the local machine with their working data in the default pipelines folder. + +### Drop pending and partially loaded packages +```sh +dlt pipeline drop-pending-packages +``` +Removes all extracted and normalized packages in the pipeline's working dir. +`dlt` keeps extracted and normalized load packages in the pipeline's working directory. When the `run` method is called, it will attempt to normalize and load +pending packages first. The command above removes such packages. Note that **pipeline state** is not reverted to the state at which the deleted packages +were created. Using `dlt pipeline ... sync` is recommended if your destination supports state sync. + + +## Show stack traces +If the command fails and you want to see the full stack trace, add `--debug` just after the `dlt` executable. +```sh +dlt --debug pipeline github info +```
diff --git a/tests/cli/cases/deploy_pipeline/dummy_pipeline.py b/tests/cli/cases/deploy_pipeline/dummy_pipeline.py new file mode 100644 index 0000000000..48e13c35cd --- /dev/null +++ b/tests/cli/cases/deploy_pipeline/dummy_pipeline.py @@ -0,0 +1,20 @@ +import dlt + + +@dlt.resource +def example_resource(api_url=dlt.config.value, api_key=dlt.secrets.value, last_id=0): + yield [api_url, api_key, str(last_id), "param4", "param5"] + + +@dlt.source +def example_source(api_url=dlt.config.value, api_key=dlt.secrets.value, last_id = 0): + # return all the resources to be loaded + return example_resource(api_url, api_key, last_id) + + +if __name__ == '__main__': + p = dlt.pipeline(pipeline_name="dummy_pipeline", destination="dummy") + load_info = p.run( + example_source(last_id=819273998) + ) + print(load_info)
diff --git a/tests/cli/common/test_cli_invoke.py b/tests/cli/common/test_cli_invoke.py index 99f34eeaa7..e3a7676ad1 100644 --- a/tests/cli/common/test_cli_invoke.py +++ b/tests/cli/common/test_cli_invoke.py @@ -1,8 +1,13 @@ import os +import shutil +from subprocess import CalledProcessError +import pytest from pytest_console_scripts import ScriptRunner from unittest.mock import patch +import dlt from dlt.common.configuration.paths import get_dlt_data_dir +from dlt.common.runners.venv import Venv from dlt.common.utils import custom_environ, set_working_dir from dlt.common.pipeline import get_dlt_pipelines_dir @@ -35,7 +40,7 @@ def test_invoke_basic(script_runner: ScriptRunner) -> None: def test_invoke_list_pipelines(script_runner: ScriptRunner) -> None: result = script_runner.run(['dlt', 'pipeline', '--list-pipelines']) # directory does not exist (we point to TEST_STORAGE) - assert result.returncode == 1 + assert result.returncode == -2 # create empty os.makedirs(get_dlt_pipelines_dir()) @@ -43,11 +48,45 @@ def test_invoke_list_pipelines(script_runner: ScriptRunner) -> None: assert result.returncode == 0 assert "No pipelines found in" in result.stdout + +def test_invoke_pipeline(script_runner: ScriptRunner) -> None: # info on non existing pipeline result = script_runner.run(['dlt', 'pipeline', 'debug_pipeline', 'info']) - assert result.returncode == 1 + assert result.returncode ==
-1 assert "the pipeline was not found in" in result.stderr + # copy dummy pipeline + p = dlt.pipeline(pipeline_name="dummy_pipeline") + p._wipe_working_folder() + + shutil.copytree("tests/cli/cases/deploy_pipeline", TEST_STORAGE_ROOT, dirs_exist_ok=True) + + with set_working_dir(TEST_STORAGE_ROOT): + with custom_environ({"COMPLETED_PROB": "1.0", "DLT_DATA_DIR": get_dlt_data_dir()}): + venv = Venv.restore_current() + venv.run_script("dummy_pipeline.py") + # detailed output is asserted in test_pipeline_command + result = script_runner.run(['dlt', 'pipeline', 'dummy_pipeline', 'info']) + assert result.returncode == 0 + result = script_runner.run(['dlt', 'pipeline', 'dummy_pipeline', 'trace']) + assert result.returncode == 0 + result = script_runner.run(['dlt', 'pipeline', 'dummy_pipeline', 'failed-jobs']) + assert result.returncode == 0 + result = script_runner.run(['dlt', 'pipeline', 'dummy_pipeline', 'load-package']) + assert result.returncode == 0 + result = script_runner.run(['dlt', 'pipeline', 'dummy_pipeline', 'load-package', "NON EXISTENT"]) + assert result.returncode == -2 + try: + # use debug flag to raise an exception + result = script_runner.run(['dlt', '--debug', 'pipeline', 'dummy_pipeline', 'load-package', "NON EXISTENT"]) + # exception terminates command + assert result.returncode == 1 + assert "LoadPackageNotFound" in result.stderr + finally: + # reset debug flag so other tests may pass + from dlt.cli import _dlt + _dlt.DEBUG_FLAG = False + def test_invoke_init_chess_and_template(script_runner: ScriptRunner) -> None: with set_working_dir(TEST_STORAGE_ROOT):
diff --git a/tests/cli/test_pipeline_command.py b/tests/cli/test_pipeline_command.py index 1ffc0c66aa..401517f3c5 100644 --- a/tests/cli/test_pipeline_command.py +++ b/tests/cli/test_pipeline_command.py @@ -1,6 +1,7 @@ import io import os import contextlib +import pytest from subprocess import CalledProcessError import dlt @@ -142,7 +143,6 @@ def test_pipeline_command_failed_jobs(repo_dir: str, project_files: FileStorage) try: pipeline = dlt.attach(pipeline_name="chess_pipeline") - print(pipeline.working_dir) pipeline.drop() except Exception as e: print(e) @@ -168,3 +168,42 @@ def test_pipeline_command_failed_jobs(repo_dir: str, project_files: FileStorage) _out = buf.getvalue() # actual failed job data assert "JOB file type: jsonl" in _out + + +def test_pipeline_command_drop_partial_loads(repo_dir: str, project_files: FileStorage) -> None: + init_command.init_command("chess", "dummy", False, repo_dir) + + try: + pipeline = dlt.attach(pipeline_name="chess_pipeline") + pipeline.drop() + except Exception as e: + print(e) + + # now run the pipeline + os.environ["EXCEPTION_PROB"] = "1.0" + os.environ["FAIL_IN_INIT"] = "False" + os.environ["TIMEOUT"] = "1.0" + venv = Venv.restore_current() + with pytest.raises(CalledProcessError) as cpe: + print(venv.run_script("chess_pipeline.py")) + assert "Dummy job status raised exception" in cpe.value.stdout + + with io.StringIO() as buf, contextlib.redirect_stdout(buf): + pipeline_command.pipeline_command("info", "chess_pipeline", None, 1) + _out = buf.getvalue() + # one package is partially loaded + assert 'This package is partially loaded' in _out + print(_out) + + with io.StringIO() as buf, contextlib.redirect_stdout(buf): + with echo.always_choose(False, True): + pipeline_command.pipeline_command("drop-pending-packages", "chess_pipeline", None, 1) + _out = buf.getvalue() + assert 'Pending packages deleted' in _out + print(_out) + + with io.StringIO() as buf, contextlib.redirect_stdout(buf): +
pipeline_command.pipeline_command("drop-pending-packages", "chess_pipeline", None, 1) + _out = buf.getvalue() + assert 'No pending packages found' in _out + print(_out) \ No newline at end of file diff --git a/tests/common/storages/test_loader_storage.py b/tests/common/storages/test_loader_storage.py index 4f2d0193fe..1acfeb873b 100644 --- a/tests/common/storages/test_loader_storage.py +++ b/tests/common/storages/test_loader_storage.py @@ -26,12 +26,12 @@ def test_complete_successful_package(storage: LoadStorage) -> None: # should delete package in full storage.config.delete_completed_jobs = True load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}]) - assert storage.storage.has_folder(storage.get_package_path(load_id)) + assert storage.storage.has_folder(storage.get_normalized_package_path(load_id)) storage.complete_job(load_id, file_name) assert_package_info(storage, load_id, "normalized", "completed_jobs") storage.complete_load_package(load_id, False) # deleted from loading - assert not storage.storage.has_folder(storage.get_package_path(load_id)) + assert not storage.storage.has_folder(storage.get_normalized_package_path(load_id)) # has package assert storage.storage.has_folder(storage.get_completed_package_path(load_id)) assert storage.storage.has_file(os.path.join(storage.get_completed_package_path(load_id), LoadStorage.PACKAGE_COMPLETED_FILE_NAME)) @@ -47,7 +47,7 @@ def test_complete_successful_package(storage: LoadStorage) -> None: storage.complete_job(load_id, file_name) storage.complete_load_package(load_id, False) # deleted from loading - assert not storage.storage.has_folder(storage.get_package_path(load_id)) + assert not storage.storage.has_folder(storage.get_normalized_package_path(load_id)) # has load preserved assert storage.storage.has_folder(storage.get_completed_package_path(load_id)) assert storage.storage.has_file(os.path.join(storage.get_completed_package_path(load_id), LoadStorage.PACKAGE_COMPLETED_FILE_NAME)) @@ -59,22 +59,45 @@ def test_complete_successful_package(storage: LoadStorage) -> None: def test_wipe_normalized_packages(storage: LoadStorage) -> None: load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}]) - storage.wipe_normalized_packages() - assert not storage.storage.has_folder(storage.NORMALIZED_FOLDER) +def test_is_partially_loaded(storage: LoadStorage) -> None: + load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}], start_job=False) + info = storage.get_load_package_info(load_id) + # all jobs are new + assert LoadStorage.is_package_partially_loaded(info) is False + # start job + storage.start_job(load_id, file_name) + info = storage.get_load_package_info(load_id) + assert LoadStorage.is_package_partially_loaded(info) is True + # complete job + storage.complete_job(load_id, file_name) + info = storage.get_load_package_info(load_id) + assert LoadStorage.is_package_partially_loaded(info) is True + # must complete package + storage.complete_load_package(load_id, False) + info = storage.get_load_package_info(load_id) + assert LoadStorage.is_package_partially_loaded(info) is False + + # abort package + load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}]) + storage.complete_load_package(load_id, True) + info = storage.get_load_package_info(load_id) + assert LoadStorage.is_package_partially_loaded(info) is True + + def test_complete_package_failed_jobs(storage: LoadStorage) -> None: # loads with failed jobs are always persisted 
storage.config.delete_completed_jobs = True load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}]) - assert storage.storage.has_folder(storage.get_package_path(load_id)) + assert storage.storage.has_folder(storage.get_normalized_package_path(load_id)) storage.fail_job(load_id, file_name, "EXCEPTION") assert_package_info(storage, load_id, "normalized", "failed_jobs") storage.complete_load_package(load_id, False) # deleted from loading - assert not storage.storage.has_folder(storage.get_package_path(load_id)) + assert not storage.storage.has_folder(storage.get_normalized_package_path(load_id)) # present in completed loads folder assert storage.storage.has_folder(storage.get_completed_package_path(load_id)) # has completed loads @@ -105,7 +128,7 @@ def test_abort_package(storage: LoadStorage) -> None: # loads with failed jobs are always persisted storage.config.delete_completed_jobs = True load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}]) - assert storage.storage.has_folder(storage.get_package_path(load_id)) + assert storage.storage.has_folder(storage.get_normalized_package_path(load_id)) storage.fail_job(load_id, file_name, "EXCEPTION") assert_package_info(storage, load_id, "normalized", "failed_jobs") storage.complete_load_package(load_id, True) @@ -195,7 +218,7 @@ def test_process_schema_update(storage: LoadStorage) -> None: storage.commit_schema_update(load_id, applied_update) assert storage.begin_schema_update(load_id) is None # processed file exists - applied_update_path = os.path.join(storage.get_package_path(load_id), LoadStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME) + applied_update_path = os.path.join(storage.get_normalized_package_path(load_id), LoadStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME) assert storage.storage.has_file(applied_update_path) is True assert json.loads(storage.storage.load(applied_update_path)) == applied_update # verify info package @@ -237,7 +260,7 @@ def test_unknown_migration_path() -> None: LoadStorage(False, "jsonl", LoadStorage.ALL_SUPPORTED_FILE_FORMATS) -def start_loading_file(s: LoadStorage, content: Sequence[StrAny]) -> Tuple[str, str]: +def start_loading_file(s: LoadStorage, content: Sequence[StrAny], start_job: bool = True) -> Tuple[str, str]: load_id = uniq_id() s.create_temp_load_package(load_id) # write test file @@ -247,8 +270,9 @@ def start_loading_file(s: LoadStorage, content: Sequence[StrAny]) -> Tuple[str, s.save_temp_schema_updates(load_id, {}) s.commit_temp_load_package(load_id) assert_package_info(s, load_id, "normalized", "new_jobs") - s.start_job(load_id, file_name) - assert_package_info(s, load_id, "normalized", "started_jobs") + if start_job: + s.start_job(load_id, file_name) + assert_package_info(s, load_id, "normalized", "started_jobs") return load_id, file_name diff --git a/tests/load/pipeline/test_drop.py b/tests/load/pipeline/test_drop.py index a2714674be..2a20db62b4 100644 --- a/tests/load/pipeline/test_drop.py +++ b/tests/load/pipeline/test_drop.py @@ -1,3 +1,4 @@ +import os from typing import Any, Iterator, Dict, Any, List from unittest import mock from itertools import chain @@ -9,7 +10,7 @@ from dlt.common.utils import uniq_id from dlt.pipeline import helpers, state_sync, Pipeline from dlt.load import Load -from dlt.pipeline.exceptions import PipelineStepFailed +from dlt.pipeline.exceptions import PipelineHasPendingDataException, PipelineNeverRan, PipelineStepFailed from dlt.destinations.job_client_impl import SqlJobClientBase from tests.load.pipeline.utils import 
destinations_configs, DestinationTestConfiguration @@ -186,7 +187,7 @@ def test_fail_after_drop_tables(destination_config: DestinationTestConfiguration @pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) def test_load_step_fails(destination_config: DestinationTestConfiguration) -> None: - """Test idempotency. pipeline.load() fails. Command can be run again successfully""" + """Test idempotence. pipeline.load() fails. Command can be run again successfully""" source = droppable_source() pipeline = destination_config.setup_pipeline('drop_test_' + uniq_id(), full_refresh=True) pipeline.run(source) @@ -292,3 +293,15 @@ def test_drop_state_only(destination_config: DestinationTestConfiguration) -> No assert_dropped_resource_tables(attached, []) # No tables dropped assert_dropped_resource_states(attached, ['droppable_a', 'droppable_b']) assert_destination_state_loaded(attached) + + +def test_drop_first_run_and_pending_packages() -> None: + """Attempts to drop before pipeline runs and when partial loads happen""" + pipeline = dlt.pipeline('drop_test_' + uniq_id(), destination="dummy") + with pytest.raises(PipelineNeverRan): + helpers.drop(pipeline, "droppable_a") + os.environ["COMPLETED_PROB"] = "1.0" + pipeline.run(droppable_source().with_resources("droppable_a")) + pipeline.extract(droppable_source().with_resources("droppable_b")) + with pytest.raises(PipelineHasPendingDataException): + helpers.drop(pipeline, "droppable_a") \ No newline at end of file diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index e7e0166177..1216906967 100644 --- a/tests/load/test_dummy_client.py +++ b/tests/load/test_dummy_client.py @@ -184,7 +184,7 @@ def test_spool_job_failed_exception_init() -> None: def test_spool_job_failed_exception_complete() -> None: # this config fails job on start - os.environ["RAISE_ON_FAILED_JOBS"] = "true" + os.environ["LOAD__RAISE_ON_FAILED_JOBS"] = "true" os.environ["FAIL_IN_INIT"] = "false" load = setup_loader(client_config=DummyClientConfiguration(fail_prob=1.0)) load_id, _ = prepare_load_package( @@ -340,7 +340,7 @@ def test_retry_on_new_loop() -> None: assert len(files) == 0 # complete package load.run(pool) - assert not load.load_storage.storage.has_folder(load.load_storage.get_package_path(load_id)) + assert not load.load_storage.storage.has_folder(load.load_storage.get_normalized_package_path(load_id)) # parse the completed job names completed_path = load.load_storage.get_completed_package_path(load_id) for fn in load.load_storage.storage.list_folder_files(os.path.join(completed_path, LoadStorage.COMPLETED_JOBS_FOLDER)): @@ -382,7 +382,7 @@ def test_load_single_thread() -> None: metrics = load.run(None) while metrics.pending_items > 0: metrics = load.run(None) - assert not load.load_storage.storage.has_folder(load.load_storage.get_package_path(load_id)) + assert not load.load_storage.storage.has_folder(load.load_storage.get_normalized_package_path(load_id)) def test_wrong_writer_type() -> None: @@ -417,11 +417,11 @@ def assert_complete_job(load: Load, storage: FileStorage, should_delete_complete with ThreadPoolExecutor() as pool: load.run(pool) # did process schema update - assert storage.has_file(os.path.join(load.load_storage.get_package_path(load_id), LoadStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME)) + assert storage.has_file(os.path.join(load.load_storage.get_normalized_package_path(load_id), LoadStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME)) # will finalize the whole package load.run(pool) 
# moved to loaded - assert not storage.has_folder(load.load_storage.get_package_path(load_id)) + assert not storage.has_folder(load.load_storage.get_normalized_package_path(load_id)) completed_path = load.load_storage._get_job_folder_completed_path(load_id, "completed_jobs") if should_delete_completed: # package was deleted diff --git a/tests/normalize/test_normalize.py b/tests/normalize/test_normalize.py index 2484b1ea61..12b6267a59 100644 --- a/tests/normalize/test_normalize.py +++ b/tests/normalize/test_normalize.py @@ -226,7 +226,7 @@ def test_normalize_many_schemas(caps: DestinationCapabilitiesContext, rasa_norma with ProcessPoolExecutor(max_workers=4) as p: rasa_normalize.run(p) # must have two loading groups with model and event schemas - loads = rasa_normalize.load_storage.list_packages() + loads = rasa_normalize.load_storage.list_normalized_packages() assert len(loads) == 2 schemas = [] # load all schemas @@ -247,7 +247,7 @@ def test_normalize_typed_json(caps: DestinationCapabilitiesContext, raw_normaliz extract_items(raw_normalize.normalize_storage, [JSON_TYPED_DICT], "special", "special") with ThreadPoolExecutor(max_workers=1) as pool: raw_normalize.run(pool) - loads = raw_normalize.load_storage.list_packages() + loads = raw_normalize.load_storage.list_normalized_packages() assert len(loads) == 1 # load all schemas schema = raw_normalize.load_storage.load_package_schema(loads[0]) @@ -438,7 +438,7 @@ def get_line_from_file(load_storage: LoadStorage, loaded_files: List[str], retur def assert_timestamp_data_type(load_storage: LoadStorage, data_type: TDataType) -> None: # load generated schema - loads = load_storage.list_packages() + loads = load_storage.list_normalized_packages() event_schema = load_storage.load_package_schema(loads[0]) # in raw normalize timestamp column must not be coerced to timestamp assert event_schema.get_table_columns("event")["timestamp"]["data_type"] == data_type diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 560a683709..c778e47cd6 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1219,3 +1219,29 @@ def test_empty_rows_are_included() -> None: values = [r[0] for r in rows] assert values == [1, None, None, None, None, None, None, None] + + +def test_remove_pending_packages() -> None: + pipeline = dlt.pipeline(pipeline_name="emojis", destination="dummy") + pipeline.extract(airtable_emojis()) + assert pipeline.has_pending_data + pipeline.drop_pending_packages() + assert pipeline.has_pending_data is False + pipeline.extract(airtable_emojis()) + pipeline.normalize() + pipeline.extract(airtable_emojis()) + assert pipeline.has_pending_data + pipeline.drop_pending_packages() + assert pipeline.has_pending_data is False + # partial load + os.environ["EXCEPTION_PROB"] = "1.0" + os.environ["FAIL_IN_INIT"] = "False" + os.environ["TIMEOUT"] = "1.0" + # should produce partial loads + with pytest.raises(PipelineStepFailed): + pipeline.run(airtable_emojis()) + assert pipeline.has_pending_data + pipeline.drop_pending_packages(with_partial_loads=False) + assert pipeline.has_pending_data + pipeline.drop_pending_packages() + assert pipeline.has_pending_data is False From 582448c013b33b9f590e4e06f4424b4ff741834b Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Sat, 18 Nov 2023 18:44:07 +0100 Subject: [PATCH 5/6] bumps devel to pre-release 0.4.1a0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7307634cd6..627ec8344f 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "0.3.24" +version = "0.4.1a0" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. "] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Ty Dunn "] From 105795c75847f0dbdbe3335642fb4fc8413e7183 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Sat, 18 Nov 2023 12:54:03 -0500 Subject: [PATCH 6/6] Parametrized destinations (#746) * Move destination modules to subfolder * Mockup destination factory * Destination factory replacing reference and dest __init__ * Update factories * Defer duckdb credentials resolving in pipeline context * Simplify destination config resolution * capabilities are callable * bigquery, athena factories * Add rest of factories * Cleanup * Destination type vars * Cleanup * Fix test * Create initial config from non-defaults only * Update naming convention path * Fix config in bigquery location test * Only keep non-default config args in factory * Resolve duckdb credentials in pipeline context * Cleanup * Union credentials arguments * Common tests without dest dependencies * Forward all athena arguments * Delete commented code * Reference docstrings * Add deprecation warning for credentials argument * Init docstrings for destination factories * Fix tests * Destination name in output * Correct exception in unknown destination test --------- Co-authored-by: Marcin Rudolf --- dlt/__init__.py | 2 + dlt/cli/deploy_command.py | 4 +- dlt/cli/init_command.py | 6 +- dlt/cli/pipeline_command.py | 2 +- dlt/common/configuration/inject.py | 16 ++- dlt/common/destination/__init__.py | 5 +- dlt/common/destination/reference.py | 135 ++++++++++++------ dlt/common/pipeline.py | 4 +- dlt/destinations/__init__.py | 28 ++++ dlt/destinations/filesystem/__init__.py | 29 ---- dlt/destinations/impl/__init__.py | 0 .../{ => impl}/athena/__init__.py | 23 --- dlt/destinations/{ => impl}/athena/athena.py | 4 +- .../{ => impl}/athena/configuration.py | 0 dlt/destinations/impl/athena/factory.py | 53 +++++++ .../{ => impl}/bigquery/README.md | 0 .../{ => impl}/bigquery/__init__.py | 24 ---- .../{ => impl}/bigquery/bigquery.py | 6 +- .../{ => impl}/bigquery/configuration.py | 0 dlt/destinations/impl/bigquery/factory.py | 35 +++++ .../{ => impl}/bigquery/sql_client.py | 2 +- .../{ => impl}/duckdb/__init__.py | 24 ---- .../{ => impl}/duckdb/configuration.py | 14 +- dlt/destinations/{ => impl}/duckdb/duck.py | 6 +- dlt/destinations/impl/duckdb/factory.py | 41 ++++++ .../{ => impl}/duckdb/sql_client.py | 4 +- dlt/destinations/{ => impl}/dummy/__init__.py | 17 +-- .../{ => impl}/dummy/configuration.py | 0 dlt/destinations/{ => impl}/dummy/dummy.py | 4 +- dlt/destinations/impl/dummy/factory.py | 30 ++++ dlt/destinations/impl/filesystem/__init__.py | 5 + .../{ => impl}/filesystem/configuration.py | 0 dlt/destinations/impl/filesystem/factory.py | 50 +++++++ .../{ => impl}/filesystem/filesystem.py | 4 +- .../{ => impl}/motherduck/__init__.py | 24 ---- .../{ => impl}/motherduck/configuration.py | 2 +- dlt/destinations/impl/motherduck/factory.py | 41 ++++++ .../{ => impl}/motherduck/motherduck.py | 8 +- .../{ => impl}/motherduck/sql_client.py | 6 +- dlt/destinations/{ => impl}/mssql/README.md | 0 dlt/destinations/{ => impl}/mssql/__init__.py | 24 ---- .../{ => impl}/mssql/configuration.py | 0 dlt/destinations/impl/mssql/factory.py | 41 ++++++ dlt/destinations/{ => impl}/mssql/mssql.py | 6 +- .../{ => 
impl}/mssql/sql_client.py | 4 +- .../{ => impl}/postgres/README.md | 0 .../{ => impl}/postgres/__init__.py | 20 --- .../{ => impl}/postgres/configuration.py | 0 dlt/destinations/impl/postgres/factory.py | 41 ++++++ .../{ => impl}/postgres/postgres.py | 6 +- .../{ => impl}/postgres/sql_client.py | 4 +- dlt/destinations/impl/qdrant/__init__.py | 18 +++ .../{ => impl}/qdrant/configuration.py | 0 dlt/destinations/impl/qdrant/factory.py | 30 ++++ .../{ => impl}/qdrant/qdrant_adapter.py | 0 .../{ => impl}/qdrant/qdrant_client.py | 8 +- .../{ => impl}/redshift/README.md | 0 .../{ => impl}/redshift/__init__.py | 24 ---- .../{ => impl}/redshift/configuration.py | 2 +- dlt/destinations/impl/redshift/factory.py | 45 ++++++ .../{ => impl}/redshift/redshift.py | 6 +- .../{ => impl}/snowflake/__init__.py | 23 --- .../{ => impl}/snowflake/configuration.py | 0 dlt/destinations/impl/snowflake/factory.py | 41 ++++++ .../{ => impl}/snowflake/snowflake.py | 8 +- .../{ => impl}/snowflake/sql_client.py | 4 +- .../{ => impl}/weaviate/README.md | 0 dlt/destinations/impl/weaviate/__init__.py | 19 +++ .../{ => impl}/weaviate/ci_naming.py | 0 .../{ => impl}/weaviate/configuration.py | 0 .../{ => impl}/weaviate/exceptions.py | 0 dlt/destinations/impl/weaviate/factory.py | 47 ++++++ .../{ => impl}/weaviate/naming.py | 0 .../{ => impl}/weaviate/weaviate_adapter.py | 0 .../{ => impl}/weaviate/weaviate_client.py | 8 +- dlt/destinations/qdrant/__init__.py | 53 ------- dlt/destinations/weaviate/__init__.py | 55 ------- dlt/helpers/streamlit_helper.py | 2 +- dlt/load/load.py | 7 +- dlt/pipeline/__init__.py | 11 +- dlt/pipeline/deprecations.py | 20 +++ dlt/pipeline/pipeline.py | 64 +++++---- dlt/pipeline/track.py | 8 +- .../dlt-ecosystem/destinations/weaviate.md | 2 +- docs/website/docs/getting-started-snippets.py | 2 +- tests/cli/test_pipeline_command.py | 2 +- .../common/data_writers/test_data_writers.py | 2 +- tests/common/test_destination.py | 17 +-- .../helpers/dbt_tests/local/test_dbt_utils.py | 2 +- .../dbt_tests/test_runner_dbt_versions.py | 4 +- tests/load/bigquery/test_bigquery_client.py | 4 +- .../bigquery/test_bigquery_table_builder.py | 4 +- tests/load/cases/fake_destination.py | 7 +- tests/load/duckdb/test_duckdb_client.py | 25 ++-- .../load/duckdb/test_duckdb_table_builder.py | 4 +- tests/load/duckdb/test_motherduck_client.py | 2 +- .../load/filesystem/test_filesystem_client.py | 2 +- tests/load/filesystem/utils.py | 8 +- tests/load/mssql/test_mssql_credentials.py | 2 +- tests/load/mssql/test_mssql_table_builder.py | 4 +- .../load/pipeline/test_filesystem_pipeline.py | 2 +- tests/load/pipeline/test_pipelines.py | 6 +- tests/load/pipeline/utils.py | 4 +- tests/load/postgres/test_postgres_client.py | 6 +- .../postgres/test_postgres_table_builder.py | 4 +- tests/load/qdrant/test_pipeline.py | 4 +- tests/load/qdrant/utils.py | 2 +- tests/load/redshift/test_redshift_client.py | 4 +- .../redshift/test_redshift_table_builder.py | 4 +- .../snowflake/test_snowflake_configuration.py | 2 +- .../snowflake/test_snowflake_table_builder.py | 4 +- tests/load/test_dummy_client.py | 8 +- tests/load/test_insert_job_client.py | 2 +- tests/load/utils.py | 12 +- tests/load/weaviate/test_naming.py | 4 +- tests/load/weaviate/test_pipeline.py | 10 +- tests/load/weaviate/test_weaviate_client.py | 13 +- tests/load/weaviate/utils.py | 4 +- tests/normalize/utils.py | 10 +- tests/pipeline/test_dlt_versions.py | 4 +- tests/pipeline/test_pipeline.py | 54 ++++++- tests/pipeline/test_pipeline_state.py | 4 +- tests/tools/clean_redshift.py | 4 
+- 123 files changed, 978 insertions(+), 593 deletions(-) delete mode 100644 dlt/destinations/filesystem/__init__.py create mode 100644 dlt/destinations/impl/__init__.py rename dlt/destinations/{ => impl}/athena/__init__.py (55%) rename dlt/destinations/{ => impl}/athena/athena.py (99%) rename dlt/destinations/{ => impl}/athena/configuration.py (100%) create mode 100644 dlt/destinations/impl/athena/factory.py rename dlt/destinations/{ => impl}/bigquery/README.md (100%) rename dlt/destinations/{ => impl}/bigquery/__init__.py (50%) rename dlt/destinations/{ => impl}/bigquery/bigquery.py (98%) rename dlt/destinations/{ => impl}/bigquery/configuration.py (100%) create mode 100644 dlt/destinations/impl/bigquery/factory.py rename dlt/destinations/{ => impl}/bigquery/sql_client.py (99%) rename dlt/destinations/{ => impl}/duckdb/__init__.py (54%) rename dlt/destinations/{ => impl}/duckdb/configuration.py (94%) rename dlt/destinations/{ => impl}/duckdb/duck.py (96%) create mode 100644 dlt/destinations/impl/duckdb/factory.py rename dlt/destinations/{ => impl}/duckdb/sql_client.py (98%) rename dlt/destinations/{ => impl}/dummy/__init__.py (60%) rename dlt/destinations/{ => impl}/dummy/configuration.py (100%) rename dlt/destinations/{ => impl}/dummy/dummy.py (97%) create mode 100644 dlt/destinations/impl/dummy/factory.py create mode 100644 dlt/destinations/impl/filesystem/__init__.py rename dlt/destinations/{ => impl}/filesystem/configuration.py (100%) create mode 100644 dlt/destinations/impl/filesystem/factory.py rename dlt/destinations/{ => impl}/filesystem/filesystem.py (98%) rename dlt/destinations/{ => impl}/motherduck/__init__.py (51%) rename dlt/destinations/{ => impl}/motherduck/configuration.py (97%) create mode 100644 dlt/destinations/impl/motherduck/factory.py rename dlt/destinations/{ => impl}/motherduck/motherduck.py (70%) rename dlt/destinations/{ => impl}/motherduck/sql_client.py (83%) rename dlt/destinations/{ => impl}/mssql/README.md (100%) rename dlt/destinations/{ => impl}/mssql/__init__.py (57%) rename dlt/destinations/{ => impl}/mssql/configuration.py (100%) create mode 100644 dlt/destinations/impl/mssql/factory.py rename dlt/destinations/{ => impl}/mssql/mssql.py (97%) rename dlt/destinations/{ => impl}/mssql/sql_client.py (97%) rename dlt/destinations/{ => impl}/postgres/README.md (100%) rename dlt/destinations/{ => impl}/postgres/__init__.py (58%) rename dlt/destinations/{ => impl}/postgres/configuration.py (100%) create mode 100644 dlt/destinations/impl/postgres/factory.py rename dlt/destinations/{ => impl}/postgres/postgres.py (95%) rename dlt/destinations/{ => impl}/postgres/sql_client.py (97%) create mode 100644 dlt/destinations/impl/qdrant/__init__.py rename dlt/destinations/{ => impl}/qdrant/configuration.py (100%) create mode 100644 dlt/destinations/impl/qdrant/factory.py rename dlt/destinations/{ => impl}/qdrant/qdrant_adapter.py (100%) rename dlt/destinations/{ => impl}/qdrant/qdrant_client.py (98%) rename dlt/destinations/{ => impl}/redshift/README.md (100%) rename dlt/destinations/{ => impl}/redshift/__init__.py (52%) rename dlt/destinations/{ => impl}/redshift/configuration.py (88%) create mode 100644 dlt/destinations/impl/redshift/factory.py rename dlt/destinations/{ => impl}/redshift/redshift.py (97%) rename dlt/destinations/{ => impl}/snowflake/__init__.py (52%) rename dlt/destinations/{ => impl}/snowflake/configuration.py (100%) create mode 100644 dlt/destinations/impl/snowflake/factory.py rename dlt/destinations/{ => impl}/snowflake/snowflake.py (97%) rename 
dlt/destinations/{ => impl}/snowflake/sql_client.py (98%) rename dlt/destinations/{ => impl}/weaviate/README.md (100%) create mode 100644 dlt/destinations/impl/weaviate/__init__.py rename dlt/destinations/{ => impl}/weaviate/ci_naming.py (100%) rename dlt/destinations/{ => impl}/weaviate/configuration.py (100%) rename dlt/destinations/{ => impl}/weaviate/exceptions.py (100%) create mode 100644 dlt/destinations/impl/weaviate/factory.py rename dlt/destinations/{ => impl}/weaviate/naming.py (100%) rename dlt/destinations/{ => impl}/weaviate/weaviate_adapter.py (100%) rename dlt/destinations/{ => impl}/weaviate/weaviate_client.py (98%) delete mode 100644 dlt/destinations/qdrant/__init__.py delete mode 100644 dlt/destinations/weaviate/__init__.py create mode 100644 dlt/pipeline/deprecations.py diff --git a/dlt/__init__.py b/dlt/__init__.py index f5dde3f204..728343bdd6 100644 --- a/dlt/__init__.py +++ b/dlt/__init__.py @@ -31,6 +31,7 @@ from dlt.extract.decorators import source, resource, transformer, defer from dlt.pipeline import pipeline as _pipeline, run, attach, Pipeline, dbt, current as _current, mark as _mark from dlt.pipeline import progress +from dlt import destinations pipeline = _pipeline current = _current @@ -64,4 +65,5 @@ "TSecretValue", "TCredentials", "sources", + "destinations", ] diff --git a/dlt/cli/deploy_command.py b/dlt/cli/deploy_command.py index 7634f173b3..a7bdf2e0e7 100644 --- a/dlt/cli/deploy_command.py +++ b/dlt/cli/deploy_command.py @@ -16,7 +16,7 @@ from dlt.version import DLT_PKG_NAME -from dlt.common.destination.reference import DestinationReference +from dlt.common.destination.reference import Destination REQUIREMENTS_GITHUB_ACTION = "requirements_github_action.txt" DLT_DEPLOY_DOCS_URL = "https://dlthub.com/docs/walkthroughs/deploy-a-pipeline" @@ -198,7 +198,7 @@ def __init__( def _generate_workflow(self, *args: Optional[Any]) -> None: self.deployment_method = DeploymentMethods.airflow_composer.value - req_dep = f"{DLT_PKG_NAME}[{DestinationReference.to_name(self.state['destination'])}]" + req_dep = f"{DLT_PKG_NAME}[{Destination.to_name(self.state['destination'])}]" req_dep_line = f"{req_dep}>={pkg_version(DLT_PKG_NAME)}" self.artifacts["requirements_txt"] = req_dep_line diff --git a/dlt/cli/init_command.py b/dlt/cli/init_command.py index c246ac87de..4cec1706b9 100644 --- a/dlt/cli/init_command.py +++ b/dlt/cli/init_command.py @@ -12,7 +12,7 @@ from dlt.common.pipeline import get_dlt_repos_dir from dlt.common.source import _SOURCES from dlt.version import DLT_PKG_NAME, __version__ -from dlt.common.destination import DestinationReference +from dlt.common.destination import Destination from dlt.common.reflection.utils import rewrite_python_script from dlt.common.schema.utils import is_valid_schema_name from dlt.common.schema.exceptions import InvalidSchemaName @@ -160,8 +160,8 @@ def list_verified_sources_command(repo_location: str, branch: str = None) -> Non def init_command(source_name: str, destination_name: str, use_generic_template: bool, repo_location: str, branch: str = None) -> None: # try to import the destination and get config spec - destination_reference = DestinationReference.from_name(destination_name) - destination_spec = destination_reference.spec() + destination_reference = Destination.from_reference(destination_name) + destination_spec = destination_reference.spec fmt.echo("Looking up the init scripts in %s..." 
% fmt.bold(repo_location)) clone_storage = git.get_fresh_repo_files(repo_location, get_dlt_repos_dir(), branch=branch) diff --git a/dlt/cli/pipeline_command.py b/dlt/cli/pipeline_command.py index b17981c1b1..2d705dc1a3 100644 --- a/dlt/cli/pipeline_command.py +++ b/dlt/cli/pipeline_command.py @@ -212,7 +212,7 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: fmt.warning(warning) return - fmt.echo("About to drop the following data in dataset %s in destination %s:" % (fmt.bold(drop.info["dataset_name"]), fmt.bold(p.destination.__name__))) + fmt.echo("About to drop the following data in dataset %s in destination %s:" % (fmt.bold(drop.info["dataset_name"]), fmt.bold(p.destination.name))) fmt.echo("%s: %s" % (fmt.style("Selected schema", fg="green"), drop.info["schema_name"])) fmt.echo("%s: %s" % (fmt.style("Selected resource(s)", fg="green"), drop.info["resource_names"])) fmt.echo("%s: %s" % (fmt.style("Table(s) to drop", fg="green"), drop.info["tables"])) diff --git a/dlt/common/configuration/inject.py b/dlt/common/configuration/inject.py index 1880727a0f..f50e947011 100644 --- a/dlt/common/configuration/inject.py +++ b/dlt/common/configuration/inject.py @@ -32,7 +32,8 @@ def with_config( sections: Tuple[str, ...] = (), sections_merge_style: ConfigSectionContext.TMergeFunc = ConfigSectionContext.prefer_incoming, auto_pipeline_section: bool = False, - include_defaults: bool = True + include_defaults: bool = True, + accept_partial: bool = False, ) -> TFun: ... @@ -45,7 +46,8 @@ def with_config( sections: Tuple[str, ...] = (), sections_merge_style: ConfigSectionContext.TMergeFunc = ConfigSectionContext.prefer_incoming, auto_pipeline_section: bool = False, - include_defaults: bool = True + include_defaults: bool = True, + accept_partial: bool = False, ) -> Callable[[TFun], TFun]: ... @@ -57,7 +59,9 @@ def with_config( sections: Tuple[str, ...] = (), sections_merge_style: ConfigSectionContext.TMergeFunc = ConfigSectionContext.prefer_incoming, auto_pipeline_section: bool = False, - include_defaults: bool = True + include_defaults: bool = True, + accept_partial: bool = False, + initial_config: Optional[BaseConfiguration] = None, ) -> Callable[[TFun], TFun]: """Injects values into decorated function arguments following the specification in `spec` or by deriving one from function's signature. 
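A minimal sketch of what the new `accept_partial` flag could enable for a decorated function (the `connect` function and its keys below are hypothetical, and the exact behavior for unresolved fields is an assumption, not taken from this patch):

```python
import dlt
from dlt.common.configuration.inject import with_config


@with_config(accept_partial=True)  # hypothetical usage of the flag added above
def connect(host: str = dlt.config.value, api_key: str = dlt.secrets.value) -> None:
    # assumption: with accept_partial=True a missing `api_key` does not raise
    # ConfigFieldMissingException; the argument is simply left unresolved
    print(host, api_key)
```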
@@ -127,7 +131,9 @@ def _wrap(*args: Any, **kwargs: Any) -> Any: curr_sections = sections # if one of arguments is spec the use it as initial value - if spec_arg: + if initial_config: + config = initial_config + elif spec_arg: config = bound_args.arguments.get(spec_arg.name, None) # resolve SPEC, also provide section_context with pipeline_name if pipeline_name_arg: @@ -139,7 +145,7 @@ def _wrap(*args: Any, **kwargs: Any) -> Any: with _RESOLVE_LOCK: with inject_section(section_context): # print(f"RESOLVE CONF in inject: {f.__name__}: {section_context.sections} vs {sections}") - config = resolve_configuration(config or SPEC(), explicit_value=bound_args.arguments) + config = resolve_configuration(config or SPEC(), explicit_value=bound_args.arguments, accept_partial=accept_partial) resolved_params = dict(config) # overwrite or add resolved params for p in sig.parameters.values(): diff --git a/dlt/common/destination/__init__.py b/dlt/common/destination/__init__.py index 88b5d5ef06..4857851fa9 100644 --- a/dlt/common/destination/__init__.py +++ b/dlt/common/destination/__init__.py @@ -1,10 +1,11 @@ from dlt.common.destination.capabilities import DestinationCapabilitiesContext, TLoaderFileFormat, ALL_SUPPORTED_FILE_FORMATS -from dlt.common.destination.reference import DestinationReference, TDestinationReferenceArg +from dlt.common.destination.reference import TDestinationReferenceArg, Destination, TDestination __all__ = [ "DestinationCapabilitiesContext", "TLoaderFileFormat", "ALL_SUPPORTED_FILE_FORMATS", - "DestinationReference", "TDestinationReferenceArg", + "Destination", + "TDestination", ] diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 13172b41e9..1c3560cbbd 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -1,10 +1,11 @@ from abc import ABC, abstractmethod, abstractproperty from importlib import import_module from types import TracebackType, ModuleType -from typing import ClassVar, Final, Optional, NamedTuple, Literal, Sequence, Iterable, Type, Protocol, Union, TYPE_CHECKING, cast, List, ContextManager, Dict, Any +from typing import ClassVar, Final, Optional, NamedTuple, Literal, Sequence, Iterable, Type, Protocol, Union, TYPE_CHECKING, cast, List, ContextManager, Dict, Any, Callable, TypeVar, Generic from contextlib import contextmanager import datetime # noqa: 251 from copy import deepcopy +import inspect from dlt.common import logger from dlt.common.exceptions import IdentifierTooLongException, InvalidDestinationReference, UnknownDestinationModule @@ -12,7 +13,7 @@ from dlt.common.schema.typing import TWriteDisposition from dlt.common.schema.exceptions import InvalidDatasetName from dlt.common.schema.utils import get_write_disposition, get_table_format -from dlt.common.configuration import configspec +from dlt.common.configuration import configspec, with_config, resolve_configuration, known_sections from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration from dlt.common.configuration.accessors import config from dlt.common.destination.capabilities import DestinationCapabilitiesContext @@ -23,7 +24,10 @@ from dlt.common.utils import get_module_name from dlt.common.configuration.specs import GcpCredentials, AwsCredentialsWithoutDefaults + TLoaderReplaceStrategy = Literal["truncate-and-insert", "insert-from-staging", "staging-optimized"] +TDestinationConfig = TypeVar("TDestinationConfig", bound="DestinationClientConfiguration") +TDestinationClient = 
TypeVar("TDestinationClient", bound="JobClientBase") class StorageSchemaInfo(NamedTuple): @@ -344,59 +348,102 @@ def should_truncate_table_before_load_on_staging_destination(self, table: TTable # the default is to truncate the tables on the staging destination... return True -TDestinationReferenceArg = Union["DestinationReference", ModuleType, None, str] +TDestinationReferenceArg = Union[str, "Destination", None] -class DestinationReference(Protocol): - __name__: str - """Name of the destination""" +class Destination(ABC, Generic[TDestinationConfig, TDestinationClient]): + """A destination factory that can be partially pre-configured + with credentials and other config params. + """ + config_params: Optional[Dict[str, Any]] = None + + def __init__(self, **kwargs: Any) -> None: + # Create initial unresolved destination config + # Argument defaults are filtered out here because we only want arguments passed explicitly + # to supersede config from the environment or pipeline args + sig = inspect.signature(self.__class__) + params = sig.parameters + self.config_params = { + k: v for k, v in kwargs.items() + if k not in params or v != params[k].default + } + + @property + @abstractmethod + def spec(self) -> Type[TDestinationConfig]: + """A spec of destination configuration that also contains destination credentials""" + ... + @abstractmethod def capabilities(self) -> DestinationCapabilitiesContext: """Destination capabilities ie. supported loader file formats, identifier name lengths, naming conventions, escape function etc.""" + ... - def client(self, schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> "JobClientBase": - """A job client responsible for starting and resuming load jobs""" + @property + def name(self) -> str: + return self.__class__.__name__ - def spec(self) -> Type[DestinationClientConfiguration]: - """A spec of destination configuration that also contains destination credentials""" + @property + @abstractmethod + def client_class(self) -> Type[TDestinationClient]: + """A job client class responsible for starting and resuming load jobs""" + ... + + def configuration(self, initial_config: TDestinationConfig) -> TDestinationConfig: + """Get a fully resolved destination config from the initial config + """ + return resolve_configuration( + initial_config, + sections=(known_sections.DESTINATION, self.name), + # Already populated values will supersede resolved env config + explicit_value=self.config_params + ) + + @staticmethod + def to_name(ref: TDestinationReferenceArg) -> str: + if ref is None: + raise InvalidDestinationReference(ref) + if isinstance(ref, str): + return ref.rsplit(".", 1)[-1] + return ref.name @staticmethod - def from_name(destination: TDestinationReferenceArg) -> "DestinationReference": - if destination is None: + def from_reference(ref: TDestinationReferenceArg, credentials: Optional[CredentialsConfiguration] = None, **kwargs: Any) -> Optional["Destination[DestinationClientConfiguration, JobClientBase]"]: + """Instantiate destination from str reference. + The ref can be a destination name or import path pointing to a destination class (e.g. `dlt.destinations.postgres`) + """ + if ref is None: return None + if isinstance(ref, Destination): + return ref + if not isinstance(ref, str): + raise InvalidDestinationReference(ref) + try: + if "." 
in ref: + module_path, attr_name = ref.rsplit(".", 1) + dest_module = import_module(module_path) + else: + from dlt import destinations as dest_module + attr_name = ref + except ModuleNotFoundError as e: + raise UnknownDestinationModule(ref) from e - # if destination is a str, get destination reference by dynamically importing module - if isinstance(destination, str): - try: - if "." in destination: - # this is full module name - destination_ref = cast(DestinationReference, import_module(destination)) - else: - # from known location - destination_ref = cast(DestinationReference, import_module(f"dlt.destinations.{destination}")) - except ImportError: - if "." in destination: - raise UnknownDestinationModule(destination) - else: - # allow local external module imported without dot - try: - destination_ref = cast(DestinationReference, import_module(destination)) - except ImportError: - raise UnknownDestinationModule(destination) - else: - destination_ref = cast(DestinationReference, destination) - - # make sure the reference is correct try: - c = destination_ref.spec() - c.credentials - except Exception: - raise InvalidDestinationReference(destination) + factory: Type[Destination[DestinationClientConfiguration, JobClientBase]] = getattr(dest_module, attr_name) + except AttributeError as e: + raise UnknownDestinationModule(ref) from e + if credentials: + kwargs["credentials"] = credentials + try: + dest = factory(**kwargs) + dest.spec + except Exception as e: + raise InvalidDestinationReference(ref) from e + return dest - return destination_ref + def client(self, schema: Schema, initial_config: TDestinationConfig = config.value) -> TDestinationClient: + """Returns a configured instance of the destination's job client""" + return self.client_class(schema, self.configuration(initial_config)) - @staticmethod - def to_name(destination: TDestinationReferenceArg) -> str: - if isinstance(destination, ModuleType): - return get_module_name(destination) - return destination.split(".")[-1] # type: ignore + +TDestination = Destination[DestinationClientConfiguration, JobClientBase] diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index aeb0bdc68a..ddd9003799 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -14,7 +14,7 @@ from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.configuration.paths import get_dlt_data_dir from dlt.common.configuration.specs import RunConfiguration -from dlt.common.destination import DestinationReference, TDestinationReferenceArg +from dlt.common.destination import Destination, TDestinationReferenceArg, TDestination from dlt.common.exceptions import DestinationHasFailedJobs, PipelineStateNotAvailable, ResourceNameNotAvailable, SourceSectionNotAvailable from dlt.common.schema import Schema from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition @@ -177,7 +177,7 @@ class SupportsPipeline(Protocol): """Name of the pipeline""" default_schema_name: str """Name of the default schema""" - destination: DestinationReference + destination: TDestination """The destination reference which is ModuleType. 
`destination.__name__` returns the name string""" dataset_name: str """Name of the dataset to which pipeline will be loaded to""" diff --git a/dlt/destinations/__init__.py b/dlt/destinations/__init__.py index e69de29bb2..980c4ce7f2 100644 --- a/dlt/destinations/__init__.py +++ b/dlt/destinations/__init__.py @@ -0,0 +1,28 @@ +from dlt.destinations.impl.postgres.factory import postgres +from dlt.destinations.impl.snowflake.factory import snowflake +from dlt.destinations.impl.filesystem.factory import filesystem +from dlt.destinations.impl.duckdb.factory import duckdb +from dlt.destinations.impl.dummy.factory import dummy +from dlt.destinations.impl.mssql.factory import mssql +from dlt.destinations.impl.bigquery.factory import bigquery +from dlt.destinations.impl.athena.factory import athena +from dlt.destinations.impl.redshift.factory import redshift +from dlt.destinations.impl.qdrant.factory import qdrant +from dlt.destinations.impl.motherduck.factory import motherduck +from dlt.destinations.impl.weaviate.factory import weaviate + + +__all__ = [ + "postgres", + "snowflake", + "filesystem", + "duckdb", + "dummy", + "mssql", + "bigquery", + "athena", + "redshift", + "qdrant", + "motherduck", + "weaviate", +] diff --git a/dlt/destinations/filesystem/__init__.py b/dlt/destinations/filesystem/__init__.py deleted file mode 100644 index 3dc6c62480..0000000000 --- a/dlt/destinations/filesystem/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -from typing import Type - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientDwhWithStagingConfiguration - -from dlt.destinations.filesystem.configuration import FilesystemDestinationClientConfiguration - - -@with_config(spec=FilesystemDestinationClientConfiguration, sections=(known_sections.DESTINATION, "filesystem",)) -def _configure(config: FilesystemDestinationClientConfiguration = config.value) -> FilesystemDestinationClientConfiguration: - return config - - -def capabilities() -> DestinationCapabilitiesContext: - return DestinationCapabilitiesContext.generic_capabilities("jsonl") - - -def client(schema: Schema, initial_config: DestinationClientDwhWithStagingConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.filesystem.filesystem import FilesystemClient - - return FilesystemClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[FilesystemDestinationClientConfiguration]: - return FilesystemDestinationClientConfiguration diff --git a/dlt/destinations/impl/__init__.py b/dlt/destinations/impl/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dlt/destinations/athena/__init__.py b/dlt/destinations/impl/athena/__init__.py similarity index 55% rename from dlt/destinations/athena/__init__.py rename to dlt/destinations/impl/athena/__init__.py index 1fd7f14d57..9f0b829819 100644 --- a/dlt/destinations/athena/__init__.py +++ b/dlt/destinations/impl/athena/__init__.py @@ -1,18 +1,7 @@ -from typing import Type - from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config -from dlt.common.schema.schema 
import Schema from dlt.common.data_writers.escape import escape_athena_identifier from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.destinations.athena.configuration import AthenaClientConfiguration -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration - -@with_config(spec=AthenaClientConfiguration, sections=(known_sections.DESTINATION, "athena",)) -def _configure(config: AthenaClientConfiguration = config.value) -> AthenaClientConfiguration: - return config def capabilities() -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext() @@ -37,15 +26,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.timestamp_precision = 3 caps.supports_truncate_command = False return caps - - -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.athena.athena import AthenaClient - return AthenaClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[DestinationClientConfiguration]: - return AthenaClientConfiguration - - diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/impl/athena/athena.py similarity index 99% rename from dlt/destinations/athena/athena.py rename to dlt/destinations/impl/athena/athena.py index 44d020c127..f675e7a496 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -27,11 +27,11 @@ from dlt.destinations.typing import DBApi, DBTransaction from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation, LoadJobTerminalException -from dlt.destinations.athena import capabilities +from dlt.destinations.impl.athena import capabilities from dlt.destinations.sql_client import SqlClientBase, DBApiCursorImpl, raise_database_error, raise_open_connection_error from dlt.destinations.typing import DBApiCursor from dlt.destinations.job_client_impl import SqlJobClientWithStaging -from dlt.destinations.athena.configuration import AthenaClientConfiguration +from dlt.destinations.impl.athena.configuration import AthenaClientConfiguration from dlt.destinations.type_mapping import TypeMapper from dlt.destinations import path_utils diff --git a/dlt/destinations/athena/configuration.py b/dlt/destinations/impl/athena/configuration.py similarity index 100% rename from dlt/destinations/athena/configuration.py rename to dlt/destinations/impl/athena/configuration.py diff --git a/dlt/destinations/impl/athena/factory.py b/dlt/destinations/impl/athena/factory.py new file mode 100644 index 0000000000..cc2b027695 --- /dev/null +++ b/dlt/destinations/impl/athena/factory.py @@ -0,0 +1,53 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.destinations.impl.athena.configuration import AthenaClientConfiguration +from dlt.common.configuration.specs import AwsCredentials +from dlt.destinations.impl.athena import capabilities + +if t.TYPE_CHECKING: + from dlt.destinations.impl.athena.athena import AthenaClient + + +class athena(Destination[AthenaClientConfiguration, "AthenaClient"]): + + spec = AthenaClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["AthenaClient"]: + from dlt.destinations.impl.athena.athena import 
AthenaClient + + return AthenaClient + + def __init__( + self, + query_result_bucket: t.Optional[str] = None, + credentials: t.Union[AwsCredentials, t.Dict[str, t.Any], t.Any] = None, + athena_work_group: t.Optional[str] = None, + aws_data_catalog: t.Optional[str] = "awsdatacatalog", + force_iceberg: bool = False, + **kwargs: t.Any, + ) -> None: + """Configure the Athena destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + query_result_bucket: S3 bucket to store query results in + credentials: AWS credentials to connect to the Athena database. + athena_work_group: Athena work group to use + aws_data_catalog: Athena data catalog to use + force_iceberg: Force iceberg tables + **kwargs: Additional arguments passed to the destination config + """ + super().__init__( + query_result_bucket=query_result_bucket, + credentials=credentials, + athena_work_group=athena_work_group, + aws_data_catalog=aws_data_catalog, + force_iceberg=force_iceberg, + **kwargs, + ) diff --git a/dlt/destinations/bigquery/README.md b/dlt/destinations/impl/bigquery/README.md similarity index 100% rename from dlt/destinations/bigquery/README.md rename to dlt/destinations/impl/bigquery/README.md diff --git a/dlt/destinations/bigquery/__init__.py b/dlt/destinations/impl/bigquery/__init__.py similarity index 50% rename from dlt/destinations/bigquery/__init__.py rename to dlt/destinations/impl/bigquery/__init__.py index 3d97e9a929..1304bd72bb 100644 --- a/dlt/destinations/bigquery/__init__.py +++ b/dlt/destinations/impl/bigquery/__init__.py @@ -1,20 +1,7 @@ -from typing import Type from dlt.common.data_writers.escape import escape_bigquery_identifier - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.destinations.bigquery.configuration import BigQueryClientConfiguration - - -@with_config(spec=BigQueryClientConfiguration, sections=(known_sections.DESTINATION, "bigquery",)) -def _configure(config: BigQueryClientConfiguration = config.value) -> BigQueryClientConfiguration: - return config - def capabilities() -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext() @@ -35,14 +22,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.supports_ddl_transactions = False return caps - - -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.bigquery.bigquery import BigQueryClient - - return BigQueryClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[DestinationClientConfiguration]: - return BigQueryClientConfiguration \ No newline at end of file diff --git a/dlt/destinations/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py similarity index 98% rename from dlt/destinations/bigquery/bigquery.py rename to dlt/destinations/impl/bigquery/bigquery.py index 9cc7591f57..440123e46d 100644 --- a/dlt/destinations/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -17,9 +17,9 @@ from 
dlt.destinations.job_client_impl import SqlJobClientWithStaging from dlt.destinations.exceptions import DestinationSchemaWillNotUpdate, DestinationTransientException, LoadJobNotExistsException, LoadJobTerminalException -from dlt.destinations.bigquery import capabilities -from dlt.destinations.bigquery.configuration import BigQueryClientConfiguration -from dlt.destinations.bigquery.sql_client import BigQuerySqlClient, BQ_TERMINAL_REASONS +from dlt.destinations.impl.bigquery import capabilities +from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration +from dlt.destinations.impl.bigquery.sql_client import BigQuerySqlClient, BQ_TERMINAL_REASONS from dlt.destinations.sql_jobs import SqlMergeJob, SqlStagingCopyJob, SqlJobParams from dlt.destinations.job_impl import NewReferenceJob from dlt.destinations.sql_client import SqlClientBase diff --git a/dlt/destinations/bigquery/configuration.py b/dlt/destinations/impl/bigquery/configuration.py similarity index 100% rename from dlt/destinations/bigquery/configuration.py rename to dlt/destinations/impl/bigquery/configuration.py diff --git a/dlt/destinations/impl/bigquery/factory.py b/dlt/destinations/impl/bigquery/factory.py new file mode 100644 index 0000000000..ce6ace3bf7 --- /dev/null +++ b/dlt/destinations/impl/bigquery/factory.py @@ -0,0 +1,35 @@ +import typing as t + +from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration +from dlt.common.configuration.specs import GcpServiceAccountCredentials +from dlt.destinations.impl.bigquery import capabilities +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +if t.TYPE_CHECKING: + from dlt.destinations.impl.bigquery.bigquery import BigQueryClient + + +class bigquery(Destination[BigQueryClientConfiguration, "BigQueryClient"]): + + spec = BigQueryClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["BigQueryClient"]: + from dlt.destinations.impl.bigquery.bigquery import BigQueryClient + + return BigQueryClient + + def __init__( + self, + credentials: t.Optional[GcpServiceAccountCredentials] = None, + location: t.Optional[str] = None, + **kwargs: t.Any, + ) -> None: + super().__init__( + credentials=credentials, + location=location, + **kwargs + ) diff --git a/dlt/destinations/bigquery/sql_client.py b/dlt/destinations/impl/bigquery/sql_client.py similarity index 99% rename from dlt/destinations/bigquery/sql_client.py rename to dlt/destinations/impl/bigquery/sql_client.py index 3d6eb19833..4939add0da 100644 --- a/dlt/destinations/bigquery/sql_client.py +++ b/dlt/destinations/impl/bigquery/sql_client.py @@ -17,7 +17,7 @@ from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation from dlt.destinations.sql_client import DBApiCursorImpl, SqlClientBase, raise_database_error, raise_open_connection_error -from dlt.destinations.bigquery import capabilities +from dlt.destinations.impl.bigquery import capabilities # terminal reasons as returned in BQ gRPC error response # https://cloud.google.com/bigquery/docs/error-messages diff --git a/dlt/destinations/duckdb/__init__.py b/dlt/destinations/impl/duckdb/__init__.py similarity index 54% rename from dlt/destinations/duckdb/__init__.py rename to dlt/destinations/impl/duckdb/__init__.py index d9882cc0eb..5cbc8dea53 100644 --- a/dlt/destinations/duckdb/__init__.py +++ b/dlt/destinations/impl/duckdb/__init__.py @@ -1,20 +1,7 
@@ -from typing import Type - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config from dlt.common.data_writers.escape import escape_postgres_identifier, escape_duckdb_literal from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.destinations.duckdb.configuration import DuckDbClientConfiguration - - -@with_config(spec=DuckDbClientConfiguration, sections=(known_sections.DESTINATION, "duckdb",)) -def _configure(config: DuckDbClientConfiguration = config.value) -> DuckDbClientConfiguration: - return config - def capabilities() -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext() @@ -37,14 +24,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.supports_truncate_command = False return caps - - -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.duckdb.duck import DuckDbClient - - return DuckDbClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[DestinationClientConfiguration]: - return DuckDbClientConfiguration diff --git a/dlt/destinations/duckdb/configuration.py b/dlt/destinations/impl/duckdb/configuration.py similarity index 94% rename from dlt/destinations/duckdb/configuration.py rename to dlt/destinations/impl/duckdb/configuration.py index 82ee325ed3..a5f77be8fd 100644 --- a/dlt/destinations/duckdb/configuration.py +++ b/dlt/destinations/impl/duckdb/configuration.py @@ -25,6 +25,7 @@ class DuckDbBaseCredentials(ConnectionStringCredentials): read_only: bool = False # open database read/write def borrow_conn(self, read_only: bool) -> Any: + # TODO: Can this be done in sql client instead? 
import duckdb if not hasattr(self, "_conn_lock"): @@ -95,6 +96,13 @@ class DuckDbCredentials(DuckDbBaseCredentials): __config_gen_annotations__: ClassVar[List[str]] = [] + def is_partial(self) -> bool: + partial = super().is_partial() + if partial: + return True + # Wait until pipeline context is set up before resolving + return self.database == ":pipeline:" + def on_resolved(self) -> None: # do not set any paths for external database if self.database == ":external:": @@ -126,8 +134,7 @@ def _path_in_pipeline(self, rel_path: str) -> str: if context.is_active(): # pipeline is active, get the working directory return os.path.join(context.pipeline().working_dir, rel_path) - return None - + raise RuntimeError("Attempting to use special duckdb database :pipeline: outside of pipeline context.") def _path_to_pipeline(self, abspath: str) -> None: from dlt.common.configuration.container import Container @@ -173,6 +180,9 @@ def _path_from_pipeline(self, default_path: str) -> Tuple[str, bool]: return default_path, True + def _conn_str(self) -> str: + return self.database + @configspec class DuckDbClientConfiguration(DestinationClientDwhWithStagingConfiguration): diff --git a/dlt/destinations/duckdb/duck.py b/dlt/destinations/impl/duckdb/duck.py similarity index 96% rename from dlt/destinations/duckdb/duck.py rename to dlt/destinations/impl/duckdb/duck.py index 4a2e54f2b6..6e6ec359fe 100644 --- a/dlt/destinations/duckdb/duck.py +++ b/dlt/destinations/impl/duckdb/duck.py @@ -12,9 +12,9 @@ from dlt.destinations.insert_job_client import InsertValuesJobClient -from dlt.destinations.duckdb import capabilities -from dlt.destinations.duckdb.sql_client import DuckDbSqlClient -from dlt.destinations.duckdb.configuration import DuckDbClientConfiguration +from dlt.destinations.impl.duckdb import capabilities +from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient +from dlt.destinations.impl.duckdb.configuration import DuckDbClientConfiguration from dlt.destinations.type_mapping import TypeMapper diff --git a/dlt/destinations/impl/duckdb/factory.py b/dlt/destinations/impl/duckdb/factory.py new file mode 100644 index 0000000000..1b882c52a1 --- /dev/null +++ b/dlt/destinations/impl/duckdb/factory.py @@ -0,0 +1,41 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.destinations.impl.duckdb.configuration import DuckDbCredentials, DuckDbClientConfiguration +from dlt.destinations.impl.duckdb import capabilities + +if t.TYPE_CHECKING: + from duckdb import DuckDBPyConnection + from dlt.destinations.impl.duckdb.duck import DuckDbClient + + +class duckdb(Destination[DuckDbClientConfiguration, "DuckDbClient"]): + + spec = DuckDbClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["DuckDbClient"]: + from dlt.destinations.impl.duckdb.duck import DuckDbClient + + return DuckDbClient + + def __init__( + self, + credentials: t.Union[DuckDbCredentials, t.Dict[str, t.Any], str, "DuckDBPyConnection"] = None, + create_indexes: bool = False, + **kwargs: t.Any, + ) -> None: + """Configure the DuckDB destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + credentials: Credentials to connect to the duckdb database. Can be an instance of `DuckDbCredentials` or + a path to a database file. Use `:memory:` to create an in-memory database. 
+ create_indexes: Should unique indexes be created + **kwargs: Additional arguments passed to the destination config + """ + super().__init__(credentials=credentials, create_indexes=create_indexes, **kwargs) diff --git a/dlt/destinations/duckdb/sql_client.py b/dlt/destinations/impl/duckdb/sql_client.py similarity index 98% rename from dlt/destinations/duckdb/sql_client.py rename to dlt/destinations/impl/duckdb/sql_client.py index cd2160f676..cb4e1678a2 100644 --- a/dlt/destinations/duckdb/sql_client.py +++ b/dlt/destinations/impl/duckdb/sql_client.py @@ -8,8 +8,8 @@ from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction, DataFrame from dlt.destinations.sql_client import SqlClientBase, DBApiCursorImpl, raise_database_error, raise_open_connection_error -from dlt.destinations.duckdb import capabilities -from dlt.destinations.duckdb.configuration import DuckDbBaseCredentials +from dlt.destinations.impl.duckdb import capabilities +from dlt.destinations.impl.duckdb.configuration import DuckDbBaseCredentials class DuckDBDBApiCursorImpl(DBApiCursorImpl): diff --git a/dlt/destinations/dummy/__init__.py b/dlt/destinations/impl/dummy/__init__.py similarity index 60% rename from dlt/destinations/dummy/__init__.py rename to dlt/destinations/impl/dummy/__init__.py index 7131f0109a..476523cb8f 100644 --- a/dlt/destinations/dummy/__init__.py +++ b/dlt/destinations/impl/dummy/__init__.py @@ -1,12 +1,8 @@ -from typing import Type - -from dlt.common.schema.schema import Schema from dlt.common.configuration import with_config, known_sections from dlt.common.configuration.accessors import config from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration -from dlt.destinations.dummy.configuration import DummyClientConfiguration +from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration @with_config(spec=DummyClientConfiguration, sections=(known_sections.DESTINATION, "dummy",)) @@ -30,14 +26,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.supports_ddl_transactions = False return caps - - -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.dummy.dummy import DummyClient - - return DummyClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[DestinationClientConfiguration]: - return DummyClientConfiguration diff --git a/dlt/destinations/dummy/configuration.py b/dlt/destinations/impl/dummy/configuration.py similarity index 100% rename from dlt/destinations/dummy/configuration.py rename to dlt/destinations/impl/dummy/configuration.py diff --git a/dlt/destinations/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py similarity index 97% rename from dlt/destinations/dummy/dummy.py rename to dlt/destinations/impl/dummy/dummy.py index 92827405ca..0bc061a7dd 100644 --- a/dlt/destinations/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -13,8 +13,8 @@ from dlt.destinations.exceptions import (LoadJobNotExistsException, LoadJobInvalidStateTransitionException, DestinationTerminalException, DestinationTransientException) -from dlt.destinations.dummy import capabilities -from dlt.destinations.dummy.configuration import DummyClientConfiguration +from dlt.destinations.impl.dummy import capabilities +from 
dlt.destinations.impl.dummy.configuration import DummyClientConfiguration class LoadDummyJob(LoadJob, FollowupJob): diff --git a/dlt/destinations/impl/dummy/factory.py b/dlt/destinations/impl/dummy/factory.py new file mode 100644 index 0000000000..265c77b0f4 --- /dev/null +++ b/dlt/destinations/impl/dummy/factory.py @@ -0,0 +1,30 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration, DummyClientCredentials +from dlt.destinations.impl.dummy import capabilities + +if t.TYPE_CHECKING: + from dlt.destinations.impl.dummy.dummy import DummyClient + + +class dummy(Destination[DummyClientConfiguration, "DummyClient"]): + + spec = DummyClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["DummyClient"]: + from dlt.destinations.impl.dummy.dummy import DummyClient + + return DummyClient + + def __init__( + self, + credentials: DummyClientCredentials = None, + **kwargs: t.Any, + ) -> None: + super().__init__(credentials=credentials, **kwargs) diff --git a/dlt/destinations/impl/filesystem/__init__.py b/dlt/destinations/impl/filesystem/__init__.py new file mode 100644 index 0000000000..12e83216cf --- /dev/null +++ b/dlt/destinations/impl/filesystem/__init__.py @@ -0,0 +1,5 @@ +from dlt.common.destination import DestinationCapabilitiesContext + + +def capabilities() -> DestinationCapabilitiesContext: + return DestinationCapabilitiesContext.generic_capabilities("jsonl") diff --git a/dlt/destinations/filesystem/configuration.py b/dlt/destinations/impl/filesystem/configuration.py similarity index 100% rename from dlt/destinations/filesystem/configuration.py rename to dlt/destinations/impl/filesystem/configuration.py diff --git a/dlt/destinations/impl/filesystem/factory.py b/dlt/destinations/impl/filesystem/factory.py new file mode 100644 index 0000000000..4e2a716d79 --- /dev/null +++ b/dlt/destinations/impl/filesystem/factory.py @@ -0,0 +1,50 @@ +import typing as t + +from dlt.destinations.impl.filesystem.configuration import FilesystemDestinationClientConfiguration +from dlt.destinations.impl.filesystem import capabilities +from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.common.storages.configuration import FileSystemCredentials + +if t.TYPE_CHECKING: + from dlt.destinations.impl.filesystem.filesystem import FilesystemClient + + +class filesystem(Destination[FilesystemDestinationClientConfiguration, "FilesystemClient"]): + + spec = FilesystemDestinationClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["FilesystemClient"]: + from dlt.destinations.impl.filesystem.filesystem import FilesystemClient + + return FilesystemClient + + def __init__( + self, + bucket_url: str = None, + credentials: t.Union[FileSystemCredentials, t.Dict[str, t.Any], t.Any] = None, + **kwargs: t.Any, + ) -> None: + """Configure the filesystem destination to use in a pipeline and load data to local or remote filesystem. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. 
+ + The `bucket_url` determines the protocol to be used: + + - Local folder: `file:///path/to/directory` + - AWS S3 (and S3 compatible storages): `s3://bucket-name` + - Azure Blob Storage: `az://container-name` + - Google Cloud Storage: `gs://bucket-name` + - Memory fs: `memory://m` + + Args: + bucket_url: The fsspec compatible bucket url to use for the destination. + credentials: Credentials to connect to the filesystem. The type of credentials should correspond to + the bucket protocol. For example, for AWS S3, the credentials should be an instance of `AwsCredentials`. + A dictionary with the credentials parameters can also be provided. + **kwargs: Additional arguments passed to the destination config + """ + super().__init__(bucket_url=bucket_url, credentials=credentials, **kwargs) diff --git a/dlt/destinations/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py similarity index 98% rename from dlt/destinations/filesystem/filesystem.py rename to dlt/destinations/impl/filesystem/filesystem.py index 766f384024..fe349aac6b 100644 --- a/dlt/destinations/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -12,8 +12,8 @@ from dlt.common.destination.reference import NewLoadJob, TLoadJobState, LoadJob, JobClientBase, FollowupJob, WithStagingDataset from dlt.destinations.job_impl import EmptyLoadJob -from dlt.destinations.filesystem import capabilities -from dlt.destinations.filesystem.configuration import FilesystemDestinationClientConfiguration +from dlt.destinations.impl.filesystem import capabilities +from dlt.destinations.impl.filesystem.configuration import FilesystemDestinationClientConfiguration from dlt.destinations.job_impl import NewReferenceJob from dlt.destinations import path_utils diff --git a/dlt/destinations/motherduck/__init__.py b/dlt/destinations/impl/motherduck/__init__.py similarity index 51% rename from dlt/destinations/motherduck/__init__.py rename to dlt/destinations/impl/motherduck/__init__.py index eae67eaa74..74c0e36ef3 100644 --- a/dlt/destinations/motherduck/__init__.py +++ b/dlt/destinations/impl/motherduck/__init__.py @@ -1,20 +1,7 @@ -from typing import Type - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config from dlt.common.data_writers.escape import escape_postgres_identifier, escape_duckdb_literal from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.destinations.motherduck.configuration import MotherDuckClientConfiguration - - -@with_config(spec=MotherDuckClientConfiguration, sections=(known_sections.DESTINATION, "motherduck",)) -def _configure(config: MotherDuckClientConfiguration = config.value) -> MotherDuckClientConfiguration: - return config - def capabilities() -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext() @@ -35,14 +22,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.supports_truncate_command = False return caps - - -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.motherduck.motherduck import MotherDuckClient - - return MotherDuckClient(schema,
_configure(initial_config)) # type: ignore - - -def spec() -> Type[DestinationClientConfiguration]: - return MotherDuckClientConfiguration diff --git a/dlt/destinations/motherduck/configuration.py b/dlt/destinations/impl/motherduck/configuration.py similarity index 97% rename from dlt/destinations/motherduck/configuration.py rename to dlt/destinations/impl/motherduck/configuration.py index 18d480c945..a376f1a5aa 100644 --- a/dlt/destinations/motherduck/configuration.py +++ b/dlt/destinations/impl/motherduck/configuration.py @@ -7,7 +7,7 @@ from dlt.common.utils import digest128 from dlt.common.configuration.exceptions import ConfigurationValueError -from dlt.destinations.duckdb.configuration import DuckDbBaseCredentials +from dlt.destinations.impl.duckdb.configuration import DuckDbBaseCredentials MOTHERDUCK_DRIVERNAME = "md" diff --git a/dlt/destinations/impl/motherduck/factory.py b/dlt/destinations/impl/motherduck/factory.py new file mode 100644 index 0000000000..17cf4a76b4 --- /dev/null +++ b/dlt/destinations/impl/motherduck/factory.py @@ -0,0 +1,41 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials, MotherDuckClientConfiguration +from dlt.destinations.impl.motherduck import capabilities + +if t.TYPE_CHECKING: + from duckdb import DuckDBPyConnection + from dlt.destinations.impl.motherduck.motherduck import MotherDuckClient + + +class motherduck(Destination[MotherDuckClientConfiguration, "MotherDuckClient"]): + + spec = MotherDuckClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["MotherDuckClient"]: + from dlt.destinations.impl.motherduck.motherduck import MotherDuckClient + + return MotherDuckClient + + def __init__( + self, + credentials: t.Union[MotherDuckCredentials, str, t.Dict[str, t.Any], "DuckDBPyConnection"] = None, + create_indexes: bool = False, + **kwargs: t.Any, + ) -> None: + """Configure the MotherDuck destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + credentials: Credentials to connect to the MotherDuck database. 
Can be an instance of `MotherDuckCredentials` or + a connection string in the format `md:///?token=` + create_indexes: Should unique indexes be created + **kwargs: Additional arguments passed to the destination config + """ + super().__init__(credentials=credentials, create_indexes=create_indexes, **kwargs) diff --git a/dlt/destinations/motherduck/motherduck.py b/dlt/destinations/impl/motherduck/motherduck.py similarity index 70% rename from dlt/destinations/motherduck/motherduck.py rename to dlt/destinations/impl/motherduck/motherduck.py index 93c0ed163b..9822f2b7b6 100644 --- a/dlt/destinations/motherduck/motherduck.py +++ b/dlt/destinations/impl/motherduck/motherduck.py @@ -4,10 +4,10 @@ from dlt.common.schema import Schema -from dlt.destinations.duckdb.duck import DuckDbClient -from dlt.destinations.motherduck import capabilities -from dlt.destinations.motherduck.sql_client import MotherDuckSqlClient -from dlt.destinations.motherduck.configuration import MotherDuckClientConfiguration +from dlt.destinations.impl.duckdb.duck import DuckDbClient +from dlt.destinations.impl.motherduck import capabilities +from dlt.destinations.impl.motherduck.sql_client import MotherDuckSqlClient +from dlt.destinations.impl.motherduck.configuration import MotherDuckClientConfiguration class MotherDuckClient(DuckDbClient): diff --git a/dlt/destinations/motherduck/sql_client.py b/dlt/destinations/impl/motherduck/sql_client.py similarity index 83% rename from dlt/destinations/motherduck/sql_client.py rename to dlt/destinations/impl/motherduck/sql_client.py index 2fc664a2e8..672c377fd9 100644 --- a/dlt/destinations/motherduck/sql_client.py +++ b/dlt/destinations/impl/motherduck/sql_client.py @@ -8,9 +8,9 @@ from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction, DataFrame from dlt.destinations.sql_client import SqlClientBase, DBApiCursorImpl, raise_database_error, raise_open_connection_error -from dlt.destinations.duckdb.sql_client import DuckDbSqlClient, DuckDBDBApiCursorImpl -from dlt.destinations.motherduck import capabilities -from dlt.destinations.motherduck.configuration import MotherDuckCredentials +from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient, DuckDBDBApiCursorImpl +from dlt.destinations.impl.motherduck import capabilities +from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials class MotherDuckSqlClient(DuckDbSqlClient): diff --git a/dlt/destinations/mssql/README.md b/dlt/destinations/impl/mssql/README.md similarity index 100% rename from dlt/destinations/mssql/README.md rename to dlt/destinations/impl/mssql/README.md diff --git a/dlt/destinations/mssql/__init__.py b/dlt/destinations/impl/mssql/__init__.py similarity index 57% rename from dlt/destinations/mssql/__init__.py rename to dlt/destinations/impl/mssql/__init__.py index 56051a324e..40e971cacf 100644 --- a/dlt/destinations/mssql/__init__.py +++ b/dlt/destinations/impl/mssql/__init__.py @@ -1,21 +1,8 @@ -from typing import Type - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config from dlt.common.data_writers.escape import escape_postgres_identifier, escape_mssql_literal from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE from dlt.common.wei import EVM_DECIMAL_PRECISION -from 
dlt.destinations.mssql.configuration import MsSqlClientConfiguration - - -@with_config(spec=MsSqlClientConfiguration, sections=(known_sections.DESTINATION, "mssql",)) -def _configure(config: MsSqlClientConfiguration = config.value) -> MsSqlClientConfiguration: - return config - def capabilities() -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext() @@ -39,14 +26,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.timestamp_precision = 7 return caps - - -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.mssql.mssql import MsSqlClient - - return MsSqlClient(schema, _configure(initial_config)) # type: ignore[arg-type] - - -def spec() -> Type[DestinationClientConfiguration]: - return MsSqlClientConfiguration diff --git a/dlt/destinations/mssql/configuration.py b/dlt/destinations/impl/mssql/configuration.py similarity index 100% rename from dlt/destinations/mssql/configuration.py rename to dlt/destinations/impl/mssql/configuration.py diff --git a/dlt/destinations/impl/mssql/factory.py b/dlt/destinations/impl/mssql/factory.py new file mode 100644 index 0000000000..c98531ca79 --- /dev/null +++ b/dlt/destinations/impl/mssql/factory.py @@ -0,0 +1,41 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +from dlt.destinations.impl.mssql.configuration import MsSqlCredentials, MsSqlClientConfiguration +from dlt.destinations.impl.mssql import capabilities + +if t.TYPE_CHECKING: + from dlt.destinations.impl.mssql.mssql import MsSqlClient + + +class mssql(Destination[MsSqlClientConfiguration, "MsSqlClient"]): + + spec = MsSqlClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["MsSqlClient"]: + from dlt.destinations.impl.mssql.mssql import MsSqlClient + + return MsSqlClient + + def __init__( + self, + credentials: t.Union[MsSqlCredentials, t.Dict[str, t.Any], str] = None, + create_indexes: bool = True, + **kwargs: t.Any, + ) -> None: + """Configure the MsSql destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + credentials: Credentials to connect to the mssql database. 
Can be an instance of `MsSqlCredentials` or + a connection string in the format `mssql://user:password@host:port/database` + create_indexes: Should unique indexes be created + **kwargs: Additional arguments passed to the destination config + """ + super().__init__(credentials=credentials, create_indexes=create_indexes, **kwargs) diff --git a/dlt/destinations/mssql/mssql.py b/dlt/destinations/impl/mssql/mssql.py similarity index 97% rename from dlt/destinations/mssql/mssql.py rename to dlt/destinations/impl/mssql/mssql.py index cd999441ff..851122f20c 100644 --- a/dlt/destinations/mssql/mssql.py +++ b/dlt/destinations/impl/mssql/mssql.py @@ -12,9 +12,9 @@ from dlt.destinations.insert_job_client import InsertValuesJobClient -from dlt.destinations.mssql import capabilities -from dlt.destinations.mssql.sql_client import PyOdbcMsSqlClient -from dlt.destinations.mssql.configuration import MsSqlClientConfiguration +from dlt.destinations.impl.mssql import capabilities +from dlt.destinations.impl.mssql.sql_client import PyOdbcMsSqlClient +from dlt.destinations.impl.mssql.configuration import MsSqlClientConfiguration from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.type_mapping import TypeMapper diff --git a/dlt/destinations/mssql/sql_client.py b/dlt/destinations/impl/mssql/sql_client.py similarity index 97% rename from dlt/destinations/mssql/sql_client.py rename to dlt/destinations/impl/mssql/sql_client.py index 4dd983a334..5372fa3626 100644 --- a/dlt/destinations/mssql/sql_client.py +++ b/dlt/destinations/impl/mssql/sql_client.py @@ -13,8 +13,8 @@ from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction from dlt.destinations.sql_client import DBApiCursorImpl, SqlClientBase, raise_database_error, raise_open_connection_error -from dlt.destinations.mssql.configuration import MsSqlCredentials -from dlt.destinations.mssql import capabilities +from dlt.destinations.impl.mssql.configuration import MsSqlCredentials +from dlt.destinations.impl.mssql import capabilities def handle_datetimeoffset(dto_value: bytes) -> datetime: diff --git a/dlt/destinations/postgres/README.md b/dlt/destinations/impl/postgres/README.md similarity index 100% rename from dlt/destinations/postgres/README.md rename to dlt/destinations/impl/postgres/README.md diff --git a/dlt/destinations/postgres/__init__.py b/dlt/destinations/impl/postgres/__init__.py similarity index 58% rename from dlt/destinations/postgres/__init__.py rename to dlt/destinations/impl/postgres/__init__.py index e8904c075f..009174ecc9 100644 --- a/dlt/destinations/postgres/__init__.py +++ b/dlt/destinations/impl/postgres/__init__.py @@ -1,20 +1,9 @@ -from typing import Type - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config from dlt.common.data_writers.escape import escape_postgres_identifier, escape_postgres_literal from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE from dlt.common.wei import EVM_DECIMAL_PRECISION -from dlt.destinations.postgres.configuration import PostgresClientConfiguration - - -@with_config(spec=PostgresClientConfiguration, sections=(known_sections.DESTINATION, "postgres",)) -def _configure(config: PostgresClientConfiguration = config.value) -> PostgresClientConfiguration: - return config def 
capabilities() -> DestinationCapabilitiesContext: @@ -39,12 +28,3 @@ def capabilities() -> DestinationCapabilitiesContext: return caps -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.postgres.postgres import PostgresClient - - return PostgresClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[DestinationClientConfiguration]: - return PostgresClientConfiguration diff --git a/dlt/destinations/postgres/configuration.py b/dlt/destinations/impl/postgres/configuration.py similarity index 100% rename from dlt/destinations/postgres/configuration.py rename to dlt/destinations/impl/postgres/configuration.py diff --git a/dlt/destinations/impl/postgres/factory.py b/dlt/destinations/impl/postgres/factory.py new file mode 100644 index 0000000000..33971eb642 --- /dev/null +++ b/dlt/destinations/impl/postgres/factory.py @@ -0,0 +1,41 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +from dlt.destinations.impl.postgres.configuration import PostgresCredentials, PostgresClientConfiguration +from dlt.destinations.impl.postgres import capabilities + +if t.TYPE_CHECKING: + from dlt.destinations.impl.postgres.postgres import PostgresClient + + +class postgres(Destination[PostgresClientConfiguration, "PostgresClient"]): + + spec = PostgresClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["PostgresClient"]: + from dlt.destinations.impl.postgres.postgres import PostgresClient + + return PostgresClient + + def __init__( + self, + credentials: t.Union[PostgresCredentials, t.Dict[str, t.Any], str] = None, + create_indexes: bool = True, + **kwargs: t.Any, + ) -> None: + """Configure the Postgres destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + credentials: Credentials to connect to the postgres database. 
Can be an instance of `PostgresCredentials` or + a connection string in the format `postgres://user:password@host:port/database` + create_indexes: Should unique indexes be created + **kwargs: Additional arguments passed to the destination config + """ + super().__init__(credentials=credentials, create_indexes=create_indexes, **kwargs) diff --git a/dlt/destinations/postgres/postgres.py b/dlt/destinations/impl/postgres/postgres.py similarity index 95% rename from dlt/destinations/postgres/postgres.py rename to dlt/destinations/impl/postgres/postgres.py index 2812d1d4c4..03c42f4d75 100644 --- a/dlt/destinations/postgres/postgres.py +++ b/dlt/destinations/impl/postgres/postgres.py @@ -11,9 +11,9 @@ from dlt.destinations.insert_job_client import InsertValuesJobClient -from dlt.destinations.postgres import capabilities -from dlt.destinations.postgres.sql_client import Psycopg2SqlClient -from dlt.destinations.postgres.configuration import PostgresClientConfiguration +from dlt.destinations.impl.postgres import capabilities +from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient +from dlt.destinations.impl.postgres.configuration import PostgresClientConfiguration from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.type_mapping import TypeMapper diff --git a/dlt/destinations/postgres/sql_client.py b/dlt/destinations/impl/postgres/sql_client.py similarity index 97% rename from dlt/destinations/postgres/sql_client.py rename to dlt/destinations/impl/postgres/sql_client.py index 079a0ae477..b6c4c1a1be 100644 --- a/dlt/destinations/postgres/sql_client.py +++ b/dlt/destinations/impl/postgres/sql_client.py @@ -16,8 +16,8 @@ from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction from dlt.destinations.sql_client import DBApiCursorImpl, SqlClientBase, raise_database_error, raise_open_connection_error -from dlt.destinations.postgres.configuration import PostgresCredentials -from dlt.destinations.postgres import capabilities +from dlt.destinations.impl.postgres.configuration import PostgresCredentials +from dlt.destinations.impl.postgres import capabilities class Psycopg2SqlClient(SqlClientBase["psycopg2.connection"], DBTransaction): diff --git a/dlt/destinations/impl/qdrant/__init__.py b/dlt/destinations/impl/qdrant/__init__.py new file mode 100644 index 0000000000..1a2c466b14 --- /dev/null +++ b/dlt/destinations/impl/qdrant/__init__.py @@ -0,0 +1,18 @@ +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.destinations.impl.qdrant.qdrant_adapter import qdrant_adapter + + +def capabilities() -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "jsonl" + caps.supported_loader_file_formats = ["jsonl"] + + caps.max_identifier_length = 200 + caps.max_column_identifier_length = 1024 + caps.max_query_length = 8 * 1024 * 1024 + caps.is_max_query_length_in_bytes = False + caps.max_text_data_type_length = 8 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = False + caps.supports_ddl_transactions = False + + return caps diff --git a/dlt/destinations/qdrant/configuration.py b/dlt/destinations/impl/qdrant/configuration.py similarity index 100% rename from dlt/destinations/qdrant/configuration.py rename to dlt/destinations/impl/qdrant/configuration.py diff --git a/dlt/destinations/impl/qdrant/factory.py b/dlt/destinations/impl/qdrant/factory.py new file mode 100644 index 0000000000..316b5ae434 --- /dev/null +++ b/dlt/destinations/impl/qdrant/factory.py @@ -0,0 +1,30 @@ 
+import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +from dlt.destinations.impl.qdrant.configuration import QdrantCredentials, QdrantClientConfiguration +from dlt.destinations.impl.qdrant import capabilities + +if t.TYPE_CHECKING: + from dlt.destinations.impl.qdrant.qdrant_client import QdrantClient + + +class qdrant(Destination[QdrantClientConfiguration, "QdrantClient"]): + + spec = QdrantClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["QdrantClient"]: + from dlt.destinations.impl.qdrant.qdrant_client import QdrantClient + + return QdrantClient + + def __init__( + self, + credentials: t.Union[QdrantCredentials, t.Dict[str, t.Any]] = None, + **kwargs: t.Any, + ) -> None: + super().__init__(credentials=credentials, **kwargs) diff --git a/dlt/destinations/qdrant/qdrant_adapter.py b/dlt/destinations/impl/qdrant/qdrant_adapter.py similarity index 100% rename from dlt/destinations/qdrant/qdrant_adapter.py rename to dlt/destinations/impl/qdrant/qdrant_adapter.py diff --git a/dlt/destinations/qdrant/qdrant_client.py b/dlt/destinations/impl/qdrant/qdrant_client.py similarity index 98% rename from dlt/destinations/qdrant/qdrant_client.py rename to dlt/destinations/impl/qdrant/qdrant_client.py index cba87e9528..029530d624 100644 --- a/dlt/destinations/qdrant/qdrant_client.py +++ b/dlt/destinations/impl/qdrant/qdrant_client.py @@ -11,9 +11,9 @@ from dlt.destinations.job_impl import EmptyLoadJob from dlt.destinations.job_client_impl import StorageSchemaInfo, StateInfo -from dlt.destinations.qdrant import capabilities -from dlt.destinations.qdrant.configuration import QdrantClientConfiguration -from dlt.destinations.qdrant.qdrant_adapter import VECTORIZE_HINT +from dlt.destinations.impl.qdrant import capabilities +from dlt.destinations.impl.qdrant.configuration import QdrantClientConfiguration +from dlt.destinations.impl.qdrant.qdrant_adapter import VECTORIZE_HINT from qdrant_client import QdrantClient as QC, models from qdrant_client.qdrant_fastembed import uuid @@ -406,4 +406,4 @@ def _collection_exists(self, table_name: str, qualify_table_name: bool = True) - except UnexpectedResponse as e: if e.status_code == 404: return False - raise e \ No newline at end of file + raise e diff --git a/dlt/destinations/redshift/README.md b/dlt/destinations/impl/redshift/README.md similarity index 100% rename from dlt/destinations/redshift/README.md rename to dlt/destinations/impl/redshift/README.md diff --git a/dlt/destinations/redshift/__init__.py b/dlt/destinations/impl/redshift/__init__.py similarity index 52% rename from dlt/destinations/redshift/__init__.py rename to dlt/destinations/impl/redshift/__init__.py index 96741e86cd..8a8cae84b4 100644 --- a/dlt/destinations/redshift/__init__.py +++ b/dlt/destinations/impl/redshift/__init__.py @@ -1,20 +1,7 @@ -from typing import Type - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config from dlt.common.data_writers.escape import escape_redshift_identifier, escape_redshift_literal from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.destinations.redshift.configuration import RedshiftClientConfiguration - 
- -@with_config(spec=RedshiftClientConfiguration, sections=(known_sections.DESTINATION, "redshift",)) -def _configure(config: RedshiftClientConfiguration = config.value) -> RedshiftClientConfiguration: - return config - def capabilities() -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext() @@ -36,14 +23,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.alter_add_multi_column = False return caps - - -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.redshift.redshift import RedshiftClient - - return RedshiftClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[DestinationClientConfiguration]: - return RedshiftClientConfiguration diff --git a/dlt/destinations/redshift/configuration.py b/dlt/destinations/impl/redshift/configuration.py similarity index 88% rename from dlt/destinations/redshift/configuration.py rename to dlt/destinations/impl/redshift/configuration.py index 7cb13b996f..7018445773 100644 --- a/dlt/destinations/redshift/configuration.py +++ b/dlt/destinations/impl/redshift/configuration.py @@ -4,7 +4,7 @@ from dlt.common.configuration import configspec from dlt.common.utils import digest128 -from dlt.destinations.postgres.configuration import PostgresCredentials, PostgresClientConfiguration +from dlt.destinations.impl.postgres.configuration import PostgresCredentials, PostgresClientConfiguration @configspec diff --git a/dlt/destinations/impl/redshift/factory.py b/dlt/destinations/impl/redshift/factory.py new file mode 100644 index 0000000000..7648b35851 --- /dev/null +++ b/dlt/destinations/impl/redshift/factory.py @@ -0,0 +1,45 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +from dlt.destinations.impl.redshift.configuration import RedshiftCredentials, RedshiftClientConfiguration +from dlt.destinations.impl.redshift import capabilities + +if t.TYPE_CHECKING: + from dlt.destinations.impl.redshift.redshift import RedshiftClient + + +class redshift(Destination[RedshiftClientConfiguration, "RedshiftClient"]): + + spec = RedshiftClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["RedshiftClient"]: + from dlt.destinations.impl.redshift.redshift import RedshiftClient + + return RedshiftClient + + def __init__( + self, + credentials: t.Union[RedshiftCredentials, t.Dict[str, t.Any], str] = None, + create_indexes: bool = True, + staging_iam_role: t.Optional[str] = None, + **kwargs: t.Any, + ) -> None: + """Configure the Redshift destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + credentials: Credentials to connect to the redshift database. 
Can be an instance of `RedshiftCredentials` or + a connection string in the format `redshift://user:password@host:port/database` + create_indexes: Should unique indexes be created + staging_iam_role: IAM role to use for staging data in S3 + **kwargs: Additional arguments passed to the destination config + """ + super().__init__( + credentials=credentials, create_indexes=create_indexes, staging_iam_role=staging_iam_role, **kwargs + ) diff --git a/dlt/destinations/redshift/redshift.py b/dlt/destinations/impl/redshift/redshift.py similarity index 97% rename from dlt/destinations/redshift/redshift.py rename to dlt/destinations/impl/redshift/redshift.py index 888f27ae7c..2124807bc1 100644 --- a/dlt/destinations/redshift/redshift.py +++ b/dlt/destinations/impl/redshift/redshift.py @@ -1,7 +1,7 @@ import platform import os -from dlt.destinations.postgres.sql_client import Psycopg2SqlClient +from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient from dlt.common.schema.utils import table_schema_has_type, table_schema_has_type_with_precision if platform.python_implementation() == "PyPy": @@ -25,8 +25,8 @@ from dlt.destinations.exceptions import DatabaseTerminalException, LoadJobTerminalException from dlt.destinations.job_client_impl import CopyRemoteFileLoadJob, LoadJob -from dlt.destinations.redshift import capabilities -from dlt.destinations.redshift.configuration import RedshiftClientConfiguration +from dlt.destinations.impl.redshift import capabilities +from dlt.destinations.impl.redshift.configuration import RedshiftClientConfiguration from dlt.destinations.job_impl import NewReferenceJob from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.type_mapping import TypeMapper diff --git a/dlt/destinations/snowflake/__init__.py b/dlt/destinations/impl/snowflake/__init__.py similarity index 52% rename from dlt/destinations/snowflake/__init__.py rename to dlt/destinations/impl/snowflake/__init__.py index 5d32bc41fd..12e118eeab 100644 --- a/dlt/destinations/snowflake/__init__.py +++ b/dlt/destinations/impl/snowflake/__init__.py @@ -1,20 +1,8 @@ -from typing import Type from dlt.common.data_writers.escape import escape_bigquery_identifier - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration from dlt.common.data_writers.escape import escape_snowflake_identifier from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.destinations.snowflake.configuration import SnowflakeClientConfiguration - - -@with_config(spec=SnowflakeClientConfiguration, sections=(known_sections.DESTINATION, "snowflake",)) -def _configure(config: SnowflakeClientConfiguration = config.value) -> SnowflakeClientConfiguration: - return config def capabilities() -> DestinationCapabilitiesContext: @@ -35,14 +23,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.supports_ddl_transactions = True caps.alter_add_multi_column = True return caps - - -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.snowflake.snowflake import SnowflakeClient - - return SnowflakeClient(schema, 
_configure(initial_config)) # type: ignore - - -def spec() -> Type[DestinationClientConfiguration]: - return SnowflakeClientConfiguration diff --git a/dlt/destinations/snowflake/configuration.py b/dlt/destinations/impl/snowflake/configuration.py similarity index 100% rename from dlt/destinations/snowflake/configuration.py rename to dlt/destinations/impl/snowflake/configuration.py diff --git a/dlt/destinations/impl/snowflake/factory.py b/dlt/destinations/impl/snowflake/factory.py new file mode 100644 index 0000000000..1201f406b0 --- /dev/null +++ b/dlt/destinations/impl/snowflake/factory.py @@ -0,0 +1,41 @@ +import typing as t + +from dlt.destinations.impl.snowflake.configuration import SnowflakeCredentials, SnowflakeClientConfiguration +from dlt.destinations.impl.snowflake import capabilities +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +if t.TYPE_CHECKING: + from dlt.destinations.impl.snowflake.snowflake import SnowflakeClient + + +class snowflake(Destination[SnowflakeClientConfiguration, "SnowflakeClient"]): + + spec = SnowflakeClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["SnowflakeClient"]: + from dlt.destinations.impl.snowflake.snowflake import SnowflakeClient + + return SnowflakeClient + + def __init__( + self, + credentials: t.Union[SnowflakeCredentials, t.Dict[str, t.Any], str] = None, + stage_name: t.Optional[str] = None, + keep_staged_files: bool = True, + **kwargs: t.Any, + ) -> None: + """Configure the Snowflake destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + credentials: Credentials to connect to the snowflake database. Can be an instance of `SnowflakeCredentials` or + a connection string in the format `snowflake://user:password@host:port/database` + stage_name: Name of an existing stage to use for loading data. 
Default uses implicit stage per table + keep_staged_files: Whether to delete or keep staged files after loading + """ + super().__init__(credentials=credentials, stage_name=stage_name, keep_staged_files=keep_staged_files, **kwargs) diff --git a/dlt/destinations/snowflake/snowflake.py b/dlt/destinations/impl/snowflake/snowflake.py similarity index 97% rename from dlt/destinations/snowflake/snowflake.py rename to dlt/destinations/impl/snowflake/snowflake.py index f433ec7e7d..ead3e810d2 100644 --- a/dlt/destinations/snowflake/snowflake.py +++ b/dlt/destinations/impl/snowflake/snowflake.py @@ -14,11 +14,11 @@ from dlt.destinations.job_impl import EmptyLoadJob from dlt.destinations.exceptions import LoadJobTerminalException -from dlt.destinations.snowflake import capabilities -from dlt.destinations.snowflake.configuration import SnowflakeClientConfiguration -from dlt.destinations.snowflake.sql_client import SnowflakeSqlClient +from dlt.destinations.impl.snowflake import capabilities +from dlt.destinations.impl.snowflake.configuration import SnowflakeClientConfiguration +from dlt.destinations.impl.snowflake.sql_client import SnowflakeSqlClient from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlJobParams -from dlt.destinations.snowflake.sql_client import SnowflakeSqlClient +from dlt.destinations.impl.snowflake.sql_client import SnowflakeSqlClient from dlt.destinations.job_impl import NewReferenceJob from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.type_mapping import TypeMapper diff --git a/dlt/destinations/snowflake/sql_client.py b/dlt/destinations/impl/snowflake/sql_client.py similarity index 98% rename from dlt/destinations/snowflake/sql_client.py rename to dlt/destinations/impl/snowflake/sql_client.py index 40cdc990a0..139a5ebb7a 100644 --- a/dlt/destinations/snowflake/sql_client.py +++ b/dlt/destinations/impl/snowflake/sql_client.py @@ -7,8 +7,8 @@ from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation from dlt.destinations.sql_client import DBApiCursorImpl, SqlClientBase, raise_database_error, raise_open_connection_error from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction, DataFrame -from dlt.destinations.snowflake.configuration import SnowflakeCredentials -from dlt.destinations.snowflake import capabilities +from dlt.destinations.impl.snowflake.configuration import SnowflakeCredentials +from dlt.destinations.impl.snowflake import capabilities class SnowflakeCursorImpl(DBApiCursorImpl): native_cursor: snowflake_lib.cursor.SnowflakeCursor # type: ignore[assignment] diff --git a/dlt/destinations/weaviate/README.md b/dlt/destinations/impl/weaviate/README.md similarity index 100% rename from dlt/destinations/weaviate/README.md rename to dlt/destinations/impl/weaviate/README.md diff --git a/dlt/destinations/impl/weaviate/__init__.py b/dlt/destinations/impl/weaviate/__init__.py new file mode 100644 index 0000000000..143e0260d2 --- /dev/null +++ b/dlt/destinations/impl/weaviate/__init__.py @@ -0,0 +1,19 @@ +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.destinations.impl.weaviate.weaviate_adapter import weaviate_adapter + + +def capabilities() -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "jsonl" + caps.supported_loader_file_formats = ["jsonl"] + + caps.max_identifier_length = 200 + caps.max_column_identifier_length = 1024 + caps.max_query_length = 8 * 1024 * 1024 + 
caps.is_max_query_length_in_bytes = False + caps.max_text_data_type_length = 8 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = False + caps.supports_ddl_transactions = False + caps.naming_convention = "dlt.destinations.impl.weaviate.naming" + + return caps diff --git a/dlt/destinations/weaviate/ci_naming.py b/dlt/destinations/impl/weaviate/ci_naming.py similarity index 100% rename from dlt/destinations/weaviate/ci_naming.py rename to dlt/destinations/impl/weaviate/ci_naming.py diff --git a/dlt/destinations/weaviate/configuration.py b/dlt/destinations/impl/weaviate/configuration.py similarity index 100% rename from dlt/destinations/weaviate/configuration.py rename to dlt/destinations/impl/weaviate/configuration.py diff --git a/dlt/destinations/weaviate/exceptions.py b/dlt/destinations/impl/weaviate/exceptions.py similarity index 100% rename from dlt/destinations/weaviate/exceptions.py rename to dlt/destinations/impl/weaviate/exceptions.py diff --git a/dlt/destinations/impl/weaviate/factory.py b/dlt/destinations/impl/weaviate/factory.py new file mode 100644 index 0000000000..b29d02b1a7 --- /dev/null +++ b/dlt/destinations/impl/weaviate/factory.py @@ -0,0 +1,47 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +from dlt.destinations.impl.weaviate.configuration import WeaviateCredentials, WeaviateClientConfiguration +from dlt.destinations.impl.weaviate import capabilities + +if t.TYPE_CHECKING: + from dlt.destinations.impl.weaviate.weaviate_client import WeaviateClient + + +class weaviate(Destination[WeaviateClientConfiguration, "WeaviateClient"]): + + spec = WeaviateClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["WeaviateClient"]: + from dlt.destinations.impl.weaviate.weaviate_client import WeaviateClient + + return WeaviateClient + + def __init__( + self, + credentials: t.Union[WeaviateCredentials, t.Dict[str, t.Any]] = None, + vectorizer: str = None, + module_config: t.Dict[str, t.Dict[str, str]] = None, + **kwargs: t.Any, + ) -> None: + """Configure the Weaviate destination to use in a pipeline. + + All destination config parameters can be provided as arguments here and will supersede other config sources (such as dlt config files and environment variables). 
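As a quick usage sketch of the factory pattern these new classes implement (assuming, as the tests in this patch do for `duckdb` and `filesystem`, that the factories are re-exported from `dlt.destinations`; credential values are illustrative only):

```python
import dlt
from dlt.destinations import postgres  # assumed re-export, mirroring `from dlt.destinations import duckdb` used in the tests

# arguments given to the factory supersede config.toml and environment variables
dest = postgres(
    credentials="postgres://loader:secret@localhost:5432/dlt_data",  # or a PostgresCredentials instance
    create_indexes=False,
)

pipeline = dlt.pipeline(pipeline_name="factory_example", destination=dest, dataset_name="example_data")
```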
+ + Args: + credentials: Weaviate credentials containing URL, API key and optional headers + vectorizer: The name of the Weaviate vectorizer to use + module_config: The configuration for the Weaviate modules + **kwargs: Additional arguments forwarded to the destination config + """ + super().__init__( + credentials=credentials, + vectorizer=vectorizer, + module_config=module_config, + **kwargs + ) diff --git a/dlt/destinations/weaviate/naming.py b/dlt/destinations/impl/weaviate/naming.py similarity index 100% rename from dlt/destinations/weaviate/naming.py rename to dlt/destinations/impl/weaviate/naming.py diff --git a/dlt/destinations/weaviate/weaviate_adapter.py b/dlt/destinations/impl/weaviate/weaviate_adapter.py similarity index 100% rename from dlt/destinations/weaviate/weaviate_adapter.py rename to dlt/destinations/impl/weaviate/weaviate_adapter.py diff --git a/dlt/destinations/weaviate/weaviate_client.py b/dlt/destinations/impl/weaviate/weaviate_client.py similarity index 98% rename from dlt/destinations/weaviate/weaviate_client.py rename to dlt/destinations/impl/weaviate/weaviate_client.py index d47f08ab59..099cdc7368 100644 --- a/dlt/destinations/weaviate/weaviate_client.py +++ b/dlt/destinations/impl/weaviate/weaviate_client.py @@ -41,13 +41,13 @@ from dlt.common.data_types import TDataType from dlt.common.storages import FileStorage -from dlt.destinations.weaviate.weaviate_adapter import VECTORIZE_HINT, TOKENIZATION_HINT +from dlt.destinations.impl.weaviate.weaviate_adapter import VECTORIZE_HINT, TOKENIZATION_HINT from dlt.destinations.job_impl import EmptyLoadJob from dlt.destinations.job_client_impl import StorageSchemaInfo, StateInfo -from dlt.destinations.weaviate import capabilities -from dlt.destinations.weaviate.configuration import WeaviateClientConfiguration -from dlt.destinations.weaviate.exceptions import PropertyNameConflict, WeaviateBatchError +from dlt.destinations.impl.weaviate import capabilities +from dlt.destinations.impl.weaviate.configuration import WeaviateClientConfiguration +from dlt.destinations.impl.weaviate.exceptions import PropertyNameConflict, WeaviateBatchError from dlt.destinations.type_mapping import TypeMapper diff --git a/dlt/destinations/qdrant/__init__.py b/dlt/destinations/qdrant/__init__.py deleted file mode 100644 index 7a8619ffcd..0000000000 --- a/dlt/destinations/qdrant/__init__.py +++ /dev/null @@ -1,53 +0,0 @@ -from typing import Type - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config -from dlt.common.destination.reference import ( - JobClientBase, - DestinationClientConfiguration, -) -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.destinations.qdrant.qdrant_adapter import qdrant_adapter - -from dlt.destinations.qdrant.configuration import QdrantClientConfiguration - - -@with_config( - spec=QdrantClientConfiguration, - sections=( - known_sections.DESTINATION, - "qdrant", - ), -) -def _configure( - config: QdrantClientConfiguration = config.value, -) -> QdrantClientConfiguration: - return config - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = "jsonl" - caps.supported_loader_file_formats = ["jsonl"] - - caps.max_identifier_length = 200 - caps.max_column_identifier_length = 1024 - caps.max_query_length = 8 * 1024 * 1024 - caps.is_max_query_length_in_bytes = False - caps.max_text_data_type_length = 8 * 1024 
* 1024 - caps.is_max_text_data_type_length_in_bytes = False - caps.supports_ddl_transactions = False - - return caps - - -def client( - schema: Schema, initial_config: DestinationClientConfiguration = config.value -) -> JobClientBase: - from dlt.destinations.qdrant.qdrant_client import QdrantClient - return QdrantClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[QdrantClientConfiguration]: - return QdrantClientConfiguration diff --git a/dlt/destinations/weaviate/__init__.py b/dlt/destinations/weaviate/__init__.py deleted file mode 100644 index ebd87aea0c..0000000000 --- a/dlt/destinations/weaviate/__init__.py +++ /dev/null @@ -1,55 +0,0 @@ -from typing import Type - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config -from dlt.common.destination.reference import ( - JobClientBase, - DestinationClientConfiguration, -) -from dlt.common.destination import DestinationCapabilitiesContext - -from dlt.destinations.weaviate.weaviate_adapter import weaviate_adapter -from dlt.destinations.weaviate.configuration import WeaviateClientConfiguration - - -@with_config( - spec=WeaviateClientConfiguration, - sections=( - known_sections.DESTINATION, - "weaviate", - ), -) -def _configure( - config: WeaviateClientConfiguration = config.value, -) -> WeaviateClientConfiguration: - return config - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = "jsonl" - caps.supported_loader_file_formats = ["jsonl"] - - caps.max_identifier_length = 200 - caps.max_column_identifier_length = 1024 - caps.max_query_length = 8 * 1024 * 1024 - caps.is_max_query_length_in_bytes = False - caps.max_text_data_type_length = 8 * 1024 * 1024 - caps.is_max_text_data_type_length_in_bytes = False - caps.supports_ddl_transactions = False - caps.naming_convention = "dlt.destinations.weaviate.naming" - - return caps - - -def client( - schema: Schema, initial_config: DestinationClientConfiguration = config.value -) -> JobClientBase: - from dlt.destinations.weaviate.weaviate_client import WeaviateClient - - return WeaviateClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[WeaviateClientConfiguration]: - return WeaviateClientConfiguration diff --git a/dlt/helpers/streamlit_helper.py b/dlt/helpers/streamlit_helper.py index 7921e4e2e1..e43e794bf6 100644 --- a/dlt/helpers/streamlit_helper.py +++ b/dlt/helpers/streamlit_helper.py @@ -120,7 +120,7 @@ def _query_data_live(query: str, schema_name: str = None) -> pd.DataFrame: schema_names = ", ".join(sorted(pipeline.schema_names)) st.markdown(f""" * pipeline name: **{pipeline.pipeline_name}** - * destination: **{str(credentials)}** in **{pipeline.destination.__name__}** + * destination: **{str(credentials)}** in **{pipeline.destination.name}** * dataset name: **{pipeline.dataset_name}** * default schema name: **{pipeline.default_schema_name}** * all schema names: **{schema_names}** diff --git a/dlt/load/load.py b/dlt/load/load.py index beae130789..725f8589f5 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -20,7 +20,7 @@ from dlt.common.schema import Schema, TSchemaTables from dlt.common.schema.typing import TTableSchema, TWriteDisposition from dlt.common.storages import LoadStorage -from dlt.common.destination.reference import DestinationClientDwhConfiguration, FollowupJob, JobClientBase, WithStagingDataset, DestinationReference, 
LoadJob, NewLoadJob, TLoadJobState, DestinationClientConfiguration, SupportsStagingDestination +from dlt.common.destination.reference import DestinationClientDwhConfiguration, FollowupJob, JobClientBase, WithStagingDataset, Destination, LoadJob, NewLoadJob, TLoadJobState, DestinationClientConfiguration, SupportsStagingDestination, TDestination from dlt.destinations.job_impl import EmptyLoadJob @@ -34,8 +34,8 @@ class Load(Runnable[Executor]): @with_config(spec=LoaderConfiguration, sections=(known_sections.LOAD,)) def __init__( self, - destination: DestinationReference, - staging_destination: DestinationReference = None, + destination: TDestination, + staging_destination: TDestination = None, collector: Collector = NULL_COLLECTOR, is_storage_owner: bool = False, config: LoaderConfiguration = config.value, @@ -54,7 +54,6 @@ def __init__( self._processed_load_ids: Dict[str, str] = {} """Load ids to dataset name""" - def create_storage(self, is_storage_owner: bool) -> LoadStorage: supported_file_formats = self.capabilities.supported_loader_file_formats if self.staging_destination: diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index 71c37c40ba..af7dd12294 100644 --- a/dlt/pipeline/__init__.py +++ b/dlt/pipeline/__init__.py @@ -7,12 +7,13 @@ from dlt.common.configuration import with_config from dlt.common.configuration.container import Container from dlt.common.configuration.inject import get_orig_args, last_config -from dlt.common.destination.reference import DestinationReference, TDestinationReferenceArg +from dlt.common.destination import Destination, TDestinationReferenceArg from dlt.common.pipeline import LoadInfo, PipelineContext, get_dlt_pipelines_dir from dlt.pipeline.configuration import PipelineConfiguration, ensure_correct_pipeline_kwargs from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.progress import _from_name as collector_from_name, TCollectorArg, _NULL_COLLECTOR +from dlt.pipeline.deprecations import credentials_argument_deprecated @overload @@ -104,6 +105,8 @@ def pipeline( # is any of the arguments different from defaults has_arguments = bool(orig_args[0]) or any(orig_args[1].values()) + credentials_argument_deprecated("pipeline", credentials, destination) + if not has_arguments: context = Container()[PipelineContext] # if pipeline instance is already active then return it, otherwise create a new one @@ -116,8 +119,8 @@ def pipeline( if not pipelines_dir: pipelines_dir = get_dlt_pipelines_dir() - destination = DestinationReference.from_name(destination or kwargs["destination_name"]) - staging = DestinationReference.from_name(staging or kwargs.get("staging_name", None)) if staging is not None else None + destination = Destination.from_reference(destination or kwargs["destination_name"]) + staging = Destination.from_reference(staging or kwargs.get("staging_name", None)) if staging is not None else None progress = collector_from_name(progress) # create new pipeline instance @@ -224,7 +227,7 @@ def run( Returns: LoadInfo: Information on loaded data including the list of package ids and failed job statuses. Please not that `dlt` will not raise if a single job terminally fails. Such information is provided via LoadInfo. 
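The hunk below swaps `DestinationReference.from_name` for `Destination.from_reference`, which also forwards explicit credentials. A minimal sketch of the resolved object, based only on the calls visible in this patch and in `tests/common/test_destination.py` (illustrative, not the full API; the connection string is a placeholder):

```python
from dlt.common.destination import Destination

# resolve a destination factory from a short name; an unknown name raises UnknownDestinationModule
dest = Destination.from_reference("duckdb")

assert dest.name == "duckdb"   # replaces the old module-based __name__ check
caps = dest.capabilities()     # a DestinationCapabilitiesContext instance
spec = dest.spec               # the configuration spec is now an attribute rather than spec()

# credentials passed to pipeline()/run() are forwarded to the factory during resolution
dest_with_creds = Destination.from_reference(
    "postgres", credentials="postgres://loader:secret@localhost:5432/dlt_data"
)
```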
""" - destination = DestinationReference.from_name(destination) + destination = Destination.from_reference(destination, credentials=credentials) return pipeline().run( data, destination=destination, diff --git a/dlt/pipeline/deprecations.py b/dlt/pipeline/deprecations.py new file mode 100644 index 0000000000..138167c8d3 --- /dev/null +++ b/dlt/pipeline/deprecations.py @@ -0,0 +1,20 @@ +import typing as t +import warnings + +from dlt.common.destination import Destination, TDestinationReferenceArg + + +def credentials_argument_deprecated( + caller_name: str, credentials: t.Optional[t.Any], destination: TDestinationReferenceArg = None +) -> None: + if credentials is None: + return + + dest_name = Destination.to_name(destination) if destination else "postgres" + + warnings.warn( + f"The `credentials argument` to {caller_name} is deprecated and will be removed in a future version. " + f"Pass the same credentials to the `destination` instance instead, e.g. {caller_name}(destination=dlt.destinations.{dest_name}(credentials=...))", + DeprecationWarning, + stacklevel=2, + ) diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index b948ad8040..465eccfdb6 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -24,8 +24,8 @@ from dlt.common.typing import TFun, TSecretValue, is_optional_type from dlt.common.runners import pool_runner as runner from dlt.common.storages import LiveSchemaStorage, NormalizeStorage, LoadStorage, SchemaStorage, FileStorage, NormalizeStorageConfiguration, SchemaStorageConfiguration, LoadStorageConfiguration -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import (DestinationClientDwhConfiguration, WithStateSync, DestinationReference, JobClientBase, DestinationClientConfiguration, +from dlt.common.destination import DestinationCapabilitiesContext, TDestination +from dlt.common.destination.reference import (DestinationClientDwhConfiguration, WithStateSync, Destination, JobClientBase, DestinationClientConfiguration, TDestinationReferenceArg, DestinationClientStagingConfiguration, DestinationClientStagingConfiguration, DestinationClientDwhWithStagingConfiguration) from dlt.common.destination.capabilities import INTERNAL_LOADER_FILE_FORMATS @@ -52,6 +52,7 @@ from dlt.pipeline.state_sync import STATE_ENGINE_VERSION, load_state_from_destination, merge_state_if_changed, migrate_state, state_resource, json_encode_state, json_decode_state from dlt.common.schema.utils import normalize_schema_name +from dlt.pipeline.deprecations import credentials_argument_deprecated def with_state_sync(may_extract_state: bool = False) -> Callable[[TFun], TFun]: @@ -166,9 +167,9 @@ class Pipeline(SupportsPipeline): """A directory where the pipelines' working directories are created""" working_dir: str """A working directory of the pipeline""" - destination: DestinationReference = None - staging: DestinationReference = None - """The destination reference which is ModuleType. `destination.__name__` returns the name string""" + destination: TDestination = None + staging: TDestination = None + """The destination reference which is ModuleType. 
`destination.name` returns the name string""" dataset_name: str = None """Name of the dataset to which pipeline will be loaded to""" credentials: Any = None @@ -183,8 +184,8 @@ def __init__( pipeline_name: str, pipelines_dir: str, pipeline_salt: TSecretValue, - destination: DestinationReference, - staging: DestinationReference, + destination: TDestination, + staging: TDestination, dataset_name: str, credentials: Any, import_schema_path: str, @@ -342,6 +343,9 @@ def load( # set destination and default dataset if provided self._set_destinations(destination, None) self._set_dataset_name(dataset_name) + + credentials_argument_deprecated("pipeline.load", credentials, destination) + self.credentials = credentials or self.credentials # check if any schema is present, if not then no data was extracted @@ -449,6 +453,8 @@ def run( self._set_destinations(destination, staging) self._set_dataset_name(dataset_name) + credentials_argument_deprecated("pipeline.run", credentials, self.destination) + # sync state with destination if self.config.restore_from_destination and not self.full_refresh and not self._state_restored and (self.destination or destination): self.sync_destination(destination, staging, dataset_name) @@ -732,7 +738,7 @@ def _sql_job_client(self, schema: Schema, credentials: Any = None) -> SqlJobClie if isinstance(client, SqlJobClientBase): return client else: - raise SqlClientNotAvailable(self.pipeline_name, self.destination.__name__) + raise SqlClientNotAvailable(self.pipeline_name, self.destination.name) def _get_normalize_storage(self) -> NormalizeStorage: return NormalizeStorage(True, self._normalize_storage_config) @@ -893,7 +899,7 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para return extract_id - def _get_destination_client_initial_config(self, destination: DestinationReference = None, credentials: Any = None, as_staging: bool = False) -> DestinationClientConfiguration: + def _get_destination_client_initial_config(self, destination: TDestination = None, credentials: Any = None, as_staging: bool = False) -> DestinationClientConfiguration: destination = destination or self.destination if not destination: raise PipelineConfigMissing( @@ -903,7 +909,7 @@ def _get_destination_client_initial_config(self, destination: DestinationReferen "Please provide `destination` argument to `pipeline`, `run` or `load` method directly or via .dlt config.toml file or environment variable." ) # create initial destination client config - client_spec = destination.spec() + client_spec = destination.spec # initialize explicit credentials if not as_staging: # explicit credentials passed to dlt.pipeline should not be applied to staging @@ -999,17 +1005,19 @@ def _set_context(self, is_active: bool) -> None: del self._container[DestinationCapabilitiesContext] def _set_destinations(self, destination: TDestinationReferenceArg, staging: TDestinationReferenceArg) -> None: - destination_mod = DestinationReference.from_name(destination) - self.destination = destination_mod or self.destination + # destination_mod = DestinationReference.from_name(destination) + if destination: + self.destination = Destination.from_reference(destination) if destination and not self.destination.capabilities().supported_loader_file_formats and not staging: - logger.warning(f"The destination {destination_mod.__name__} requires the filesystem staging destination to be set, but it was not provided. 
Setting it to 'filesystem'.") + logger.warning(f"The destination {self.destination.name} requires the filesystem staging destination to be set, but it was not provided. Setting it to 'filesystem'.") staging = "filesystem" if staging: - staging_module = DestinationReference.from_name(staging) - if staging_module and not issubclass(staging_module.spec(), DestinationClientStagingConfiguration): - raise DestinationNoStagingMode(staging_module.__name__) + # staging_module = DestinationReference.from_name(staging) + staging_module = Destination.from_reference(staging) + if staging_module and not issubclass(staging_module.spec, DestinationClientStagingConfiguration): + raise DestinationNoStagingMode(staging_module.name) self.staging = staging_module or self.staging with self._maybe_destination_capabilities(): @@ -1028,8 +1036,10 @@ def _maybe_destination_capabilities(self, loader_file_format: TLoaderFileFormat caps = injected_caps.__enter__() caps.preferred_loader_file_format = self._resolve_loader_file_format( - DestinationReference.to_name(self.destination), - DestinationReference.to_name(self.staging) if self.staging else None, + self.destination.name, + # DestinationReference.to_name(self.destination), + self.staging.name if self.staging else None, + # DestinationReference.to_name(self.staging) if self.staging else None, destination_caps, stage_caps, loader_file_format) caps.supported_loader_file_formats = ( destination_caps.supported_staging_file_formats if stage_caps else None @@ -1157,12 +1167,12 @@ def _restore_state_from_destination(self) -> Optional[TPipelineState]: if isinstance(job_client, WithStateSync): state = load_state_from_destination(self.pipeline_name, job_client) if state is None: - logger.info(f"The state was not found in the destination {self.destination.__name__}:{dataset_name}") + logger.info(f"The state was not found in the destination {self.destination.name}:{dataset_name}") else: - logger.info(f"The state was restored from the destination {self.destination.__name__}:{dataset_name}") + logger.info(f"The state was restored from the destination {self.destination.name}:{dataset_name}") else: state = None - logger.info(f"Destination does not support metadata storage {self.destination.__name__}:{dataset_name}") + logger.info(f"Destination does not support metadata storage {self.destination.name}:{dataset_name}") return state finally: # restore the use_single_dataset option @@ -1177,17 +1187,17 @@ def _get_schemas_from_destination(self, schema_names: Sequence[str], always_down if not self._schema_storage.has_schema(schema.name) or always_download: with self._get_destination_clients(schema)[0] as job_client: if not isinstance(job_client, WithStateSync): - logger.info(f"Destination does not support metadata storage {self.destination.__name__}") + logger.info(f"Destination does not support metadata storage {self.destination.name}") return restored_schemas schema_info = job_client.get_stored_schema() if schema_info is None: - logger.info(f"The schema {schema.name} was not found in the destination {self.destination.__name__}:{self.dataset_name}") + logger.info(f"The schema {schema.name} was not found in the destination {self.destination.name}:{self.dataset_name}") # try to import schema with contextlib.suppress(FileNotFoundError): self._schema_storage.load_schema(schema.name) else: schema = Schema.from_dict(json.loads(schema_info.schema)) - logger.info(f"The schema {schema.name} version {schema.version} hash {schema.stored_version_hash} was restored from the destination 
{self.destination.__name__}:{self.dataset_name}") + logger.info(f"The schema {schema.name} version {schema.version} hash {schema.stored_version_hash} was restored from the destination {self.destination.name}:{self.dataset_name}") restored_schemas.append(schema) return restored_schemas @@ -1244,7 +1254,7 @@ def _state_to_props(self, state: TPipelineState) -> None: if prop in state["_local"] and not prop.startswith("_"): setattr(self, prop, state["_local"][prop]) # type: ignore if "destination" in state: - self._set_destinations(DestinationReference.from_name(self.destination), DestinationReference.from_name(self.staging) if "staging" in state else None ) + self._set_destinations(self.destination, self.staging if "staging" in state else None ) def _props_to_state(self, state: TPipelineState) -> None: """Write pipeline props to `state`""" @@ -1255,9 +1265,9 @@ def _props_to_state(self, state: TPipelineState) -> None: if not prop.startswith("_"): state["_local"][prop] = getattr(self, prop) # type: ignore if self.destination: - state["destination"] = self.destination.__name__ + state["destination"] = self.destination.name if self.staging: - state["staging"] = self.staging.__name__ + state["staging"] = self.staging.name state["schema_names"] = self._schema_storage.list_schemas() def _save_state(self, state: TPipelineState) -> None: diff --git a/dlt/pipeline/track.py b/dlt/pipeline/track.py index ec42bc788f..07e9a2d137 100644 --- a/dlt/pipeline/track.py +++ b/dlt/pipeline/track.py @@ -9,7 +9,7 @@ from dlt.common.runtime.segment import track as dlthub_telemetry_track from dlt.common.runtime.slack import send_slack_message from dlt.common.pipeline import LoadInfo, ExtractInfo, SupportsPipeline -from dlt.common.destination import DestinationReference +from dlt.common.destination import Destination from dlt.pipeline.typing import TPipelineStep from dlt.pipeline.trace import PipelineTrace, PipelineStepTrace @@ -21,7 +21,7 @@ def _add_sentry_tags(span: Span, pipeline: SupportsPipeline) -> None: span.set_tag("pipeline_name", pipeline.pipeline_name) if pipeline.destination: - span.set_tag("destination", pipeline.destination.__name__) + span.set_tag("destination", pipeline.destination.name) if pipeline.dataset_name: span.set_tag("dataset_name", pipeline.dataset_name) except ImportError: @@ -87,7 +87,7 @@ def on_end_trace_step(trace: PipelineTrace, step: PipelineStepTrace, pipeline: S props = { "elapsed": (step.finished_at - trace.started_at).total_seconds(), "success": step.step_exception is None, - "destination_name": DestinationReference.to_name(pipeline.destination) if pipeline.destination else None, + "destination_name": pipeline.destination.name if pipeline.destination else None, "pipeline_name_hash": digest128(pipeline.pipeline_name), "dataset_name_hash": digest128(pipeline.dataset_name) if pipeline.dataset_name else None, "default_schema_name_hash": digest128(pipeline.default_schema_name) if pipeline.default_schema_name else None, @@ -107,4 +107,4 @@ def on_end_trace(trace: PipelineTrace, pipeline: SupportsPipeline) -> None: if pipeline.runtime_config.sentry_dsn: # print(f"---END SENTRY TX: {trace.transaction_id} SCOPE: {Hub.current.scope}") with contextlib.suppress(Exception): - Hub.current.scope.span.__exit__(None, None, None) \ No newline at end of file + Hub.current.scope.span.__exit__(None, None, None) diff --git a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md index 8c626266a4..fe7dafc243 100644 --- 
a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md +++ b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md @@ -252,7 +252,7 @@ it will be normalized to: so your best course of action is to clean up the data yourself before loading and use default naming convention. Nevertheless you can configure the alternative in `config.toml`: ```toml [schema] -naming="dlt.destinations.weaviate.ci_naming" +naming="dlt.destinations.impl.weaviate.ci_naming" ``` ## Additional destination options diff --git a/docs/website/docs/getting-started-snippets.py b/docs/website/docs/getting-started-snippets.py index c4bd789834..be21a7f757 100644 --- a/docs/website/docs/getting-started-snippets.py +++ b/docs/website/docs/getting-started-snippets.py @@ -290,7 +290,7 @@ def pdf_to_weaviate_snippet() -> None: import os import dlt - from dlt.destinations.weaviate import weaviate_adapter + from dlt.destinations.impl.weaviate import weaviate_adapter from PyPDF2 import PdfReader diff --git a/tests/cli/test_pipeline_command.py b/tests/cli/test_pipeline_command.py index 401517f3c5..19bb5fa277 100644 --- a/tests/cli/test_pipeline_command.py +++ b/tests/cli/test_pipeline_command.py @@ -45,7 +45,7 @@ def test_pipeline_command_operations(repo_dir: str, project_files: FileStorage) pipeline_command.pipeline_command("info", "chess_pipeline", None, 0) _out = buf.getvalue() # do we have duckdb destination - assert "dlt.destinations.duckdb" in _out + assert "destination: duckdb" in _out print(_out) with io.StringIO() as buf, contextlib.redirect_stdout(buf): diff --git a/tests/common/data_writers/test_data_writers.py b/tests/common/data_writers/test_data_writers.py index 66b8f765c7..9d655bc4db 100644 --- a/tests/common/data_writers/test_data_writers.py +++ b/tests/common/data_writers/test_data_writers.py @@ -5,7 +5,7 @@ from dlt.common import pendulum, json from dlt.common.typing import AnyFun # from dlt.destinations.postgres import capabilities -from dlt.destinations.redshift import capabilities as redshift_caps +from dlt.destinations.impl.redshift import capabilities as redshift_caps from dlt.common.data_writers.escape import escape_redshift_identifier, escape_bigquery_identifier, escape_redshift_literal, escape_postgres_literal, escape_duckdb_literal from dlt.common.data_writers.writers import DataWriter, InsertValuesWriter, JsonlWriter, ParquetDataWriter diff --git a/tests/common/test_destination.py b/tests/common/test_destination.py index 7afa10ed68..5483a95f45 100644 --- a/tests/common/test_destination.py +++ b/tests/common/test_destination.py @@ -1,6 +1,7 @@ import pytest -from dlt.common.destination.reference import DestinationClientDwhConfiguration, DestinationReference +from dlt.common.destination.reference import DestinationClientDwhConfiguration, Destination +from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.exceptions import InvalidDestinationReference, UnknownDestinationModule from dlt.common.schema import Schema from dlt.common.schema.exceptions import InvalidDatasetName @@ -11,24 +12,24 @@ def test_import_unknown_destination() -> None: # standard destination with pytest.raises(UnknownDestinationModule): - DestinationReference.from_name("meltdb") + Destination.from_reference("meltdb") # custom module with pytest.raises(UnknownDestinationModule): - DestinationReference.from_name("melt.db") + Destination.from_reference("melt.db") def test_invalid_destination_reference() -> None: with pytest.raises(InvalidDestinationReference): -
DestinationReference.from_name("tests.load.cases.fake_destination") + Destination.from_reference("tests.load.cases.fake_destination.not_a_destination") def test_import_all_destinations() -> None: # this must pass without the client dependencies being imported - for module in ACTIVE_DESTINATIONS: - dest = DestinationReference.from_name(module) - assert dest.__name__ == "dlt.destinations." + module + for dest_name in ACTIVE_DESTINATIONS: + dest = Destination.from_reference(dest_name) + assert dest.name == dest_name dest.spec() - dest.capabilities() + assert isinstance(dest.capabilities(), DestinationCapabilitiesContext) def test_normalize_dataset_name() -> None: diff --git a/tests/helpers/dbt_tests/local/test_dbt_utils.py b/tests/helpers/dbt_tests/local/test_dbt_utils.py index 71e570bd69..133ecf1617 100644 --- a/tests/helpers/dbt_tests/local/test_dbt_utils.py +++ b/tests/helpers/dbt_tests/local/test_dbt_utils.py @@ -7,7 +7,7 @@ from dlt.common.storages import FileStorage from dlt.common.utils import uniq_id -from dlt.destinations.postgres.configuration import PostgresCredentials +from dlt.destinations.impl.postgres.configuration import PostgresCredentials from dlt.helpers.dbt.dbt_utils import DBTProcessingError, initialize_dbt_logging, run_dbt_command, is_incremental_schema_out_of_sync_error from tests.utils import test_storage, preserve_environ diff --git a/tests/helpers/dbt_tests/test_runner_dbt_versions.py b/tests/helpers/dbt_tests/test_runner_dbt_versions.py index b418bf15b6..1037908e59 100644 --- a/tests/helpers/dbt_tests/test_runner_dbt_versions.py +++ b/tests/helpers/dbt_tests/test_runner_dbt_versions.py @@ -14,8 +14,8 @@ from dlt.common.runners.synth_pickle import decode_obj, encode_obj from dlt.common.typing import AnyFun -from dlt.destinations.postgres.postgres import PostgresClient -from dlt.destinations.bigquery import BigQueryClientConfiguration +from dlt.destinations.impl.postgres.postgres import PostgresClient +from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration from dlt.helpers.dbt.configuration import DBTRunnerConfiguration from dlt.helpers.dbt.exceptions import PrerequisitesException, DBTProcessingError from dlt.helpers.dbt import package_runner, create_venv, _create_dbt_deps, _default_profile_name, DEFAULT_DBT_VERSION diff --git a/tests/load/bigquery/test_bigquery_client.py b/tests/load/bigquery/test_bigquery_client.py index 145898cde3..abbaf8d414 100644 --- a/tests/load/bigquery/test_bigquery_client.py +++ b/tests/load/bigquery/test_bigquery_client.py @@ -14,7 +14,7 @@ from dlt.common.storages import FileStorage from dlt.common.utils import digest128, uniq_id, custom_environ -from dlt.destinations.bigquery.bigquery import BigQueryClient, BigQueryClientConfiguration +from dlt.destinations.impl.bigquery.bigquery import BigQueryClient, BigQueryClientConfiguration from dlt.destinations.exceptions import LoadJobNotExistsException, LoadJobTerminalException from tests.utils import TEST_STORAGE_ROOT, delete_test_storage, preserve_environ @@ -242,7 +242,7 @@ def test_bigquery_job_errors(client: BigQueryClient, file_storage: FileStorage) @pytest.mark.parametrize('location', ["US", "EU"]) def test_bigquery_location(location: str, file_storage: FileStorage) -> None: - with cm_yield_client_with_storage("bigquery", default_config_values={"location": location}) as client: + with cm_yield_client_with_storage("bigquery", default_config_values={"credentials": {"location": location}}) as client: user_table_name = prepare_table(client) load_json = { 
"_dlt_id": uniq_id(), diff --git a/tests/load/bigquery/test_bigquery_table_builder.py b/tests/load/bigquery/test_bigquery_table_builder.py index a3222ba020..0d8ab1c8c2 100644 --- a/tests/load/bigquery/test_bigquery_table_builder.py +++ b/tests/load/bigquery/test_bigquery_table_builder.py @@ -8,8 +8,8 @@ from dlt.common.configuration import resolve_configuration from dlt.common.configuration.specs import GcpServiceAccountCredentialsWithoutDefaults -from dlt.destinations.bigquery.bigquery import BigQueryClient -from dlt.destinations.bigquery.configuration import BigQueryClientConfiguration +from dlt.destinations.impl.bigquery.bigquery import BigQueryClient +from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration from dlt.destinations.exceptions import DestinationSchemaWillNotUpdate from tests.load.utils import TABLE_UPDATE diff --git a/tests/load/cases/fake_destination.py b/tests/load/cases/fake_destination.py index 152b2db918..016cc19020 100644 --- a/tests/load/cases/fake_destination.py +++ b/tests/load/cases/fake_destination.py @@ -1 +1,6 @@ -# module that is used to test wrong destination references \ No newline at end of file +# module that is used to test wrong destination references + + +class not_a_destination: + def __init__(self, **kwargs) -> None: + pass diff --git a/tests/load/duckdb/test_duckdb_client.py b/tests/load/duckdb/test_duckdb_client.py index 6c362a6b76..ddfc681a84 100644 --- a/tests/load/duckdb/test_duckdb_client.py +++ b/tests/load/duckdb/test_duckdb_client.py @@ -6,7 +6,8 @@ from dlt.common.configuration.resolve import resolve_configuration from dlt.common.configuration.utils import get_resolved_traces -from dlt.destinations.duckdb.configuration import DUCK_DB_NAME, DuckDbClientConfiguration, DuckDbCredentials, DEFAULT_DUCK_DB_NAME +from dlt.destinations.impl.duckdb.configuration import DUCK_DB_NAME, DuckDbClientConfiguration, DuckDbCredentials, DEFAULT_DUCK_DB_NAME +from dlt.destinations import duckdb from tests.load.pipeline.utils import drop_pipeline, assert_table from tests.utils import patch_home_dir, autouse_test_storage, preserve_environ, TEST_STORAGE_ROOT @@ -46,13 +47,13 @@ def test_duckdb_open_conn_default() -> None: def test_duckdb_database_path() -> None: # resolve without any path provided c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset")) - assert c.credentials.database.lower() == os.path.abspath("quack.duckdb").lower() + assert c.credentials._conn_str().lower() == os.path.abspath("quack.duckdb").lower() # resolve without any path but with pipeline context p = dlt.pipeline(pipeline_name="quack_pipeline") c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset")) # still cwd db_path = os.path.abspath(os.path.join(".", "quack_pipeline.duckdb")) - assert c.credentials.database.lower() == db_path.lower() + assert c.credentials._conn_str().lower() == db_path.lower() # we do not keep default duckdb path in the local state with pytest.raises(KeyError): p.get_local_state_val("duckdb_database") @@ -69,7 +70,7 @@ def test_duckdb_database_path() -> None: # test special :pipeline: path to create in pipeline folder c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials=":pipeline:")) db_path = os.path.abspath(os.path.join(p.working_dir, DEFAULT_DUCK_DB_NAME)) - assert c.credentials.database.lower() == db_path.lower() + assert c.credentials._conn_str().lower() == db_path.lower() # connect conn = c.credentials.borrow_conn(read_only=False) 
c.credentials.return_conn(conn) @@ -80,7 +81,7 @@ def test_duckdb_database_path() -> None: # provide relative path db_path = "_storage/test_quack.duckdb" c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials="duckdb:///_storage/test_quack.duckdb")) - assert c.credentials.database.lower() == os.path.abspath(db_path).lower() + assert c.credentials._conn_str().lower() == os.path.abspath(db_path).lower() conn = c.credentials.borrow_conn(read_only=False) c.credentials.return_conn(conn) assert os.path.isfile(db_path) @@ -90,7 +91,7 @@ def test_duckdb_database_path() -> None: db_path = os.path.abspath("_storage/abs_test_quack.duckdb") c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials=f"duckdb:///{db_path}")) assert os.path.isabs(c.credentials.database) - assert c.credentials.database.lower() == db_path.lower() + assert c.credentials._conn_str().lower() == db_path.lower() conn = c.credentials.borrow_conn(read_only=False) c.credentials.return_conn(conn) assert os.path.isfile(db_path) @@ -99,7 +100,7 @@ def test_duckdb_database_path() -> None: # set just path as credentials db_path = "_storage/path_test_quack.duckdb" c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials=db_path)) - assert c.credentials.database.lower() == os.path.abspath(db_path).lower() + assert c.credentials._conn_str().lower() == os.path.abspath(db_path).lower() conn = c.credentials.borrow_conn(read_only=False) c.credentials.return_conn(conn) assert os.path.isfile(db_path) @@ -108,7 +109,7 @@ def test_duckdb_database_path() -> None: db_path = os.path.abspath("_storage/abs_path_test_quack.duckdb") c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials=db_path)) assert os.path.isabs(c.credentials.database) - assert c.credentials.database.lower() == db_path.lower() + assert c.credentials._conn_str().lower() == db_path.lower() conn = c.credentials.borrow_conn(read_only=False) c.credentials.return_conn(conn) assert os.path.isfile(db_path) @@ -128,7 +129,7 @@ def test_keeps_initial_db_path() -> None: print(p.pipelines_dir) with p.sql_client() as conn: # still cwd - assert conn.credentials.database.lower() == os.path.abspath(db_path).lower() + assert conn.credentials._conn_str().lower() == os.path.abspath(db_path).lower() # but it is kept in the local state assert p.get_local_state_val("duckdb_database").lower() == os.path.abspath(db_path).lower() @@ -138,7 +139,7 @@ def test_keeps_initial_db_path() -> None: with p.sql_client() as conn: # still cwd assert p.get_local_state_val("duckdb_database").lower() == os.path.abspath(db_path).lower() - assert conn.credentials.database.lower() == os.path.abspath(db_path).lower() + assert conn.credentials._conn_str().lower() == os.path.abspath(db_path).lower() # now create a new pipeline dlt.pipeline(pipeline_name="not_quack", destination="dummy") @@ -147,12 +148,12 @@ def test_keeps_initial_db_path() -> None: assert p.get_local_state_val("duckdb_database").lower() == os.path.abspath(db_path).lower() # new pipeline context took over # TODO: restore pipeline context on each call - assert conn.credentials.database.lower() != os.path.abspath(db_path).lower() + assert conn.credentials._conn_str().lower() != os.path.abspath(db_path).lower() def test_duckdb_database_delete() -> None: db_path = "_storage/path_test_quack.duckdb" - p = dlt.pipeline(pipeline_name="quack_pipeline", credentials=db_path, destination="duckdb") + p = 
dlt.pipeline(pipeline_name="quack_pipeline", destination=duckdb(credentials=db_path)) p.run([1, 2, 3], table_name="table", dataset_name="dataset") # attach the pipeline p = dlt.attach(pipeline_name="quack_pipeline") diff --git a/tests/load/duckdb/test_duckdb_table_builder.py b/tests/load/duckdb/test_duckdb_table_builder.py index 247d134b06..a5870763fc 100644 --- a/tests/load/duckdb/test_duckdb_table_builder.py +++ b/tests/load/duckdb/test_duckdb_table_builder.py @@ -5,8 +5,8 @@ from dlt.common.utils import uniq_id from dlt.common.schema import Schema -from dlt.destinations.duckdb.duck import DuckDbClient -from dlt.destinations.duckdb.configuration import DuckDbClientConfiguration +from dlt.destinations.impl.duckdb.duck import DuckDbClient +from dlt.destinations.impl.duckdb.configuration import DuckDbClientConfiguration from tests.load.utils import TABLE_UPDATE diff --git a/tests/load/duckdb/test_motherduck_client.py b/tests/load/duckdb/test_motherduck_client.py index 4a167fa016..582847bfa2 100644 --- a/tests/load/duckdb/test_motherduck_client.py +++ b/tests/load/duckdb/test_motherduck_client.py @@ -3,7 +3,7 @@ from dlt.common.configuration.resolve import resolve_configuration -from dlt.destinations.motherduck.configuration import MotherDuckCredentials, MotherDuckClientConfiguration +from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials, MotherDuckClientConfiguration from tests.utils import patch_home_dir, preserve_environ, skip_if_not_active diff --git a/tests/load/filesystem/test_filesystem_client.py b/tests/load/filesystem/test_filesystem_client.py index f290892e18..0055f37716 100644 --- a/tests/load/filesystem/test_filesystem_client.py +++ b/tests/load/filesystem/test_filesystem_client.py @@ -6,7 +6,7 @@ from dlt.common.utils import digest128, uniq_id from dlt.common.storages import LoadStorage, FileStorage -from dlt.destinations.filesystem.filesystem import LoadFilesystemJob, FilesystemDestinationClientConfiguration +from dlt.destinations.impl.filesystem.filesystem import LoadFilesystemJob, FilesystemDestinationClientConfiguration from tests.load.filesystem.utils import perform_load from tests.utils import clean_test_storage, init_test_logging diff --git a/tests/load/filesystem/utils.py b/tests/load/filesystem/utils.py index eebfa6e87c..8186e82c3b 100644 --- a/tests/load/filesystem/utils.py +++ b/tests/load/filesystem/utils.py @@ -5,16 +5,16 @@ from dlt.load import Load from dlt.common.configuration.container import Container from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.destination.reference import DestinationReference, LoadJob +from dlt.common.destination.reference import Destination, LoadJob, TDestination from dlt.destinations import filesystem -from dlt.destinations.filesystem.filesystem import FilesystemClient +from dlt.destinations.impl.filesystem.filesystem import FilesystemClient from dlt.destinations.job_impl import EmptyLoadJob from tests.load.utils import prepare_load_package def setup_loader(dataset_name: str) -> Load: - destination: DestinationReference = filesystem # type: ignore[assignment] - config = filesystem.spec()(dataset_name=dataset_name) + destination: TDestination = filesystem() # type: ignore[assignment] + config = filesystem.spec(dataset_name=dataset_name) # setup loader with Container().injectable_context(ConfigSectionContext(sections=('filesystem',))): return Load( diff --git a/tests/load/mssql/test_mssql_credentials.py b/tests/load/mssql/test_mssql_credentials.py index 
9b57692bb2..5428246247 100644 --- a/tests/load/mssql/test_mssql_credentials.py +++ b/tests/load/mssql/test_mssql_credentials.py @@ -1,6 +1,6 @@ from dlt.common.configuration import resolve_configuration -from dlt.destinations.mssql.configuration import MsSqlCredentials +from dlt.destinations.impl.mssql.configuration import MsSqlCredentials diff --git a/tests/load/mssql/test_mssql_table_builder.py b/tests/load/mssql/test_mssql_table_builder.py index 4f5a6637d6..114d94a20f 100644 --- a/tests/load/mssql/test_mssql_table_builder.py +++ b/tests/load/mssql/test_mssql_table_builder.py @@ -7,8 +7,8 @@ pytest.importorskip("dlt.destinations.mssql.mssql", reason="MSSQL ODBC driver not installed") -from dlt.destinations.mssql.mssql import MsSqlClient -from dlt.destinations.mssql.configuration import MsSqlClientConfiguration, MsSqlCredentials +from dlt.destinations.impl.mssql.mssql import MsSqlClient +from dlt.destinations.impl.mssql.configuration import MsSqlClientConfiguration, MsSqlCredentials from tests.load.utils import TABLE_UPDATE diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 8e810015f2..dce65bc8d7 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -4,7 +4,7 @@ import dlt, os from dlt.common.utils import uniq_id from dlt.common.storages.load_storage import LoadJobInfo -from dlt.destinations.filesystem.filesystem import FilesystemClient, LoadFilesystemJob +from dlt.destinations.impl.filesystem.filesystem import FilesystemClient, LoadFilesystemJob from dlt.common.schema.typing import LOADS_TABLE_NAME from tests.utils import skip_if_not_active diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index 99071a7ac6..2fc4aad1a8 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -8,7 +8,7 @@ from dlt.common.pipeline import SupportsPipeline from dlt.common import json, sleep -from dlt.common.destination.reference import DestinationReference +from dlt.common.destination import Destination from dlt.common.schema.schema import Schema from dlt.common.schema.typing import VERSION_TABLE_NAME from dlt.common.typing import TDataItem @@ -66,8 +66,8 @@ def data_fun() -> Iterator[Any]: # mock the correct destinations (never do that in normal code) with p.managed_state(): p._set_destinations( - DestinationReference.from_name(destination_config.destination), - DestinationReference.from_name(destination_config.staging) if destination_config.staging else None + Destination.from_reference(destination_config.destination), + Destination.from_reference(destination_config.staging) if destination_config.staging else None ) # does not reset the dataset name assert p.dataset_name in possible_dataset_names diff --git a/tests/load/pipeline/utils.py b/tests/load/pipeline/utils.py index 752571591c..113585f669 100644 --- a/tests/load/pipeline/utils.py +++ b/tests/load/pipeline/utils.py @@ -16,7 +16,7 @@ from tests.load.utils import DestinationTestConfiguration, destinations_configs if TYPE_CHECKING: - from dlt.destinations.filesystem.filesystem import FilesystemClient + from dlt.destinations.impl.filesystem.filesystem import FilesystemClient @pytest.fixture(autouse=True) def drop_pipeline(request) -> Iterator[None]: @@ -67,7 +67,7 @@ def _drop_dataset(schema_name: str) -> None: def _is_filesystem(p: dlt.Pipeline) -> bool: if not p.destination: return False - return p.destination.__name__.rsplit('.', 1)[-1] == 
'filesystem' + return p.destination.name == 'filesystem' def assert_table(p: dlt.Pipeline, table_name: str, table_data: List[Any], schema_name: str = None, info: LoadInfo = None) -> None: diff --git a/tests/load/postgres/test_postgres_client.py b/tests/load/postgres/test_postgres_client.py index dcc242cf50..65ac61cfd4 100644 --- a/tests/load/postgres/test_postgres_client.py +++ b/tests/load/postgres/test_postgres_client.py @@ -7,9 +7,9 @@ from dlt.common.storages import FileStorage from dlt.common.utils import uniq_id -from dlt.destinations.postgres.configuration import PostgresCredentials -from dlt.destinations.postgres.postgres import PostgresClient -from dlt.destinations.postgres.sql_client import psycopg2 +from dlt.destinations.impl.postgres.configuration import PostgresCredentials +from dlt.destinations.impl.postgres.postgres import PostgresClient +from dlt.destinations.impl.postgres.sql_client import psycopg2 from tests.utils import TEST_STORAGE_ROOT, delete_test_storage, skipifpypy, preserve_environ from tests.load.utils import expect_load_file, prepare_table, yield_client_with_storage diff --git a/tests/load/postgres/test_postgres_table_builder.py b/tests/load/postgres/test_postgres_table_builder.py index 165c62a468..1d6965c0c0 100644 --- a/tests/load/postgres/test_postgres_table_builder.py +++ b/tests/load/postgres/test_postgres_table_builder.py @@ -5,8 +5,8 @@ from dlt.common.utils import uniq_id from dlt.common.schema import Schema -from dlt.destinations.postgres.postgres import PostgresClient -from dlt.destinations.postgres.configuration import PostgresClientConfiguration, PostgresCredentials +from dlt.destinations.impl.postgres.postgres import PostgresClient +from dlt.destinations.impl.postgres.configuration import PostgresClientConfiguration, PostgresCredentials from tests.load.utils import TABLE_UPDATE diff --git a/tests/load/qdrant/test_pipeline.py b/tests/load/qdrant/test_pipeline.py index 303a5de69f..760eec4631 100644 --- a/tests/load/qdrant/test_pipeline.py +++ b/tests/load/qdrant/test_pipeline.py @@ -5,8 +5,8 @@ from dlt.common import json from dlt.common.utils import uniq_id -from dlt.destinations.qdrant.qdrant_adapter import qdrant_adapter, VECTORIZE_HINT -from dlt.destinations.qdrant.qdrant_client import QdrantClient +from dlt.destinations.impl.qdrant.qdrant_adapter import qdrant_adapter, VECTORIZE_HINT +from dlt.destinations.impl.qdrant.qdrant_client import QdrantClient from tests.pipeline.utils import assert_load_info from tests.load.qdrant.utils import drop_active_pipeline_data, assert_collection diff --git a/tests/load/qdrant/utils.py b/tests/load/qdrant/utils.py index 96b582a28e..1dfacbee7f 100644 --- a/tests/load/qdrant/utils.py +++ b/tests/load/qdrant/utils.py @@ -5,7 +5,7 @@ from dlt.common.pipeline import PipelineContext from dlt.common.configuration.container import Container -from dlt.destinations.qdrant.qdrant_client import QdrantClient +from dlt.destinations.impl.qdrant.qdrant_client import QdrantClient def assert_unordered_list_equal(list1: List[Any], list2: List[Any]) -> None: diff --git a/tests/load/redshift/test_redshift_client.py b/tests/load/redshift/test_redshift_client.py index 9839965b70..7f617024df 100644 --- a/tests/load/redshift/test_redshift_client.py +++ b/tests/load/redshift/test_redshift_client.py @@ -12,8 +12,8 @@ from dlt.common.utils import uniq_id from dlt.destinations.exceptions import DatabaseTerminalException -from dlt.destinations.redshift.configuration import RedshiftCredentials -from dlt.destinations.redshift.redshift import 
RedshiftClient, psycopg2 +from dlt.destinations.impl.redshift.configuration import RedshiftCredentials +from dlt.destinations.impl.redshift.redshift import RedshiftClient, psycopg2 from tests.common.utils import COMMON_TEST_CASES_PATH from tests.utils import TEST_STORAGE_ROOT, autouse_test_storage, skipifpypy diff --git a/tests/load/redshift/test_redshift_table_builder.py b/tests/load/redshift/test_redshift_table_builder.py index 8c61ccc1f2..2e0feb44e7 100644 --- a/tests/load/redshift/test_redshift_table_builder.py +++ b/tests/load/redshift/test_redshift_table_builder.py @@ -6,8 +6,8 @@ from dlt.common.schema import Schema from dlt.common.configuration import resolve_configuration -from dlt.destinations.redshift.redshift import RedshiftClient -from dlt.destinations.redshift.configuration import RedshiftClientConfiguration, RedshiftCredentials +from dlt.destinations.impl.redshift.redshift import RedshiftClient +from dlt.destinations.impl.redshift.configuration import RedshiftClientConfiguration, RedshiftCredentials from tests.load.utils import TABLE_UPDATE diff --git a/tests/load/snowflake/test_snowflake_configuration.py b/tests/load/snowflake/test_snowflake_configuration.py index 7108ad06e5..abf80a1241 100644 --- a/tests/load/snowflake/test_snowflake_configuration.py +++ b/tests/load/snowflake/test_snowflake_configuration.py @@ -9,7 +9,7 @@ from dlt.common.configuration.exceptions import ConfigurationValueError from dlt.common.utils import digest128 -from dlt.destinations.snowflake.configuration import SnowflakeClientConfiguration, SnowflakeCredentials +from dlt.destinations.impl.snowflake.configuration import SnowflakeClientConfiguration, SnowflakeCredentials from tests.common.configuration.utils import environment diff --git a/tests/load/snowflake/test_snowflake_table_builder.py b/tests/load/snowflake/test_snowflake_table_builder.py index 81164625f9..9ede1c8d13 100644 --- a/tests/load/snowflake/test_snowflake_table_builder.py +++ b/tests/load/snowflake/test_snowflake_table_builder.py @@ -5,8 +5,8 @@ from dlt.common.utils import uniq_id from dlt.common.schema import Schema -from dlt.destinations.snowflake.snowflake import SnowflakeClient -from dlt.destinations.snowflake.configuration import SnowflakeClientConfiguration, SnowflakeCredentials +from dlt.destinations.impl.snowflake.snowflake import SnowflakeClient +from dlt.destinations.impl.snowflake.configuration import SnowflakeClientConfiguration, SnowflakeCredentials from dlt.destinations.exceptions import DestinationSchemaWillNotUpdate from tests.load.utils import TABLE_UPDATE diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index 1216906967..9edc49a607 100644 --- a/tests/load/test_dummy_client.py +++ b/tests/load/test_dummy_client.py @@ -11,14 +11,14 @@ from dlt.common.storages import FileStorage, LoadStorage from dlt.common.storages.load_storage import JobWithUnsupportedWriterException from dlt.common.utils import uniq_id -from dlt.common.destination.reference import DestinationReference, LoadJob +from dlt.common.destination.reference import Destination, LoadJob, TDestination from dlt.load import Load from dlt.destinations.job_impl import EmptyLoadJob from dlt.destinations import dummy -from dlt.destinations.dummy import dummy as dummy_impl -from dlt.destinations.dummy.configuration import DummyClientConfiguration +from dlt.destinations.impl.dummy import dummy as dummy_impl +from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration from dlt.load.exceptions import LoadClientJobFailed, 
LoadClientJobRetry from dlt.common.schema.utils import get_top_level_table @@ -445,7 +445,7 @@ def run_all(load: Load) -> None: def setup_loader(delete_completed_jobs: bool = False, client_config: DummyClientConfiguration = None) -> Load: # reset jobs for a test dummy_impl.JOBS = {} - destination: DestinationReference = dummy # type: ignore[assignment] + destination: TDestination = dummy() # type: ignore[assignment] client_config = client_config or DummyClientConfiguration(loader_file_format="jsonl") # patch destination to provide client_config # destination.client = lambda schema: dummy_impl.DummyClient(schema, client_config) diff --git a/tests/load/test_insert_job_client.py b/tests/load/test_insert_job_client.py index 95e63a79f2..86049b035a 100644 --- a/tests/load/test_insert_job_client.py +++ b/tests/load/test_insert_job_client.py @@ -52,7 +52,7 @@ def test_simple_load(client: InsertValuesJobClient, file_storage: FileStorage) - def test_loading_errors(client: InsertValuesJobClient, file_storage: FileStorage) -> None: # test expected dbiapi exceptions for supported destinations import duckdb - from dlt.destinations.postgres.sql_client import psycopg2 + from dlt.destinations.impl.postgres.sql_client import psycopg2 TNotNullViolation = psycopg2.errors.NotNullViolation TNumericValueOutOfRange = psycopg2.errors.NumericValueOutOfRange diff --git a/tests/load/utils.py b/tests/load/utils.py index be2097c879..f591f51585 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -12,8 +12,8 @@ from dlt.common.configuration import resolve_configuration from dlt.common.configuration.container import Container from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.destination.reference import DestinationClientDwhConfiguration, DestinationReference, JobClientBase, LoadJob, DestinationClientStagingConfiguration, WithStagingDataset, TDestinationReferenceArg -from dlt.common.destination import TLoaderFileFormat +from dlt.common.destination.reference import DestinationClientDwhConfiguration, JobClientBase, LoadJob, DestinationClientStagingConfiguration, WithStagingDataset, TDestinationReferenceArg +from dlt.common.destination import TLoaderFileFormat, Destination from dlt.common.data_writers import DataWriter from dlt.common.schema import TColumnSchema, TTableSchemaColumns, Schema from dlt.common.storages import SchemaStorage, FileStorage, SchemaStorageConfiguration @@ -229,15 +229,15 @@ def yield_client( ) -> Iterator[SqlJobClientBase]: os.environ.pop("DATASET_NAME", None) # import destination reference by name - destination = import_module(f"dlt.destinations.{destination_name}") + destination = Destination.from_reference(destination_name) # create initial config dest_config: DestinationClientDwhConfiguration = None - dest_config = destination.spec()() + dest_config = destination.spec() # type: ignore[assignment] dest_config.dataset_name = dataset_name # type: ignore[misc] # TODO: Why is dataset_name final? 
if default_config_values is not None: # apply the values to credentials, if dict is provided it will be used as default - dest_config.credentials = default_config_values # type: ignore[assignment] + # dest_config.credentials = default_config_values # type: ignore[assignment] # also apply to config dest_config.update(default_config_values) # get event default schema @@ -261,7 +261,7 @@ def yield_client( # lookup for credentials in the section that is destination name with Container().injectable_context(ConfigSectionContext(sections=("destination", destination_name,))): - with destination.client(schema, dest_config) as client: + with destination.client(schema, dest_config) as client: # type: ignore[assignment] yield client @contextlib.contextmanager diff --git a/tests/load/weaviate/test_naming.py b/tests/load/weaviate/test_naming.py index a965201425..25258a2479 100644 --- a/tests/load/weaviate/test_naming.py +++ b/tests/load/weaviate/test_naming.py @@ -1,7 +1,7 @@ import dlt, pytest -from dlt.destinations.weaviate.naming import NamingConvention -from dlt.destinations.weaviate.ci_naming import NamingConvention as CINamingConvention +from dlt.destinations.impl.weaviate.naming import NamingConvention +from dlt.destinations.impl.weaviate.ci_naming import NamingConvention as CINamingConvention from tests.common.utils import load_yml_case diff --git a/tests/load/weaviate/test_pipeline.py b/tests/load/weaviate/test_pipeline.py index 339c94575e..691281c63e 100644 --- a/tests/load/weaviate/test_pipeline.py +++ b/tests/load/weaviate/test_pipeline.py @@ -6,10 +6,10 @@ from dlt.common.schema import Schema from dlt.common.utils import uniq_id -from dlt.destinations.weaviate import weaviate_adapter -from dlt.destinations.weaviate.exceptions import PropertyNameConflict -from dlt.destinations.weaviate.weaviate_adapter import VECTORIZE_HINT, TOKENIZATION_HINT -from dlt.destinations.weaviate.weaviate_client import WeaviateClient +from dlt.destinations.impl.weaviate import weaviate_adapter +from dlt.destinations.impl.weaviate.exceptions import PropertyNameConflict +from dlt.destinations.impl.weaviate.weaviate_adapter import VECTORIZE_HINT, TOKENIZATION_HINT +from dlt.destinations.impl.weaviate.weaviate_client import WeaviateClient from dlt.pipeline.exceptions import PipelineStepFailed from tests.pipeline.utils import assert_load_info @@ -374,7 +374,7 @@ def test_vectorize_property_without_data() -> None: # set the naming convention to case insensitive # os.environ["SCHEMA__NAMING"] = "direct" - dlt.config["schema.naming"] = "dlt.destinations.weaviate.ci_naming" + dlt.config["schema.naming"] = "dlt.destinations.impl.weaviate.ci_naming" # create new schema with changed naming convention p = p.drop() info = p.run(weaviate_adapter(["there are", "no stop", "words in here"], vectorize="vAlue"), primary_key="vALue", columns={"vAlue": {"data_type": "text"}}) diff --git a/tests/load/weaviate/test_weaviate_client.py b/tests/load/weaviate/test_weaviate_client.py index d102610f68..ca9d853d98 100644 --- a/tests/load/weaviate/test_weaviate_client.py +++ b/tests/load/weaviate/test_weaviate_client.py @@ -9,8 +9,8 @@ from dlt.common.schema.typing import TWriteDisposition, TColumnSchema, TTableSchemaColumns from dlt.destinations import weaviate -from dlt.destinations.weaviate.exceptions import PropertyNameConflict -from dlt.destinations.weaviate.weaviate_client import WeaviateClient +from dlt.destinations.impl.weaviate.exceptions import PropertyNameConflict +from dlt.destinations.impl.weaviate.weaviate_client import 
WeaviateClient from dlt.common.storages.file_storage import FileStorage from dlt.common.schema.utils import new_table @@ -27,9 +27,10 @@ def drop_weaviate_schema() -> Iterator[None]: def get_client_instance(schema: Schema) -> WeaviateClient: - config = weaviate.spec()(dataset_name="ClientTest" + uniq_id()) - with Container().injectable_context(ConfigSectionContext(sections=('destination', 'weaviate'))): - return weaviate.client(schema, config) # type: ignore[return-value] + dest = weaviate(dataset_name="ClientTest" + uniq_id()) + return dest.client(schema, dest.spec()) + # with Container().injectable_context(ConfigSectionContext(sections=('destination', 'weaviate'))): + # return dest.client(schema, config) @pytest.fixture(scope='function') @@ -44,7 +45,7 @@ def ci_client() -> Iterator[WeaviateClient]: def make_client(naming_convention: str) -> Iterator[WeaviateClient]: schema = Schema('test_schema', { - 'names': f"dlt.destinations.weaviate.{naming_convention}", + 'names': f"dlt.destinations.impl.weaviate.{naming_convention}", 'json': None }) _client = get_client_instance(schema) diff --git a/tests/load/weaviate/utils.py b/tests/load/weaviate/utils.py index d5568b0598..ed378191e6 100644 --- a/tests/load/weaviate/utils.py +++ b/tests/load/weaviate/utils.py @@ -6,8 +6,8 @@ from dlt.common.configuration.container import Container from dlt.common.schema.utils import get_columns_names_with_prop -from dlt.destinations.weaviate.weaviate_client import WeaviateClient -from dlt.destinations.weaviate.weaviate_adapter import VECTORIZE_HINT, TOKENIZATION_HINT +from dlt.destinations.impl.weaviate.weaviate_client import WeaviateClient +from dlt.destinations.impl.weaviate.weaviate_adapter import VECTORIZE_HINT, TOKENIZATION_HINT def assert_unordered_list_equal(list1: List[Any], list2: List[Any]) -> None: diff --git a/tests/normalize/utils.py b/tests/normalize/utils.py index 3ee14948c1..0ce099d4b6 100644 --- a/tests/normalize/utils.py +++ b/tests/normalize/utils.py @@ -1,10 +1,10 @@ from typing import Mapping, cast -from dlt.destinations.duckdb import capabilities as duck_insert_caps -from dlt.destinations.redshift import capabilities as rd_insert_caps -from dlt.destinations.postgres import capabilities as pg_insert_caps -from dlt.destinations.bigquery import capabilities as jsonl_caps -from dlt.destinations.filesystem import capabilities as filesystem_caps +from dlt.destinations.impl.duckdb import capabilities as duck_insert_caps +from dlt.destinations.impl.redshift import capabilities as rd_insert_caps +from dlt.destinations.impl.postgres import capabilities as pg_insert_caps +from dlt.destinations.impl.bigquery import capabilities as jsonl_caps +from dlt.destinations.impl.filesystem import capabilities as filesystem_caps DEFAULT_CAPS = pg_insert_caps diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py index 09d8e98d82..2f383c1c0a 100644 --- a/tests/pipeline/test_dlt_versions.py +++ b/tests/pipeline/test_dlt_versions.py @@ -10,8 +10,8 @@ from dlt.common.storages import FileStorage from dlt.common.schema.typing import LOADS_TABLE_NAME, VERSION_TABLE_NAME, TStoredSchema from dlt.common.configuration.resolve import resolve_configuration -from dlt.destinations.duckdb.configuration import DuckDbClientConfiguration -from dlt.destinations.duckdb.sql_client import DuckDbSqlClient +from dlt.destinations.impl.duckdb.configuration import DuckDbClientConfiguration +from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient from tests.utils import TEST_STORAGE_ROOT, 
test_storage diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index c778e47cd6..3fcb38d915 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -21,7 +21,9 @@ from dlt.common.runtime.collector import AliveCollector, EnlightenCollector, LogCollector, TqdmCollector from dlt.common.schema.utils import new_column, new_table from dlt.common.utils import uniq_id +from dlt.common.schema import Schema +from dlt.destinations import filesystem, redshift, dummy from dlt.extract.exceptions import InvalidResourceDataTypeBasic, PipeGenInvalid, SourceExhausted from dlt.extract.extract import ExtractorStorage from dlt.extract.source import DltResource, DltSource @@ -173,7 +175,7 @@ def test_configured_destination(environment) -> None: p = dlt.pipeline() assert p.destination is not None - assert p.destination.__name__.endswith("postgres") + assert p.destination.name.endswith("postgres") assert p.pipeline_name == "postgres_pipe" @@ -228,6 +230,56 @@ def test_destination_explicit_credentials(environment: Any) -> None: assert config.credentials.is_resolved() +def test_destination_staging_config(environment: Any) -> None: + fs_dest = filesystem("file:///testing-bucket") + p = dlt.pipeline( + pipeline_name="staging_pipeline", + destination=redshift(credentials="redshift://loader:loader@localhost:5432/dlt_data"), + staging=fs_dest + ) + schema = Schema("foo") + p._inject_schema(schema) + initial_config = p._get_destination_client_initial_config(p.staging, as_staging=True) + staging_config = fs_dest.configuration(initial_config) # type: ignore[arg-type] + + # Ensure that as_staging flag is set in the final resolved conifg + assert staging_config.as_staging is True + + +def test_destination_factory_defaults_resolve_from_config(environment: Any) -> None: + """Params passed explicitly to destination supersede config values. + Env config values supersede default values. + """ + environment["FAIL_PROB"] = "0.3" + environment["RETRY_PROB"] = "0.8" + p = dlt.pipeline(pipeline_name="dummy_pipeline", destination=dummy(retry_prob=0.5)) + + client = p.destination_client() + + assert client.config.fail_prob == 0.3 # type: ignore[attr-defined] + assert client.config.retry_prob == 0.5 # type: ignore[attr-defined] + + +def test_destination_credentials_in_factory(environment: Any) -> None: + os.environ['DESTINATION__REDSHIFT__CREDENTIALS'] = "redshift://abc:123@localhost:5432/some_db" + + redshift_dest = redshift("redshift://abc:123@localhost:5432/other_db") + + p = dlt.pipeline(pipeline_name="dummy_pipeline", destination=redshift_dest) + + initial_config = p._get_destination_client_initial_config(p.destination) + dest_config = redshift_dest.configuration(initial_config) # type: ignore[arg-type] + # Explicit factory arg supersedes config + assert dest_config.credentials.database == "other_db" + + redshift_dest = redshift() + p = dlt.pipeline(pipeline_name="dummy_pipeline", destination=redshift_dest) + + initial_config = p._get_destination_client_initial_config(p.destination) + dest_config = redshift_dest.configuration(initial_config) # type: ignore[arg-type] + assert dest_config.credentials.database == "some_db" + + @pytest.mark.skip(reason="does not work on CI. 
probably takes right credentials from somewhere....") def test_destination_explicit_invalid_credentials_filesystem(environment: Any) -> None: # if string cannot be parsed diff --git a/tests/pipeline/test_pipeline_state.py b/tests/pipeline/test_pipeline_state.py index 14b881eedc..0e8dea2145 100644 --- a/tests/pipeline/test_pipeline_state.py +++ b/tests/pipeline/test_pipeline_state.py @@ -48,8 +48,8 @@ def test_restore_state_props() -> None: assert state["destination"].endswith("redshift") assert state["staging"].endswith("filesystem") # also instances are restored - assert p.destination.__name__.endswith("redshift") - assert p.staging.__name__.endswith("filesystem") + assert p.destination.name.endswith("redshift") + assert p.staging.name.endswith("filesystem") def test_managed_state() -> None: diff --git a/tests/tools/clean_redshift.py b/tests/tools/clean_redshift.py index 7444d69685..27680b26cd 100644 --- a/tests/tools/clean_redshift.py +++ b/tests/tools/clean_redshift.py @@ -1,5 +1,5 @@ -from dlt.destinations.postgres.postgres import PostgresClient -from dlt.destinations.postgres.sql_client import psycopg2 +from dlt.destinations.impl.postgres.postgres import PostgresClient +from dlt.destinations.impl.postgres.sql_client import psycopg2 from psycopg2.errors import InsufficientPrivilege, InternalError_, SyntaxError CONNECTION_STRING = ""
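
Note for reviewers: the test rewrites above all exercise the same underlying change, i.e. destination client modules moving under `dlt.destinations.impl.*` and pipelines receiving configured destination factories instead of bare modules or `DestinationReference` lookups. The sketch below restates that usage pattern outside the test suite. It is a minimal illustration assembled only from calls that appear in these tests (`duckdb()`, `redshift()`, `filesystem()`, `Destination.from_reference()`, `.name`); it assumes the dlt revision of this patch series, the `duckdb` factory export is inferred from its use in the duckdb tests, and all paths and credential strings are placeholders rather than repository values.

```py
# Minimal sketch of the factory-style destination API exercised by the tests above.
# Assumes the dlt revision introduced by this patch; paths and credentials are placeholders.
import dlt
from dlt.destinations import duckdb, redshift, filesystem
from dlt.common.destination import Destination

# a destination is now a configured factory passed to the pipeline,
# not a module reference
pipeline = dlt.pipeline(
    pipeline_name="quack_pipeline",
    destination=duckdb(credentials="_storage/quack.duckdb"),
)

# staging destinations follow the same pattern
staged = dlt.pipeline(
    pipeline_name="staging_pipeline",
    destination=redshift(credentials="redshift://loader:loader@localhost:5432/dlt_data"),
    staging=filesystem("file:///testing-bucket"),
)

# destinations can still be resolved from a short name string
dummy_dest = Destination.from_reference("dummy")

# factories expose a plain `name`, replacing checks against a module __name__
assert pipeline.destination.name.endswith("duckdb")

# internal client classes now live under dlt.destinations.impl.<name>,
# e.g. dlt.destinations.impl.duckdb.duck.DuckDbClient, as the import rewrites above show
```

Passing an already-configured factory is what lets explicit arguments supersede values resolved from config or the environment, which is the behaviour asserted by `test_destination_credentials_in_factory` and `test_destination_factory_defaults_resolve_from_config` in this patch.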