diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a8a8cc37ae..85dbf37c97 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -52,6 +52,29 @@ We use **master** branch for hot fixes (including documentation) that needs to b On the release day, **devel** branch is merged into **master**. All releases of `dlt` happen only from the **master**. +### Branch naming rules + +We want to make sure that our git history explains in a human-readable way what has been changed with which branch or PR. To this end, we are using the following branch naming pattern (all lowercase and dashes, no underscores): + +```sh +{category}/{ticket-id}-description-of-the-branch +# example: +feat/4922-add-avro-support +``` + +#### Branch categories + +* **feat** - a new feature that is being implemented (ticket required) +* **fix** - a change that fixes a bug (ticket required) +* **exp** - an experiment where we are testing a new idea or want to demonstrate something to the team; it might turn into a `feat` later (ticket encouraged) +* **test** - anything related to the tests (ticket encouraged) +* **blogs** - a new entry to our blog (ticket optional) +* **docs** - a change to our docs (ticket optional) + +#### Ticket Numbers + +We encourage you to attach your branches to a ticket; if none exists, create one and explain what you are doing. For `feat` and `fix` branches, tickets are mandatory; for `exp` and `test` branches they are encouraged; and for `blogs` and `docs` branches they are optional. + ### Submitting a hotfix We'll fix critical bugs and release `dlt` out of the schedule. Follow the regular procedure, but make your PR against **master** branch. Please ping us on Slack if you do it. @@ -166,3 +189,4 @@ Once the version has been bumped, follow these steps to publish the new release - [Poetry Documentation](https://python-poetry.org/docs/) If you have any questions or need help, don't hesitate to reach out to us. We're here to help you succeed in contributing to `dlt`. Happy coding! 
+**** \ No newline at end of file diff --git a/dlt/common/configuration/__init__.py b/dlt/common/configuration/__init__.py index 8de57f7799..2abc31b17d 100644 --- a/dlt/common/configuration/__init__.py +++ b/dlt/common/configuration/__init__.py @@ -1,4 +1,10 @@ -from .specs.base_configuration import configspec, is_valid_hint, is_secret_hint, resolve_type +from .specs.base_configuration import ( + configspec, + is_valid_hint, + is_secret_hint, + resolve_type, + NotResolved, +) from .specs import known_sections from .resolve import resolve_configuration, inject_section from .inject import with_config, last_config, get_fun_spec, create_resolved_partial @@ -15,6 +21,7 @@ "configspec", "is_valid_hint", "is_secret_hint", + "NotResolved", "resolve_type", "known_sections", "resolve_configuration", diff --git a/dlt/common/configuration/resolve.py b/dlt/common/configuration/resolve.py index ebfa7b6b89..9101cfdd9c 100644 --- a/dlt/common/configuration/resolve.py +++ b/dlt/common/configuration/resolve.py @@ -8,7 +8,6 @@ StrAny, TSecretValue, get_all_types_of_class_in_union, - is_final_type, is_optional_type, is_union_type, ) @@ -21,6 +20,7 @@ is_context_inner_hint, is_base_configuration_inner_hint, is_valid_hint, + is_hint_not_resolved, ) from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.configuration.specs.exceptions import NativeValueError @@ -194,7 +194,7 @@ def _resolve_config_fields( if explicit_values: explicit_value = explicit_values.get(key) else: - if is_final_type(hint): + if is_hint_not_resolved(hint): # for final fields default value is like explicit explicit_value = default_value else: @@ -258,7 +258,7 @@ def _resolve_config_fields( unresolved_fields[key] = traces # set resolved value in config if default_value != current_value: - if not is_final_type(hint): + if not is_hint_not_resolved(hint): # ignore final types setattr(config, key, current_value) diff --git a/dlt/common/configuration/specs/__init__.py b/dlt/common/configuration/specs/__init__.py index 9acf14bde3..f1d7d819ff 100644 --- a/dlt/common/configuration/specs/__init__.py +++ b/dlt/common/configuration/specs/__init__.py @@ -20,7 +20,13 @@ from .connection_string_credentials import ConnectionStringCredentials from .api_credentials import OAuth2Credentials from .aws_credentials import AwsCredentials, AwsCredentialsWithoutDefaults -from .azure_credentials import AzureCredentials, AzureCredentialsWithoutDefaults +from .azure_credentials import ( + AzureCredentials, + AzureCredentialsWithoutDefaults, + AzureServicePrincipalCredentials, + AzureServicePrincipalCredentialsWithoutDefaults, + AnyAzureCredentials, +) # backward compatibility for service account credentials @@ -51,6 +57,9 @@ "AwsCredentialsWithoutDefaults", "AzureCredentials", "AzureCredentialsWithoutDefaults", + "AzureServicePrincipalCredentials", + "AzureServicePrincipalCredentialsWithoutDefaults", + "AnyAzureCredentials", "GcpClientCredentials", "GcpClientCredentialsWithDefault", ] diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py index 52d33ec0d3..8b8fc259f2 100644 --- a/dlt/common/configuration/specs/azure_credentials.py +++ b/dlt/common/configuration/specs/azure_credentials.py @@ -1,4 +1,4 @@ -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, Union from dlt.common.pendulum import pendulum from dlt.common.typing import TSecretStrValue @@ -7,10 +7,6 @@ CredentialsWithDefault, configspec, ) -from 
dlt.common.configuration.specs.exceptions import InvalidBoto3Session -from dlt import version - -import fsspec @configspec @@ -50,6 +46,22 @@ def on_partial(self) -> None: self.resolve() +@configspec +class AzureServicePrincipalCredentialsWithoutDefaults(CredentialsConfiguration): + azure_storage_account_name: str = None + azure_tenant_id: str = None + azure_client_id: str = None + azure_client_secret: TSecretStrValue = None + + def to_adlfs_credentials(self) -> Dict[str, Any]: + return dict( + account_name=self.azure_storage_account_name, + tenant_id=self.azure_tenant_id, + client_id=self.azure_client_id, + client_secret=self.azure_client_secret, + ) + + @configspec class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault): def on_partial(self) -> None: @@ -67,3 +79,31 @@ def to_adlfs_credentials(self) -> Dict[str, Any]: if self.has_default_credentials(): base_kwargs["anon"] = False return base_kwargs + + +@configspec +class AzureServicePrincipalCredentials( + AzureServicePrincipalCredentialsWithoutDefaults, CredentialsWithDefault +): + def on_partial(self) -> None: + from azure.identity import DefaultAzureCredential + + self._set_default_credentials(DefaultAzureCredential()) + if self.azure_storage_account_name: + self.resolve() + + def to_adlfs_credentials(self) -> Dict[str, Any]: + base_kwargs = super().to_adlfs_credentials() + if self.has_default_credentials(): + base_kwargs["anon"] = False + return base_kwargs + + +AnyAzureCredentials = Union[ + # Credentials without defaults come first because union types are attempted in order + # and explicit config should supersede system defaults + AzureCredentialsWithoutDefaults, + AzureServicePrincipalCredentialsWithoutDefaults, + AzureCredentials, + AzureServicePrincipalCredentials, +] diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 1329feae6c..006cde8dce 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -20,7 +20,7 @@ ClassVar, TypeVar, ) -from typing_extensions import get_args, get_origin, dataclass_transform +from typing_extensions import get_args, get_origin, dataclass_transform, Annotated, TypeAlias from functools import wraps if TYPE_CHECKING: @@ -29,8 +29,11 @@ TDtcField = dataclasses.Field from dlt.common.typing import ( + AnyType, TAnyClass, extract_inner_type, + is_annotated, + is_final_type, is_optional_type, is_union_type, ) @@ -48,6 +51,34 @@ _C = TypeVar("_C", bound="CredentialsConfiguration") +class NotResolved: + """Used in type annotations to indicate types that should not be resolved.""" + + def __init__(self, not_resolved: bool = True): + self.not_resolved = not_resolved + + def __bool__(self) -> bool: + return self.not_resolved + + +def is_hint_not_resolved(hint: AnyType) -> bool: + """Checks if hint should NOT be resolved. Final and types annotated like + + >>> Annotated[str, NotResolved()] + + are not resolved. 
+ """ + if is_final_type(hint): + return True + + if is_annotated(hint): + _, *a_m = get_args(hint) + for annotation in a_m: + if isinstance(annotation, NotResolved): + return bool(annotation) + return False + + def is_base_configuration_inner_hint(inner_hint: Type[Any]) -> bool: return inspect.isclass(inner_hint) and issubclass(inner_hint, BaseConfiguration) @@ -70,6 +101,11 @@ def is_valid_hint(hint: Type[Any]) -> bool: if get_origin(hint) is ClassVar: # class vars are skipped by dataclass return True + + if is_hint_not_resolved(hint): + # all hints that are not resolved are valid + return True + hint = extract_inner_type(hint) hint = get_config_if_union_hint(hint) or hint hint = get_origin(hint) or hint diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index fdd5b50111..bd32c68c49 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -55,7 +55,10 @@ def __init__( self.closed_files: List[DataWriterMetrics] = [] # all fully processed files # buffered items must be less than max items in file self.buffer_max_items = min(buffer_max_items, file_max_items or buffer_max_items) + # Explicitly configured max size supersedes destination limit self.file_max_bytes = file_max_bytes + if self.file_max_bytes is None and _caps: + self.file_max_bytes = _caps.recommended_file_size self.file_max_items = file_max_items # the open function is either gzip.open or open self.open = ( diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index e74f5a980d..089b4a1d5e 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -29,6 +29,8 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): preferred_loader_file_format: TLoaderFileFormat = None supported_loader_file_formats: Sequence[TLoaderFileFormat] = None + recommended_file_size: Optional[int] = None + """Recommended file size in bytes when writing extract/load files""" preferred_staging_file_format: Optional[TLoaderFileFormat] = None supported_staging_file_formats: Sequence[TLoaderFileFormat] = None escape_identifier: Callable[[str], str] = None diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 2ad5131e63..d4cdfb729d 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -18,8 +18,8 @@ Any, TypeVar, Generic, - Final, ) +from typing_extensions import Annotated import datetime # noqa: 251 from copy import deepcopy import inspect @@ -35,7 +35,7 @@ has_column_with_prop, get_first_column_name_with_prop, ) -from dlt.common.configuration import configspec, resolve_configuration, known_sections +from dlt.common.configuration import configspec, resolve_configuration, known_sections, NotResolved from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration from dlt.common.configuration.accessors import config from dlt.common.destination.capabilities import DestinationCapabilitiesContext @@ -78,7 +78,7 @@ class StateInfo(NamedTuple): @configspec class DestinationClientConfiguration(BaseConfiguration): - destination_type: Final[str] = dataclasses.field( + destination_type: Annotated[str, NotResolved()] = dataclasses.field( default=None, init=False, repr=False, compare=False ) # which destination to load data to credentials: Optional[CredentialsConfiguration] = None @@ -103,11 +103,11 @@ def on_resolved(self) -> None: class DestinationClientDwhConfiguration(DestinationClientConfiguration): 
"""Configuration of a destination that supports datasets/schemas""" - dataset_name: Final[str] = dataclasses.field( + dataset_name: Annotated[str, NotResolved()] = dataclasses.field( default=None, init=False, repr=False, compare=False - ) # dataset must be final so it is not configurable + ) # dataset cannot be resolved """dataset name in the destination to load data to, for schemas that are not default schema, it is used as dataset prefix""" - default_schema_name: Final[Optional[str]] = dataclasses.field( + default_schema_name: Annotated[Optional[str], NotResolved()] = dataclasses.field( default=None, init=False, repr=False, compare=False ) """name of default schema to be used to name effective dataset to load data to""" @@ -121,8 +121,8 @@ def _bind_dataset_name( This method is intended to be used internally. """ - self.dataset_name = dataset_name # type: ignore[misc] - self.default_schema_name = default_schema_name # type: ignore[misc] + self.dataset_name = dataset_name + self.default_schema_name = default_schema_name return self def normalize_dataset_name(self, schema: Schema) -> str: diff --git a/dlt/common/storages/configuration.py b/dlt/common/storages/configuration.py index a1838fab6e..6e100536af 100644 --- a/dlt/common/storages/configuration.py +++ b/dlt/common/storages/configuration.py @@ -10,8 +10,7 @@ GcpServiceAccountCredentials, AwsCredentials, GcpOAuthCredentials, - AzureCredentials, - AzureCredentialsWithoutDefaults, + AnyAzureCredentials, BaseConfiguration, ) from dlt.common.typing import DictStrAny @@ -49,7 +48,7 @@ class LoadStorageConfiguration(BaseConfiguration): FileSystemCredentials = Union[ - AwsCredentials, GcpServiceAccountCredentials, AzureCredentials, GcpOAuthCredentials + AwsCredentials, GcpServiceAccountCredentials, AnyAzureCredentials, GcpOAuthCredentials ] @@ -70,9 +69,9 @@ class FilesystemConfiguration(BaseConfiguration): "gcs": Union[GcpServiceAccountCredentials, GcpOAuthCredentials], "gdrive": Union[GcpServiceAccountCredentials, GcpOAuthCredentials], "s3": AwsCredentials, - "az": Union[AzureCredentialsWithoutDefaults, AzureCredentials], - "abfs": Union[AzureCredentialsWithoutDefaults, AzureCredentials], - "adl": Union[AzureCredentialsWithoutDefaults, AzureCredentials], + "az": AnyAzureCredentials, + "abfs": AnyAzureCredentials, + "adl": AnyAzureCredentials, } bucket_url: str = None diff --git a/dlt/destinations/adapters.py b/dlt/destinations/adapters.py index 554bd88924..1c3e094e19 100644 --- a/dlt/destinations/adapters.py +++ b/dlt/destinations/adapters.py @@ -5,6 +5,7 @@ from dlt.destinations.impl.bigquery import bigquery_adapter from dlt.destinations.impl.synapse import synapse_adapter from dlt.destinations.impl.clickhouse import clickhouse_adapter +from dlt.destinations.impl.athena import athena_adapter __all__ = [ "weaviate_adapter", @@ -12,4 +13,5 @@ "bigquery_adapter", "synapse_adapter", "clickhouse_adapter", + "athena_adapter", ] diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 7e1ab8fc27..8f043ba4d5 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -11,6 +11,7 @@ Callable, Iterable, Type, + cast, ) from copy import deepcopy import re @@ -69,6 +70,7 @@ from dlt.destinations.impl.athena.configuration import AthenaClientConfiguration from dlt.destinations.type_mapping import TypeMapper from dlt.destinations import path_utils +from dlt.destinations.impl.athena.athena_adapter import PARTITION_HINT class AthenaTypeMapper(TypeMapper): @@ -405,6 +407,16 @@ 
def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non f"{self.sql_client.escape_ddl_identifier(c['name'])} {self.type_mapper.to_db_type(c, table_format)}" ) + def _iceberg_partition_clause(self, partition_hints: Optional[Dict[str, str]]) -> str: + if not partition_hints: + return "" + formatted_strings = [] + for column_name, template in partition_hints.items(): + formatted_strings.append( + template.format(column_name=self.sql_client.escape_ddl_identifier(column_name)) + ) + return f"PARTITIONED BY ({', '.join(formatted_strings)})" + def _get_table_update_sql( self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool ) -> List[str]: @@ -431,8 +443,12 @@ def _get_table_update_sql( sql.append(f"""ALTER TABLE {qualified_table_name} ADD COLUMNS ({columns});""") else: if is_iceberg: + partition_clause = self._iceberg_partition_clause( + cast(Optional[Dict[str, str]], table.get(PARTITION_HINT)) + ) sql.append(f"""CREATE TABLE {qualified_table_name} ({columns}) + {partition_clause} LOCATION '{location.rstrip('/')}' TBLPROPERTIES ('table_type'='ICEBERG', 'format'='parquet');""") elif table_format == "jsonl": diff --git a/dlt/destinations/impl/athena/athena_adapter.py b/dlt/destinations/impl/athena/athena_adapter.py new file mode 100644 index 0000000000..cb600335c0 --- /dev/null +++ b/dlt/destinations/impl/athena/athena_adapter.py @@ -0,0 +1,117 @@ +from typing import Any, Optional, Dict, Protocol, Sequence, Union, Final + +from dateutil import parser + +from dlt.common.pendulum import timezone +from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns, TColumnSchema +from dlt.destinations.utils import ensure_resource +from dlt.extract import DltResource +from dlt.extract.items import TTableHintTemplate + + +PARTITION_HINT: Final[str] = "x-athena-partition" + + +class PartitionTransformation: + template: str + """Template string of the transformation including column name placeholder. E.g. `bucket(16, {column_name})`""" + column_name: str + """Column name to apply the transformation to""" + + def __init__(self, template: str, column_name: str) -> None: + self.template = template + self.column_name = column_name + + +class athena_partition: + """Helper class to generate iceberg partition transformations + + E.g. `athena_partition.bucket(16, "id")` will return a transformation with template `bucket(16, {column_name})` + This can be correctly rendered by the athena loader with escaped column name. 
+ """ + + @staticmethod + def year(column_name: str) -> PartitionTransformation: + """Partition by year part of a date or timestamp column.""" + return PartitionTransformation("year({column_name})", column_name) + + @staticmethod + def month(column_name: str) -> PartitionTransformation: + """Partition by month part of a date or timestamp column.""" + return PartitionTransformation("month({column_name})", column_name) + + @staticmethod + def day(column_name: str) -> PartitionTransformation: + """Partition by day part of a date or timestamp column.""" + return PartitionTransformation("day({column_name})", column_name) + + @staticmethod + def hour(column_name: str) -> PartitionTransformation: + """Partition by hour part of a date or timestamp column.""" + return PartitionTransformation("hour({column_name})", column_name) + + @staticmethod + def bucket(n: int, column_name: str) -> PartitionTransformation: + """Partition by hashed value to n buckets.""" + return PartitionTransformation(f"bucket({n}, {{column_name}})", column_name) + + @staticmethod + def truncate(length: int, column_name: str) -> PartitionTransformation: + """Partition by value truncated to length.""" + return PartitionTransformation(f"truncate({length}, {{column_name}})", column_name) + + +def athena_adapter( + data: Any, + partition: Union[ + str, PartitionTransformation, Sequence[Union[str, PartitionTransformation]] + ] = None, +) -> DltResource: + """ + Prepares data for loading into Athena + + Args: + data: The data to be transformed. + This can be raw data or an instance of DltResource. + If raw data is provided, the function will wrap it into a `DltResource` object. + partition: Column name(s) or instances of `PartitionTransformation` to partition the table by. + To use a transformation it's best to use the methods of the helper class `athena_partition` + to generate correctly escaped SQL in the loader. + + Returns: + A `DltResource` object that is ready to be loaded into BigQuery. + + Raises: + ValueError: If any hint is invalid or none are specified. + + Examples: + >>> data = [{"name": "Marcel", "department": "Engineering", "date_hired": "2024-01-30"}] + >>> athena_adapter(data, partition=["department", athena_partition.year("date_hired"), athena_partition.bucket(8, "name")]) + [DltResource with hints applied] + """ + resource = ensure_resource(data) + additional_table_hints: Dict[str, TTableHintTemplate[Any]] = {} + + if partition: + if isinstance(partition, str) or not isinstance(partition, Sequence): + partition = [partition] + + # Partition hint is `{column_name: template}`, e.g. 
`{"department": "{column_name}", "date_hired": "year({column_name})"}` + # Use one dict for all hints instead of storing on column so order is preserved + partition_hint: Dict[str, str] = {} + + for item in partition: + if isinstance(item, PartitionTransformation): + # Client will generate the final SQL string with escaped column name injected + partition_hint[item.column_name] = item.template + else: + # Item is the column name + partition_hint[item] = "{column_name}" + + additional_table_hints[PARTITION_HINT] = partition_hint + + if additional_table_hints: + resource.apply_hints(additional_table_hints=additional_table_hints) + else: + raise ValueError("A value for `partition` must be specified.") + return resource diff --git a/dlt/destinations/impl/bigquery/__init__.py b/dlt/destinations/impl/bigquery/__init__.py index d33466ed5e..39322b43a0 100644 --- a/dlt/destinations/impl/bigquery/__init__.py +++ b/dlt/destinations/impl/bigquery/__init__.py @@ -12,6 +12,8 @@ def capabilities() -> DestinationCapabilitiesContext: caps.supported_loader_file_formats = ["jsonl", "parquet"] caps.preferred_staging_file_format = "parquet" caps.supported_staging_file_formats = ["parquet", "jsonl"] + # BQ limit is 4GB but leave a large headroom since buffered writer does not preemptively check size + caps.recommended_file_size = int(1024 * 1024 * 1024) caps.escape_identifier = escape_bigquery_identifier caps.escape_literal = None caps.format_datetime_literal = format_bigquery_datetime_literal diff --git a/dlt/destinations/impl/mssql/__init__.py b/dlt/destinations/impl/mssql/__init__.py index e9d9fe24fd..f7768d9238 100644 --- a/dlt/destinations/impl/mssql/__init__.py +++ b/dlt/destinations/impl/mssql/__init__.py @@ -17,7 +17,8 @@ def capabilities() -> DestinationCapabilitiesContext: # https://learn.microsoft.com/en-us/sql/sql-server/maximum-capacity-specifications-for-sql-server?view=sql-server-ver16&redirectedfrom=MSDN caps.max_identifier_length = 128 caps.max_column_identifier_length = 128 - caps.max_query_length = 4 * 1024 * 64 * 1024 + # A SQL Query can be a varchar(max) but is shown as limited to 65,536 * Network Packet + caps.max_query_length = 65536 * 10 caps.is_max_query_length_in_bytes = True caps.max_text_data_type_length = 2**30 - 1 caps.is_max_text_data_type_length_in_bytes = False diff --git a/dlt/destinations/impl/mssql/mssql.py b/dlt/destinations/impl/mssql/mssql.py index 8de15e2bd9..6f364c8af1 100644 --- a/dlt/destinations/impl/mssql/mssql.py +++ b/dlt/destinations/impl/mssql/mssql.py @@ -181,7 +181,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non if c.get(h, False) is True ) column_name = self.capabilities.escape_identifier(c["name"]) - return f"{column_name} {db_type} {hints_str} {self._gen_not_null(c['nullable'])}" + return f"{column_name} {db_type} {hints_str} {self._gen_not_null(c.get('nullable', True))}" def _create_replace_followup_jobs( self, table_chain: Sequence[TTableSchema] diff --git a/dlt/destinations/impl/qdrant/configuration.py b/dlt/destinations/impl/qdrant/configuration.py index d589537742..fd11cc7dcb 100644 --- a/dlt/destinations/impl/qdrant/configuration.py +++ b/dlt/destinations/impl/qdrant/configuration.py @@ -1,7 +1,8 @@ import dataclasses from typing import Optional, Final +from typing_extensions import Annotated -from dlt.common.configuration import configspec +from dlt.common.configuration import configspec, NotResolved from dlt.common.configuration.specs.base_configuration import ( BaseConfiguration, CredentialsConfiguration, @@ 
-55,7 +56,9 @@ class QdrantClientConfiguration(DestinationClientDwhConfiguration): dataset_separator: str = "_" # make it optional so empty dataset is allowed - dataset_name: Final[Optional[str]] = dataclasses.field(default=None, init=False, repr=False, compare=False) # type: ignore[misc] + dataset_name: Annotated[Optional[str], NotResolved()] = dataclasses.field( + default=None, init=False, repr=False, compare=False + ) # Batch size for generating embeddings embedding_batch_size: int = 32 diff --git a/dlt/destinations/impl/weaviate/configuration.py b/dlt/destinations/impl/weaviate/configuration.py index 90fb7ce5ce..1a053e41f4 100644 --- a/dlt/destinations/impl/weaviate/configuration.py +++ b/dlt/destinations/impl/weaviate/configuration.py @@ -1,8 +1,9 @@ import dataclasses from typing import Dict, Literal, Optional, Final +from typing_extensions import Annotated from urllib.parse import urlparse -from dlt.common.configuration import configspec +from dlt.common.configuration import configspec, NotResolved from dlt.common.configuration.specs.base_configuration import CredentialsConfiguration from dlt.common.destination.reference import DestinationClientDwhConfiguration from dlt.common.utils import digest128 @@ -26,7 +27,9 @@ def __str__(self) -> str: class WeaviateClientConfiguration(DestinationClientDwhConfiguration): destination_type: Final[str] = dataclasses.field(default="weaviate", init=False, repr=False, compare=False) # type: ignore # make it optional so empty dataset is allowed - dataset_name: Optional[str] = None # type: ignore[misc] + dataset_name: Annotated[Optional[str], NotResolved()] = dataclasses.field( + default=None, init=False, repr=False, compare=False + ) batch_size: int = 100 batch_workers: int = 1 diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index fac6391e01..9c4076cfa7 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -567,16 +567,13 @@ def _wrap(*args: Any, **kwargs: Any) -> TDltResourceImpl: compat_wrapper(actual_resource_name, conf_f, sig, *args, **kwargs), incremental, ) - except InvalidResourceDataTypeFunctionNotAGenerator as gen_ex: + except InvalidResourceDataTypeFunctionNotAGenerator: # we allow an edge case: resource can return another resource - try: - # actually call the function to see if it contains DltResource - data_ = conf_f(*args, **kwargs) - if not isinstance(data_, DltResource): - raise - r = data_ # type: ignore[assignment] - except Exception: - raise gen_ex from None + # actually call the function to see if it contains DltResource + data_ = conf_f(*args, **kwargs) + if not isinstance(data_, DltResource): + raise + r = data_ # type: ignore[assignment] # consider transformer arguments bound r._args_bound = True # keep explicit args passed diff --git a/dlt/helpers/dbt/__init__.py b/dlt/helpers/dbt/__init__.py index b555bcd3a9..4801dcd6b9 100644 --- a/dlt/helpers/dbt/__init__.py +++ b/dlt/helpers/dbt/__init__.py @@ -11,7 +11,7 @@ from dlt.helpers.dbt.runner import create_runner, DBTPackageRunner -DEFAULT_DBT_VERSION = ">=1.1,<1.6" +DEFAULT_DBT_VERSION = ">=1.5,<1.9" # a map of destination names to dbt package names in case they don't match the pure destination name DBT_DESTINATION_MAP = { diff --git a/dlt/helpers/dbt/dbt_utils.py b/dlt/helpers/dbt/dbt_utils.py index bf14504eaa..80774d9858 100644 --- a/dlt/helpers/dbt/dbt_utils.py +++ b/dlt/helpers/dbt/dbt_utils.py @@ -24,7 +24,6 @@ # https://stackoverflow.com/questions/48619517/call-a-click-command-from-code import dbt.logger - from dbt.events import 
functions from dbt.contracts import results as dbt_results except ModuleNotFoundError: raise MissingDependencyException("DBT Core", ["dbt-core"]) @@ -56,17 +55,6 @@ def set_path_wrapper(self: dbt.logger.LogManager, path: str) -> None: self._file_handler.set_path(path) _DBT_LOGGER_INITIALIZED = True - # def setup_event_logger_wrapper(log_path: str, level_override:str = None) -> None: - # global _DBT_LOGGER_INITIALIZED - - # if not _DBT_LOGGER_INITIALIZED: - # functions.setup_event_logger(log_path, level.lower()) - # # force log level as file is debug only - # # functions.this.FILE_LOG.setLevel(level) - # # functions.this.FILE_LOG.handlers[0].setLevel(level) - # _DBT_LOGGER_INITIALIZED = True - - # dbt.main.setup_event_logger = setup_event_logger_wrapper dbt.logger.LogManager.set_path = set_path_wrapper # type: ignore globs = [] diff --git a/dlt/load/configuration.py b/dlt/load/configuration.py index 97cf23fdfc..b3fc2fbcd4 100644 --- a/dlt/load/configuration.py +++ b/dlt/load/configuration.py @@ -15,6 +15,9 @@ class LoaderConfiguration(PoolRunnerConfiguration): raise_on_max_retries: int = 5 """When gt 0 will raise when job reaches raise_on_max_retries""" _load_storage_config: LoadStorageConfiguration = None + # if set to `True`, the staging dataset will be + # truncated after loading the data + truncate_staging_dataset: bool = False def on_resolved(self) -> None: self.pool_type = "none" if self.workers == 1 else "thread" diff --git a/dlt/load/load.py b/dlt/load/load.py index 66ddb1c308..9d898bc54d 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -53,7 +53,7 @@ LoadClientUnsupportedWriteDisposition, LoadClientUnsupportedFileFormats, ) -from dlt.load.utils import get_completed_table_chain, init_client +from dlt.load.utils import _extend_tables_with_table_chain, get_completed_table_chain, init_client class Load(Runnable[Executor], WithStepInfo[LoadMetrics, LoadInfo]): @@ -348,6 +348,8 @@ def complete_package(self, load_id: str, schema: Schema, aborted: bool = False) ) ): job_client.complete_load(load_id) + self._maybe_truncate_staging_dataset(schema, job_client) + self.load_storage.complete_load_package(load_id, aborted) # collect package info self._loaded_packages.append(self.load_storage.get_load_package_info(load_id)) @@ -490,6 +492,37 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: return TRunMetrics(False, len(self.load_storage.list_normalized_packages())) + def _maybe_truncate_staging_dataset(self, schema: Schema, job_client: JobClientBase) -> None: + """ + Truncate the staging dataset if one is used + and the configuration requests truncation. + + Args: + schema (Schema): Schema to use for the staging dataset. + job_client (JobClientBase): + Job client to use for the staging dataset. + """ + if not ( + isinstance(job_client, WithStagingDataset) and self.config.truncate_staging_dataset + ): + return + + data_tables = schema.data_table_names() + tables = _extend_tables_with_table_chain( + schema, data_tables, data_tables, job_client.should_load_data_to_staging_dataset + ) + + try: + with self.get_destination_client(schema) as client: + with client.with_staging_dataset(): # type: ignore + client.initialize_storage(truncate_tables=tables) + + except Exception as exc: + logger.warn( + f"Staging dataset truncate failed due to the following error: {exc}" + " However, it didn't affect the data integrity." 
+ ) + def get_step_info( self, pipeline: SupportsPipeline, diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index a2ea1936a9..53770f332d 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -554,6 +554,7 @@ def load( with signals.delayed_signals(): runner.run_pool(load_step.config, load_step) info: LoadInfo = self._get_step_info(load_step) + self.first_run = False return info except Exception as l_ex: diff --git a/dlt/sources/helpers/requests/retry.py b/dlt/sources/helpers/requests/retry.py index c9a813598f..3f9d7d559e 100644 --- a/dlt/sources/helpers/requests/retry.py +++ b/dlt/sources/helpers/requests/retry.py @@ -239,7 +239,7 @@ def _make_session(self) -> Session: session.mount("http://", self._adapter) session.mount("https://", self._adapter) retry = _make_retry(**self._retry_kwargs) - session.request = retry.wraps(session.request) # type: ignore[method-assign] + session.send = retry.wraps(session.send) # type: ignore[method-assign] return session @property diff --git a/dlt/sources/helpers/rest_client/auth.py b/dlt/sources/helpers/rest_client/auth.py index 37c0de3db1..29e6d8c77a 100644 --- a/dlt/sources/helpers/rest_client/auth.py +++ b/dlt/sources/helpers/rest_client/auth.py @@ -1,4 +1,5 @@ from base64 import b64encode +import dataclasses import math from typing import ( List, @@ -12,12 +13,13 @@ Iterable, TYPE_CHECKING, ) +from typing_extensions import Annotated from requests.auth import AuthBase -from requests import PreparedRequest # noqa: I251 +from requests import PreparedRequest, Session as BaseSession # noqa: I251 from dlt.common import logger from dlt.common.exceptions import MissingDependencyException -from dlt.common.configuration.specs.base_configuration import configspec +from dlt.common.configuration.specs.base_configuration import configspec, NotResolved from dlt.common.configuration.specs import CredentialsConfiguration from dlt.common.configuration.specs.exceptions import NativeValueError from dlt.common.pendulum import pendulum @@ -38,7 +40,11 @@ class AuthConfigBase(AuthBase, CredentialsConfiguration): configurable via env variables or toml files """ - pass + def __bool__(self) -> bool: + # This is needed to prevent AuthConfigBase-derived classes + # which do not implement the CredentialsConfiguration interface + # from being evaluated as False in requests.sessions.Session.prepare_request() + return True @configspec @@ -142,7 +148,9 @@ def __call__(self, request: PreparedRequest) -> PreparedRequest: class OAuthJWTAuth(BearerTokenAuth): """This is a form of Bearer auth, actually there's not standard way to declare it in openAPI""" - format: Final[Literal["JWT"]] = "JWT" # noqa: A003 + format: Final[Literal["JWT"]] = dataclasses.field( # noqa: A003 + default="JWT", init=False, repr=False, compare=False + ) client_id: str = None private_key: TSecretStrValue = None auth_endpoint: str = None @@ -150,11 +158,15 @@ class OAuthJWTAuth(BearerTokenAuth): headers: Optional[Dict[str, str]] = None private_key_passphrase: Optional[TSecretStrValue] = None default_token_expiration: int = 3600 + session: Annotated[BaseSession, NotResolved()] = None def __post_init__(self) -> None: self.scopes = self.scopes if isinstance(self.scopes, str) else " ".join(self.scopes) self.token = None self.token_expiry: Optional[pendulum.DateTime] = None + # use the default system session if not specified + if self.session is None: + self.session = requests.client.session def __call__(self, r: PreparedRequest) -> PreparedRequest: if self.token is None or self.is_token_expired(): @@ 
-179,7 +191,7 @@ def obtain_token(self) -> None: logger.debug(f"Obtaining token from {self.auth_endpoint}") - response = requests.post(self.auth_endpoint, headers=self.headers, data=data) + response = self.session.post(self.auth_endpoint, headers=self.headers, data=data) response.raise_for_status() token_response = response.json() diff --git a/dlt/sources/helpers/rest_client/client.py b/dlt/sources/helpers/rest_client/client.py index b4b62fa849..dc7304f159 100644 --- a/dlt/sources/helpers/rest_client/client.py +++ b/dlt/sources/helpers/rest_client/client.py @@ -6,12 +6,14 @@ Any, TypeVar, Iterable, + Union, cast, ) import copy from urllib.parse import urlparse from requests import Session as BaseSession # noqa: I251 from requests import Response, Request +from requests.auth import AuthBase from dlt.common import jsonpath, logger @@ -41,7 +43,7 @@ def __init__( request: Request, response: Response, paginator: BasePaginator, - auth: AuthConfigBase, + auth: AuthBase, ): super().__init__(__iterable) self.request = request @@ -57,7 +59,7 @@ class RESTClient: Args: base_url (str): The base URL of the API to make requests to. headers (Optional[Dict[str, str]]): Default headers to include in all requests. - auth (Optional[AuthConfigBase]): Authentication configuration for all requests. + auth (Optional[AuthBase]): Authentication configuration for all requests. paginator (Optional[BasePaginator]): Default paginator for handling paginated responses. data_selector (Optional[jsonpath.TJsonPath]): JSONPath selector for extracting data from responses. session (BaseSession): HTTP session for making requests. @@ -69,7 +71,7 @@ def __init__( self, base_url: str, headers: Optional[Dict[str, str]] = None, - auth: Optional[AuthConfigBase] = None, + auth: Optional[AuthBase] = None, paginator: Optional[BasePaginator] = None, data_selector: Optional[jsonpath.TJsonPath] = None, session: BaseSession = None, @@ -80,8 +82,9 @@ def __init__( self.auth = auth if session: - self._validate_session_raise_for_status(session) - self.session = session + # dlt.sources.helpers.requests.session.Session + # has raise_for_status=True by default + self.session = _warn_if_raise_for_status_and_return(session) else: self.session = Client(raise_for_status=False).session @@ -90,22 +93,13 @@ def __init__( self.data_selector = data_selector - def _validate_session_raise_for_status(self, session: BaseSession) -> None: - # dlt.sources.helpers.requests.session.Session - # has raise_for_status=True by default - if getattr(self.session, "raise_for_status", False): - logger.warning( - "The session provided has raise_for_status enabled. " - "This may cause unexpected behavior." - ) - def _create_request( self, path: str, method: HTTPMethod, params: Dict[str, Any], json: Optional[Dict[str, Any]] = None, - auth: Optional[AuthConfigBase] = None, + auth: Optional[AuthBase] = None, hooks: Optional[Hooks] = None, ) -> Request: parsed_url = urlparse(path) @@ -154,7 +148,7 @@ def paginate( method: HTTPMethodBasic = "GET", params: Optional[Dict[str, Any]] = None, json: Optional[Dict[str, Any]] = None, - auth: Optional[AuthConfigBase] = None, + auth: Optional[AuthBase] = None, paginator: Optional[BasePaginator] = None, data_selector: Optional[jsonpath.TJsonPath] = None, hooks: Optional[Hooks] = None, @@ -166,7 +160,7 @@ def paginate( method (HTTPMethodBasic): HTTP method for the request, defaults to 'get'. params (Optional[Dict[str, Any]]): URL parameters for the request. json (Optional[Dict[str, Any]]): JSON payload for the request. 
- auth (Optional[AuthConfigBase]): Authentication configuration for the request. + auth (Optional[AuthBase]): Authentication configuration for the request. paginator (Optional[BasePaginator]): Paginator instance for handling pagination logic. data_selector (Optional[jsonpath.TJsonPath]): JSONPath selector for @@ -296,3 +290,12 @@ def detect_paginator(self, response: Response, data: Any) -> BasePaginator: " instance of the paginator as some settings may not be guessed correctly." ) return paginator + + +def _warn_if_raise_for_status_and_return(session: BaseSession) -> BaseSession: + """A generic function to warn if the session has raise_for_status enabled.""" + if getattr(session, "raise_for_status", False): + logger.warning( + "The session provided has raise_for_status enabled. This may cause unexpected behavior." + ) + return session diff --git a/dlt/sources/helpers/rest_client/detector.py b/dlt/sources/helpers/rest_client/detector.py index 857f6bbb4e..19a1e83a82 100644 --- a/dlt/sources/helpers/rest_client/detector.py +++ b/dlt/sources/helpers/rest_client/detector.py @@ -1,5 +1,6 @@ import re -from typing import List, Dict, Any, Tuple, Union, Optional, Callable, Iterable +from pathlib import PurePosixPath +from typing import List, Dict, Any, Tuple, Union, Callable, Iterable from urllib.parse import urlparse from requests import Response @@ -25,6 +26,7 @@ "payload", "content", "objects", + "values", ] ) @@ -46,7 +48,10 @@ def single_entity_path(path: str) -> bool: """Checks if path ends with path param indicating that single object is returned""" - return re.search(r"\{([a-zA-Z_][a-zA-Z0-9_]*)\}/?$", path) is not None + # get last path segment + name = PurePosixPath(path).name + # alphabet for a name taken from https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.3.md#fixed-fields-6 + return re.search(r"\{([a-zA-Z0-9\.\-_]+)\}", name) is not None def matches_any_pattern(key: str, patterns: Iterable[str]) -> bool: diff --git a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py index 125938ace5..380912a9a7 100644 --- a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py +++ b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py @@ -8,10 +8,10 @@ In this example, you'll find a Python script that demonstrates how to load to BigQuery with the custom destination. We'll learn how to: -- Use [built-in credentials](../general-usage/credentials/config_specs#gcp-credentials) -- Use the [custom destination](../dlt-ecosystem/destinations/destination.md) -- Use pyarrow tables to create complex column types on BigQuery -- Use BigQuery `autodetect=True` for schema inference from parquet files +- Use [built-in credentials.](../general-usage/credentials/config_specs#gcp-credentials) +- Use the [custom destination.](../dlt-ecosystem/destinations/destination.md) +- Use pyarrow tables to create complex column types on BigQuery. +- Use BigQuery `autodetect=True` for schema inference from parquet files. 
""" @@ -60,7 +60,9 @@ def resource(url: str): # dlt bigquery custom destination # we can use the dlt provided credentials class # to retrieve the gcp credentials from the secrets -@dlt.destination(name="bigquery", loader_file_format="parquet", batch_size=0, naming_convention="snake_case") +@dlt.destination( + name="bigquery", loader_file_format="parquet", batch_size=0, naming_convention="snake_case" +) def bigquery_insert( items, table=BIGQUERY_TABLE_ID, credentials: GcpServiceAccountCredentials = dlt.secrets.value ) -> None: diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md index 76491578fe..93291bfe9a 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/athena.md +++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md @@ -11,7 +11,7 @@ The Athena destination stores data as Parquet files in S3 buckets and creates [e ## Install dlt with Athena **To install the dlt library with Athena dependencies:** ```sh -pip install dlt[athena] +pip install "dlt[athena]" ``` ## Setup Guide @@ -30,7 +30,7 @@ First, install dependencies by running: ```sh pip install -r requirements.txt ``` -or with `pip install dlt[athena]`, which will install `s3fs`, `pyarrow`, `pyathena`, and `botocore` packages. +or with `pip install "dlt[athena]"`, which will install `s3fs`, `pyarrow`, `pyathena`, and `botocore` packages. :::caution @@ -161,5 +161,62 @@ aws_data_catalog="awsdatacatalog" You can choose the following file formats: * [parquet](../file-formats/parquet.md) is used by default + +## Athena adapter + +You can use the `athena_adapter` to add partitioning to Athena tables. This is currently only supported for Iceberg tables. + +Iceberg tables support a few transformation functions for partitioning. Info on all supported functions in the [AWS documentation](https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg-creating-tables.html#querying-iceberg-creating-tables-query-editor). + +Use the `athena_partition` helper to generate the partitioning hints for these functions: + +* `athena_partition.year(column_name: str)`: Partition by year of date/datetime column. +* `athena_partition.month(column_name: str)`: Partition by month of date/datetime column. +* `athena_partition.day(column_name: str)`: Partition by day of date/datetime column. +* `athena_partition.hour(column_name: str)`: Partition by hour of date/datetime column. 
+* `athena_partition.bucket(n: int, column_name: str)`: Partition by hashed value to `n` buckets. +* `athena_partition.truncate(length: int, column_name: str)`: Partition by truncated value to `length` (or width for numbers). + +Here is an example of how to use the adapter to partition a table: + +```py +from datetime import date + +import dlt +from dlt.destinations.impl.athena.athena_adapter import athena_partition, athena_adapter + +data_items = [ + (1, "A", date(2021, 1, 1)), + (2, "A", date(2021, 1, 2)), + (3, "A", date(2021, 1, 3)), + (4, "A", date(2021, 2, 1)), + (5, "A", date(2021, 2, 2)), + (6, "B", date(2021, 1, 1)), + (7, "B", date(2021, 1, 2)), + (8, "B", date(2021, 1, 3)), + (9, "B", date(2021, 2, 1)), + (10, "B", date(2021, 3, 2)), +] + +@dlt.resource(table_format="iceberg") +def partitioned_data(): + yield [{"id": i, "category": c, "created_at": d} for i, c, d in data_items] + + +# Add partitioning hints to the table +athena_adapter( + partitioned_data, + partition=[ + # Partition per category and month + "category", + athena_partition.month("created_at"), + ], +) + + +pipeline = dlt.pipeline("athena_example") +pipeline.run(partitioned_data) +``` + diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md index 54d5abae6d..4f99901e37 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md +++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md @@ -11,7 +11,7 @@ keywords: [bigquery, destination, data warehouse] **To install the dlt library with BigQuery dependencies:** ```sh -pip install dlt[bigquery] +pip install "dlt[bigquery]" ``` ## Setup Guide diff --git a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md index ea187e54eb..58551751c5 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md @@ -11,7 +11,7 @@ keywords: [ clickhouse, destination, data warehouse ] **To install the DLT library with ClickHouse dependencies:** ```sh -pip install dlt[clickhouse] +pip install "dlt[clickhouse]" ``` ## Setup Guide @@ -33,7 +33,7 @@ requirements file by executing it as follows: pip install -r requirements.txt ``` -or with `pip install dlt[clickhouse]`, which installs the `dlt` library and the necessary dependencies for working with ClickHouse as a destination. +or with `pip install "dlt[clickhouse]"`, which installs the `dlt` library and the necessary dependencies for working with ClickHouse as a destination. ### 2. 
Setup ClickHouse database diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md index b601809935..6cd5767dcb 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md +++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md @@ -12,7 +12,7 @@ keywords: [Databricks, destination, data warehouse] ## Install dlt with Databricks **To install the dlt library with Databricks dependencies:** ```sh -pip install dlt[databricks] +pip install "dlt[databricks]" ``` ## Set up your Databricks workspace diff --git a/docs/website/docs/dlt-ecosystem/destinations/dremio.md b/docs/website/docs/dlt-ecosystem/destinations/dremio.md index 0be01e8e32..546f470938 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/dremio.md +++ b/docs/website/docs/dlt-ecosystem/destinations/dremio.md @@ -9,7 +9,7 @@ keywords: [dremio, iceberg, aws, glue catalog] ## Install dlt with Dremio **To install the dlt library with Dremio and s3 dependencies:** ```sh -pip install dlt[dremio,s3] +pip install "dlt[dremio,s3]" ``` ## Setup Guide @@ -28,7 +28,7 @@ First install dependencies by running: ```sh pip install -r requirements.txt ``` -or with `pip install dlt[dremio,s3]` which will install `s3fs`, `pyarrow`, and `botocore` packages. +or with `pip install "dlt[dremio,s3]"` which will install `s3fs`, `pyarrow`, and `botocore` packages. To edit the `dlt` credentials file with your secret info, open `.dlt/secrets.toml`. You will need to provide a `bucket_url` which holds the uploaded parquet files. diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index 22c5fd1df9..c2f6786f8d 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -9,7 +9,7 @@ keywords: [duckdb, destination, data warehouse] ## Install dlt with DuckDB **To install the dlt library with DuckDB dependencies, run:** ```sh -pip install dlt[duckdb] +pip install "dlt[duckdb]" ``` ## Setup Guide diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index 0d719b4cfa..9c7d961d3a 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -6,7 +6,7 @@ The Filesystem destination stores data in remote file systems and bucket storage ## Install dlt with filesystem **To install the dlt library with filesystem dependencies:** ```sh -pip install dlt[filesystem] +pip install "dlt[filesystem]" ``` This installs `s3fs` and `botocore` packages. @@ -21,9 +21,7 @@ pip install s3fs so pip does not fail on backtracking. ::: -## Setup Guide - -### 1. Initialise the dlt project +## Initialise the dlt project Let's start by initializing a new dlt project as follows: ```sh @@ -33,9 +31,9 @@ Let's start by initializing a new dlt project as follows: This command will initialize your pipeline with chess as the source and the AWS S3 filesystem as the destination. ::: -### 2. Set up bucket storage and credentials +## Set up bucket storage and credentials -#### AWS S3 +### AWS S3 The command above creates a sample `secrets.toml` and requirements file for AWS S3 bucket. You can install those dependencies by running: ```sh pip install -r requirements.txt @@ -100,7 +98,7 @@ You need to create an S3 bucket and a user who can access that bucket. `dlt` doe 5. 
To grab the access and secret key for the user. Go to IAM > Users and in the “Security Credentials”, click on “Create Access Key”, and preferably select “Command Line Interface” and create the access key. 6. Grab the “Access Key” and “Secret Access Key” created that are to be used in "secrets.toml". -##### Using S3 compatible storage +#### Using S3 compatible storage To use an S3 compatible storage other than AWS S3 like [MinIO](https://min.io/) or [Cloudflare R2](https://www.cloudflare.com/en-ca/developer-platform/r2/), you may supply an `endpoint_url` in the config. This should be set along with AWS credentials: @@ -114,7 +112,7 @@ aws_secret_access_key = "please set me up!" # copy the secret access key here endpoint_url = "https://.r2.cloudflarestorage.com" # copy your endpoint URL here ``` -##### Adding Additional Configuration +#### Adding Additional Configuration To pass any additional arguments to `fsspec`, you may supply `kwargs` and `client_kwargs` in the config as a **stringified dictionary**: @@ -124,8 +122,8 @@ kwargs = '{"use_ssl": true, "auto_mkdir": true}' client_kwargs = '{"verify": "public.crt"}' ``` -#### Google Storage -Run `pip install dlt[gs]` which will install the `gcfs` package. +### Google Storage +Run `pip install "dlt[gs]"` which will install the `gcfs` package. To edit the `dlt` credentials file with your secret info, open `.dlt/secrets.toml`. You'll see AWS credentials by default. @@ -147,10 +145,16 @@ if you have default google cloud credentials in your environment (i.e. on cloud Use **Cloud Storage** admin to create a new bucket. Then assign the **Storage Object Admin** role to your service account. -#### Azure Blob Storage -Run `pip install dlt[az]` which will install the `adlfs` package to interface with Azure Blob Storage. +### Azure Blob Storage +Run `pip install "dlt[az]"` which will install the `adlfs` package to interface with Azure Blob Storage. + +Edit the credentials in `.dlt/secrets.toml`. You'll see AWS credentials by default; replace them with your Azure credentials. + +Two forms of Azure credentials are supported: + +#### SAS token credentials -Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default replace them with your Azure credentials: +Supply the storage account name and either a SAS token or a storage account key: ```toml [destination.filesystem] @@ -168,7 +172,21 @@ If you have the correct Azure credentials set up on your machine (e.g. via azure you can omit both `azure_storage_account_key` and `azure_storage_sas_token` and `dlt` will fall back to the available default. Note that `azure_storage_account_name` is still required as it can't be inferred from the environment. -#### Local file system +#### Service principal credentials + +Supply a client ID, client secret, and tenant ID for a service principal authorized to access your container: + +```toml +[destination.filesystem] +bucket_url = "az://[your_container name]" # replace with your container name + +[destination.filesystem.credentials] +azure_client_id = "client_id" # please set me up! +azure_client_secret = "client_secret" +azure_tenant_id = "tenant_id" # please set me up! 
+``` + +### Local file system + If for any reason you want to have those files in a local folder, set up the `bucket_url` as follows (you are free to use `config.toml` for that as there are no secrets required) ```toml @@ -176,10 +194,24 @@ If for any reason you want to have those files in a local folder, set up the `bu bucket_url = "file:///absolute/path" # three / for an absolute path ``` -`dlt` correctly handles the native local file paths. Indeed, using the `file://` schema may be not intuitive especially for Windows users. +:::tip +For handling deeply nested layouts, consider enabling automatic directory creation for the local filesystem destination. This can be done by setting `kwargs` in `secrets.toml`: ```toml [destination.filesystem] +kwargs = '{"auto_mkdir": true}' +``` + +Or by setting an environment variable: +```sh +export DESTINATION__FILESYSTEM__KWARGS='{"auto_mkdir": true}' +``` +::: + +`dlt` correctly handles the native local file paths. Indeed, using the `file://` schema may not be intuitive, especially for Windows users. + +```toml +[destination.unc_destination] bucket_url = 'C:\a\b\c' ``` @@ -359,18 +391,17 @@ Please note: The filesystem destination configuration supports advanced layout customization and the inclusion of additional placeholders. This can be done through `config.toml` or programmatically when initializing via a factory method. -:::tip -For handling deeply nested layouts, consider enabling automatic directory creation for the local filesystem destination. This can be done by setting `kwargs = '{"auto_mkdir": true}'` to facilitate the creation of directories automatically. -::: - #### Configuration via `config.toml` To configure the layout and placeholders using `config.toml`, use the following format: ```toml +[destination.filesystem] layout = "{table_name}/{test_placeholder}/{YYYY}-{MM}-{DD}/{ddd}/{mm}/{load_id}.{file_id}.{ext}" extra_placeholders = { "test_placeholder" = "test_value" } current_datetime="2024-04-14T00:00:00" +# for automatic directory creation in the local filesystem +kwargs = '{"auto_mkdir": true}' ``` :::note diff --git a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md index b053d29ac1..9d8c8d260b 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md +++ b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md @@ -10,7 +10,7 @@ keywords: [MotherDuck, duckdb, destination, data warehouse] ## Install dlt with MotherDuck **To install the dlt library with MotherDuck dependencies:** ```sh -pip install dlt[motherduck] +pip install "dlt[motherduck]" ``` :::tip diff --git a/docs/website/docs/dlt-ecosystem/destinations/mssql.md b/docs/website/docs/dlt-ecosystem/destinations/mssql.md index 6554d24bf7..4a6f3d61df 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/mssql.md +++ b/docs/website/docs/dlt-ecosystem/destinations/mssql.md @@ -9,7 +9,7 @@ keywords: [mssql, sqlserver, destination, data warehouse] ## Install dlt with MS SQL **To install the dlt library with MS SQL dependencies, use:** ```sh -pip install dlt[mssql] +pip install "dlt[mssql]" ``` ## Setup guide @@ -38,7 +38,7 @@ pip install -r requirements.txt ``` or run: ```sh -pip install dlt[mssql] +pip install "dlt[mssql]" ``` This will install `dlt` with the `mssql` extra, which contains all the dependencies required by the SQL server client. 
diff --git a/docs/website/docs/dlt-ecosystem/destinations/postgres.md b/docs/website/docs/dlt-ecosystem/destinations/postgres.md index 5126272e37..ae504728c3 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/postgres.md +++ b/docs/website/docs/dlt-ecosystem/destinations/postgres.md @@ -9,7 +9,7 @@ keywords: [postgres, destination, data warehouse] ## Install dlt with PostgreSQL **To install the dlt library with PostgreSQL dependencies, run:** ```sh -pip install dlt[postgres] +pip install "dlt[postgres]" ``` ## Setup Guide diff --git a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md index 1b560ad6fe..9f19007227 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md +++ b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md @@ -14,7 +14,7 @@ This destination helps you load data into Qdrant from [dlt resources](../../gene 1. To use Qdrant as a destination, make sure `dlt` is installed with the `qdrant` extra: ```sh -pip install dlt[qdrant] +pip install "dlt[qdrant]" ``` 2. Next, configure the destination in the dlt secrets file. The file is located at `~/.dlt/secrets.toml` by default. Add the following section to the secrets file: diff --git a/docs/website/docs/dlt-ecosystem/destinations/redshift.md b/docs/website/docs/dlt-ecosystem/destinations/redshift.md index 349698d201..7e0679ec6b 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/redshift.md +++ b/docs/website/docs/dlt-ecosystem/destinations/redshift.md @@ -9,7 +9,7 @@ keywords: [redshift, destination, data warehouse] ## Install dlt with Redshift **To install the dlt library with Redshift dependencies:** ```sh -pip install dlt[redshift] +pip install "dlt[redshift]" ``` ## Setup Guide @@ -26,7 +26,7 @@ The above command generates several files and directories, including `.dlt/secre ```sh pip install -r requirements.txt ``` -or with `pip install dlt[redshift]`, which installs the `dlt` library and the necessary dependencies for working with Amazon Redshift as a destination. +or with `pip install "dlt[redshift]"`, which installs the `dlt` library and the necessary dependencies for working with Amazon Redshift as a destination. ### 2. Setup Redshift cluster To load data into Redshift, you need to create a Redshift cluster and enable access to your IP address through the VPC inbound rules associated with the cluster. 
While we recommend asking our GPT-4 assistant for details, we have provided a general outline of the process below: diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index f144da02e6..deaaff3562 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -9,7 +9,7 @@ keywords: [Snowflake, destination, data warehouse] ## Install `dlt` with Snowflake **To install the `dlt` library with Snowflake dependencies, run:** ```sh -pip install dlt[snowflake] +pip install "dlt[snowflake]" ``` ## Setup Guide diff --git a/docs/website/docs/dlt-ecosystem/destinations/synapse.md b/docs/website/docs/dlt-ecosystem/destinations/synapse.md index f1c43b4d54..2e936f193e 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/synapse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/synapse.md @@ -9,7 +9,7 @@ keywords: [synapse, destination, data warehouse] ## Install dlt with Synapse **To install the dlt library with Synapse dependencies:** ```sh -pip install dlt[synapse] +pip install "dlt[synapse]" ``` ## Setup guide diff --git a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md index 1272b16c86..11d1276ceb 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md +++ b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md @@ -14,7 +14,7 @@ This destination helps you load data into Weaviate from [dlt resources](../../ge 1. To use Weaviate as a destination, make sure dlt is installed with the 'weaviate' extra: ```sh -pip install dlt[weaviate] +pip install "dlt[weaviate]" ``` 2. Next, configure the destination in the dlt secrets file. The file is located at `~/.dlt/secrets.toml` by default. Add the following section to the secrets file: diff --git a/docs/website/docs/dlt-ecosystem/file-formats/parquet.md b/docs/website/docs/dlt-ecosystem/file-formats/parquet.md index 8944b7d5fa..414eaf2cb8 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/parquet.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/parquet.md @@ -11,7 +11,7 @@ keywords: [parquet, file formats] To use this format, you need a `pyarrow` package. You can get this package as a `dlt` extra as well: ```sh -pip install dlt[parquet] +pip install "dlt[parquet]" ``` ## Supported Destinations diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/openapi-generator.md b/docs/website/docs/dlt-ecosystem/verified-sources/openapi-generator.md new file mode 100644 index 0000000000..a987a55b15 --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/verified-sources/openapi-generator.md @@ -0,0 +1,210 @@ +--- +title: OpenAPI source generator +description: OpenAPI dlt source generator +keywords: [openapi, rest api, swagger, source generator, cli, rest] +--- +import Header from './_source-info-header.md'; + +# OpenAPI source generator + +
+ +Our OpenAPI source generator - `dlt-init-openapi` - generates [`dlt`](https://dlthub.com/docs) data pipelines from [OpenAPI 3.x specs](https://swagger.io/specification/) using the [rest_api verified source](./rest_api) to extract data from any REST API. If you are not familiar with the `rest_api` source, please read [rest_api](./rest_api) to learn how our `rest_api` source works. + +:::tip +We also have a cool [Google Colab example](https://colab.research.google.com/drive/1MRZvguOTZj1MlkEGzjiso8lQ_wr1MJRI?usp=sharing#scrollTo=LHGxzf1Ev_yr) that demonstrates this generator. 😎 +::: + +## Features +`dlt-init-openapi` generates code from an OpenAPI spec that you can use to extract data from a `rest_api` into any [`destination`](../destinations/) (e.g., Postgres, BigQuery, Redshift...) that `dlt` supports. dlt-init-openapi additionally executes a set of heuristics to discover information not explicitly defined in OpenAPI specs. + +Features include: + +* **[Pagination](./rest_api#pagination) discovery** for each endpoint. +* **Primary key discovery** for each entity. +* **Endpoint relationship mapping** into `dlt` [`transformers`](../../general-usage/resource#process-resources-with-dlttransformer) (e.g., /users/ -> /user/{id}). +* **Payload JSON path [data selector](./rest_api#data-selection) discovery** for results nested in the returned JSON. +* **[Authentication](./rest_api#authentication)** discovery for an API. + +## A quick example + +You will need Python 3.9 or higher installed, as well as pip. You can run `pip install dlt-init-openapi` to install the current version. + +We will create a simple example pipeline from a [PokeAPI spec](https://pokeapi.co/) in our repo. You can point to any other OpenAPI Spec instead if you prefer. + + +1. Run the generator with a URL: + ```sh + dlt-init-openapi pokemon --url https://raw.githubusercontent.com/dlt-hub/dlt-init-openapi/devel/tests/cases/e2e_specs/pokeapi.yml --global-limit 2 + ``` + +2. Alternatively, if you have a local file, you can use the --path flag: + ```sh + dlt-init-openapi pokemon --path ./my_specs/pokeapi.yml + ``` + +3. You can now pick both of the endpoints from the popup. + +4. After selecting your Pokemon endpoints and hitting Enter, your pipeline will be rendered. + +5. If you have any kind of authentication on your pipeline (this example does not), open the `.dlt/secrets.toml` and provide the credentials. You can find further settings in the `.dlt/config.toml`. + +6. Go to the created pipeline folder and run your pipeline. + ```sh + cd pokemon-pipeline + PROGRESS=enlighten python pipeline.py # we use enlighten for a nice progress bar :) + ``` + +7. Print the pipeline info to the console to see what got loaded. + ```sh + dlt pipeline pokemon_pipeline info + ``` + +8. You can now also install Streamlit to see a preview of the data; you should have loaded 40 Pokemons and their details. + ```sh + pip install pandas streamlit + dlt pipeline pokemon_pipeline show + ``` + +9. You can go to our docs at https://dlthub.com/docs to learn how to modify the generated pipeline to load to many destinations, place schema contracts on your pipeline, and many other things. + +:::note +We used the `--global-limit 2` CLI flag to limit the requests to the PokeAPI +for this example. This way, the Pokemon collection endpoint only gets queried +twice, resulting in 2 x 20 Pokemon details being rendered. +::: + +## What will be created? 
+ +When you run the `dlt-init-openapi` command above, the following files will be generated: + +```text +pokemon_pipeline/ +├── .dlt/ +│ ├── config.toml # dlt config, learn more at dlthub.com/docs +│ └── secrets.toml # your secrets, only needed for APIs with auth +├── pokemon/ +│ └── __init__.py # your rest_api dictionary, learn more below +├── rest_api/ +│ └── ... # rest_api copied from our verified sources repo +├── .gitignore +├── pokemon_pipeline.py # your pipeline file that you can execute +├── README.md # a list of your endpoints with some additional info +└── requirements.txt # the pip requirements for your pipeline +``` + +:::warning +If you re-generate your pipeline, you will be prompted to continue if this folder exists. If you select yes, all generated files will be overwritten. All other files you may have created will remain in this folder. In non-interactive mode you will not be asked, and the generated files will be overwritten. +::: + +## A closer look at your `rest_api` dictionary in `pokemon/__init__.py` + +This file contains the [configuration dictionary](./rest_api#source-configuration) for the rest_api source which is the main result of running this generator. For our Pokemon example, we have used an OpenAPI 3 spec that works out of the box. The result of this dictionary depends on the quality of the spec you are using, whether the API you are querying actually adheres to this spec, and whether our heuristics manage to find the right values. + +The generated dictionary will look something like this: + +```py +{ + "client": { + "base_url": base_url, + # -> the detected common paginator + "paginator": { + ... + }, + }, + # -> your two endpoints + "resources": [ + { + # -> A primary key could not be inferred from + # the spec; usual suspects such as id, pokemon_id, etc. + # are not defined. You can add one if you know. + "name": "pokemon_list", + "table_name": "pokemon", + "endpoint": { + # -> the results seem to be nested in { results: [...] } + "data_selector": "results", + "path": "/api/v2/pokemon/", + }, + }, + { + "name": "pokemon_read", + "table_name": "pokemon", + # -> A primary key *name* is assumed, as it is found in the + # url. + "primary_key": "name", + "write_disposition": "merge", + "endpoint": { + "data_selector": "$", + "path": "/api/v2/pokemon/{name}/", + "params": { + # -> your detected transformer settings + # this is a child endpoint of the pokemon_list + "name": { + "type": "resolve", + "resource": "pokemon_list", + "field": "name", + }, + }, + }, + }, + ], +} +``` + +:::info +You can edit this file to adapt the behavior of the dlt rest_api accordingly. Please read our [dlt rest_api](./rest_api) docs to learn how to configure the rest_api source and check out our detailed [Google Colab example](https://colab.research.google.com/drive/1MRZvguOTZj1MlkEGzjiso8lQ_wr1MJRI?usp=sharing#scrollTo=LHGxzf1Ev_yr). +::: + +## CLI command + +```sh +dlt-init-openapi [OPTIONS] +``` + +### Example: +```sh +dlt-init-openapi pokemon --path ./path/to/my_spec.yml --no-interactive --output-path ./my_pipeline +``` + +**Options**: + +_The only required options are either to supply a path or a URL to a spec_ + +- `--url URL`: A URL to read the OpenAPI JSON or YAML file from. +- `--path PATH`: A path to read the OpenAPI JSON or YAML file from locally. +- `--output-path PATH`: A path to render the output to. +- `--config PATH`: Path to the config file to use (see below). +- `--no-interactive`: Skip endpoint selection and render all paths of the OpenAPI spec. 
+- `--log-level`: Set the logging level for stdout output, defaults to 20 (INFO). +- `--global-limit`: Set a global limit on the generated source. +- `--update-rest-api-source`: Update the locally cached rest_api verified source. +- `--allow-openapi-2`: Allows the use of OpenAPI v2. specs. Migration of the spec to 3.0 is recommended for better results though. +- `--version`: Show the installed version of the generator and exit. +- `--help`: Show this message and exit. + +## Config options +You can pass a path to a config file with the `--config PATH` argument. To see available config values, go to https://github.com/dlt-hub/dlt-init-openapi/blob/devel/dlt_init_openapi/config.py and read the information below each field on the `Config` class. + +The config file can be supplied as JSON or YAML dictionary. For example, to change the package name, you can create a YAML file: + +```yaml +# config.yml +package_name: "other_package_name" +``` + +And use it with the config argument: + +```sh +$ dlt-init-openapi pokemon --url ... --config config.yml +``` + +## Telemetry +We track your usage of this tool similar to how we track other commands in the dlt core library. Read more about this and how to disable it [here](../../reference/telemetry). + +## Prior work +This project started as a fork of [openapi-python-client](https://github.com/openapi-generators/openapi-python-client). Pretty much all parts are heavily changed or completely replaced, but some lines of code still exist, and we like to acknowledge the many good ideas we got from the original project :) + +## Implementation notes +* OAuth Authentication currently is not natively supported. You can supply your own. +* Per endpoint authentication currently is not supported by the generator. Only the first globally set securityScheme will be applied. You can add your own per endpoint if you need to. +* Basic OpenAPI 2.0 support is implemented. We recommend updating your specs at https://editor.swagger.io before using `dlt-init-openapi`. \ No newline at end of file diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 0022850987..98725627b9 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -282,7 +282,7 @@ The fields in the endpoint configuration are: - `json`: The JSON payload to be sent with the request (for POST and PUT requests). - `paginator`: Pagination configuration for the endpoint. See the [pagination](#pagination) section for more details. - `data_selector`: A JSONPath to select the data from the response. See the [data selection](#data-selection) section for more details. -- `response_actions`: A list of actions that define how to process the response data. +- `response_actions`: A list of actions that define how to process the response data. See the [response actions](#response-actions) section for more details. - `incremental`: Configuration for [incremental loading](#incremental-loading). ### Pagination @@ -414,8 +414,8 @@ Available authentication types: | Authentication class | String Alias (`type`) | Description | | ------------------- | ----------- | ----------- | | [BearTokenAuth](../../general-usage/http/rest-client.md#bearer-token-authentication) | `bearer` | Bearer token authentication. | -| [HTTPBasicAuth](../../general-usage/http/rest-client.md#http-basic-authentication) | `api_key` | Basic HTTP authentication. 
| -| [APIKeyAuth](../../general-usage/http/rest-client.md#api-key-authentication) | `http_basic` | API key authentication with key defined in the query parameters or in the headers. | +| [HTTPBasicAuth](../../general-usage/http/rest-client.md#http-basic-authentication) | `http_basic` | Basic HTTP authentication. | +| [APIKeyAuth](../../general-usage/http/rest-client.md#api-key-authentication) | `api_key` | API key authentication with key defined in the query parameters or in the headers. | To specify the authentication configuration, use the `auth` field in the [client](#client) configuration: @@ -501,11 +501,13 @@ The syntax for the `resolve` field in parameter configuration is: "": { "type": "resolve", "resource": "", - "field": "", + "field": "", } } ``` +The `field` value can be specified as a [JSONPath](https://github.com/h2non/jsonpath-ng?tab=readme-ov-file#jsonpath-syntax) to select a nested field in the parent resource data. For example: `"field": "items[0].id"`. + Under the hood, dlt handles this by using a [transformer resource](../../general-usage/resource.md#process-resources-with-dlttransformer). #### Include fields from the parent resource @@ -586,3 +588,33 @@ See the [incremental loading](../../general-usage/incremental-loading.md#increme - `root_key` (bool): Enables merging on all resources by propagating root foreign key to child tables. This option is most useful if you plan to change write disposition of a resource to disable/enable merge. Defaults to False. - `schema_contract`: Schema contract settings that will be applied to this resource. - `spec`: A specification of configuration and secret values required by the source. + +### Response actions + +The `response_actions` field in the endpoint configuration allows you to specify how to handle specific responses from the API based on status codes or content substrings. This is useful for handling edge cases like ignoring responses on specific conditions. + +:::caution Experimental Feature +This is an experimental feature and may change in future releases. +::: + +#### Example + +```py +{ + "path": "issues", + "response_actions": [ + {"status_code": 404, "action": "ignore"}, + {"content": "Not found", "action": "ignore"}, + {"status_code": 200, "content": "some text", "action": "ignore"}, + ], +} +``` + +In this example, the source will ignore responses with a status code of 404, responses with the content "Not found", and responses with a status code of 200 _and_ content "some text". + +**Fields:** + +- `status_code` (int, optional): The HTTP status code to match. +- `content` (str, optional): A substring to search for in the response content. +- `action` (str): The action to take when the condition is met. Currently supported actions: + - `ignore`: Ignore the response. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md index 970a891e60..38eda15c94 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md @@ -70,7 +70,7 @@ To get started with your data pipeline, follow these steps: [This command](../../reference/command-line-interface) will initialize [the pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/slack_pipeline.py) - with Google Sheets as the [source](../../general-usage/source) and + with Slack as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) as the [destination](../destinations). 1. 
If you'd like to use a different destination, simply replace `duckdb` with the name of your diff --git a/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md index f8bd179422..82297420ed 100644 --- a/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md @@ -247,7 +247,7 @@ API token. [destination](../../dlt-ecosystem/destinations/), For example, duckdb: ```sh - pip install dlt[duckdb] + pip install "dlt[duckdb]" ``` 1. Run the pipeline with the following command: diff --git a/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md b/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md index ab71d3d1d0..f2cd4a1065 100644 --- a/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md @@ -231,7 +231,7 @@ need to register to use this service neither get an API key. [destination](https://dlthub.com/docs/dlt-ecosystem/destinations/), For example, duckdb: ```sh - pip install dlt[duckdb] + pip install "dlt[duckdb]" ``` 1. Run the pipeline with the following command: diff --git a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md index 3aadb2f982..2448d31a06 100644 --- a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md @@ -284,7 +284,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the [destination](https://dlthub.com/docs/dlt-ecosystem/destinations/), For example, duckdb: ```sh - pip install dlt[duckdb] + pip install "dlt[duckdb]" ``` 1. Run the pipeline with the following command: diff --git a/docs/website/docs/general-usage/destination-tables.md b/docs/website/docs/general-usage/destination-tables.md index 8e1f771e47..4780d4be20 100644 --- a/docs/website/docs/general-usage/destination-tables.md +++ b/docs/website/docs/general-usage/destination-tables.md @@ -74,7 +74,7 @@ pipeline = dlt.pipeline( load_info = pipeline.run(users) ``` -The result will be the same, but the table is implicitly named `users` based on the resource name. +The result will be the same; note that we do not explicitly pass `table_name="users"` to `pipeline.run`, and the table is implicitly named `users` based on the resource name (e.g., `users()` decorated with `@dlt.resource`). :::note @@ -117,9 +117,7 @@ pipeline = dlt.pipeline( load_info = pipeline.run(data, table_name="users") ``` -Running this pipeline will create two tables in the destination, `users` and `users__pets`. The -`users` table will contain the top level data, and the `users__pets` table will contain the child -data. Here is what the tables may look like: +Running this pipeline will create two tables in the destination, `users` and `users__pets`. The `users` table will contain the top-level data, and the `users__pets` table will contain the child data. Here is what the tables may look like: **mydata.users** @@ -141,21 +139,14 @@ creating and linking children and parent tables. This is how it works: -1. 
Each row in all (top level and child) data tables created by `dlt` contains UNIQUE column named - `_dlt_id`. -1. Each child table contains FOREIGN KEY column `_dlt_parent_id` linking to a particular row - (`_dlt_id`) of a parent table. -1. Rows in child tables come from the lists: `dlt` stores the position of each item in the list in - `_dlt_list_idx`. -1. For tables that are loaded with the `merge` write disposition, we add a ROOT KEY column - `_dlt_root_id`, which links child table to a row in top level table. - +1. Each row in all (top level and child) data tables created by `dlt` contains a `UNIQUE` column named `_dlt_id`. +1. Each child table contains a `FOREIGN KEY` column `_dlt_parent_id` linking to a particular row (`_dlt_id`) of a parent table. +1. Rows in child tables come from the lists: `dlt` stores the position of each item in the list in `_dlt_list_idx`. +1. For tables that are loaded with the `merge` write disposition, we add a root key column `_dlt_root_id`, which links the child table to a row in the top-level table. :::note -If you define your own primary key in a child table, it will be used to link to parent table -and the `_dlt_parent_id` and `_dlt_list_idx` will not be added. `_dlt_id` is always added even in -case the primary key or other unique columns are defined. +If you define your own primary key in a child table, it will be used to link to the parent table, and the `_dlt_parent_id` and `_dlt_list_idx` will not be added. `_dlt_id` is always added even if the primary key or other unique columns are defined. ::: @@ -164,17 +155,15 @@ case the primary key or other unique columns are defined. During a pipeline run, dlt [normalizes both table and column names](schema.md#naming-convention) to ensure compatibility with the destination database's accepted format. All names from your source data will be transformed into snake_case and will only include alphanumeric characters. Please be aware that the names in the destination database may differ somewhat from those in your original input. ### Variant columns -If your data has inconsistent types, `dlt` will dispatch the data to several **variant columns**. For example, if you have a resource (ie json file) with a filed with name **answer** and your data contains boolean values, you will get get a column with name **answer** of type **BOOLEAN** in your destination. If for some reason, on next load you get integer value and string value in **answer**, the inconsistent data will go to **answer__v_bigint** and **answer__v_text** columns respectively. -The general naming rule for variant columns is `__v_` where `original_name` is the existing column name (with data type clash) and `type` is the name of data type stored in the variant. - +If your data has inconsistent types, `dlt` will dispatch the data to several **variant columns**. For example, if you have a resource (i.e., JSON file) with a field with name `answer` and your data contains boolean values, you will get a column with name `answer` of type `BOOLEAN` in your destination. If for some reason, on the next load, you get integer and string values in `answer`, the inconsistent data will go to `answer__v_bigint` and `answer__v_text` columns respectively. +The general naming rule for variant columns is `__v_` where `original_name` is the existing column name (with data type clash) and `type` is the name of the data type stored in the variant. ## Load Packages and Load IDs Each execution of the pipeline generates one or more load packages. 
A load package typically contains data retrieved from all the [resources](glossary.md#resource) of a particular [source](glossary.md#source). These packages are uniquely identified by a `load_id`. The `load_id` of a particular package is added to the top data tables -(referenced as `_dlt_load_id` column in the example above) and to the special `_dlt_loads` table with a status 0 -(when the load process is fully completed). +(referenced as `_dlt_load_id` column in the example above) and to the special `_dlt_loads` table with a status of 0 (when the load process is fully completed). To illustrate this, let's load more data into the same destination: @@ -189,8 +178,7 @@ data = [ ``` The rest of the pipeline definition remains the same. Running this pipeline will create a new load -package with a new `load_id` and add the data to the existing tables. The `users` table will now -look like this: +package with a new `load_id` and add the data to the existing tables. The `users` table will now look like this: **mydata.users** @@ -210,12 +198,12 @@ The `_dlt_loads` table will look like this: | **1234563456.12345** | quick_start | 0 | 2023-09-12 16:46:03.10662+00 | aOEb...Qekd/58= | The `_dlt_loads` table tracks complete loads and allows chaining transformations on top of them. -Many destinations do not support distributed and long-running transactions (e.g. Amazon Redshift). +Many destinations do not support distributed and long-running transactions (e.g., Amazon Redshift). In that case, the user may see the partially loaded data. It is possible to filter such data out: any row with a `load_id` that does not exist in `_dlt_loads` is not yet completed. The same procedure may be used to identify and delete data for packages that never got completed. -For each load, you can test and [alert](../running-in-production/alerting.md) on anomalies (e.g. +For each load, you can test and [alert](../running-in-production/alerting.md) on anomalies (e.g., no data, too much loaded to a table). There are also some useful load stats in the `Load info` tab of the [Streamlit app](../dlt-ecosystem/visualizations/exploring-the-data.md#exploring-the-data) mentioned above. @@ -231,8 +219,7 @@ Data lineage can be super relevant for architectures like the [data vault architecture](https://www.data-vault.co.uk/what-is-data-vault/) or when troubleshooting. The data vault architecture is a data warehouse that large organizations use when representing the same process across multiple systems, which adds data lineage requirements. Using the pipeline name -and `load_id` provided out of the box by `dlt`, you are able to identify the source and time of -data. +and `load_id` provided out of the box by `dlt`, you are able to identify the source and time of data. You can [save](../running-in-production/running.md#inspect-and-save-the-load-info-and-trace) complete lineage info for a particular `load_id` including a list of loaded files, error messages @@ -242,11 +229,7 @@ problems. ## Staging dataset So far we've been using the `append` write disposition in our example pipeline. This means that -each time we run the pipeline, the data is appended to the existing tables. When you use [the -merge write disposition](incremental-loading.md), dlt creates a staging database schema for -staging data. This schema is named `_staging` and contains the same tables as the -destination schema. When you run the pipeline, the data from the staging tables is loaded into the -destination tables in a single atomic transaction. 
+each time we run the pipeline, the data is appended to the existing tables. When you use the [merge write disposition](incremental-loading.md), dlt creates a staging database schema for staging data. This schema is named `_staging` and contains the same tables as the destination schema. When you run the pipeline, the data from the staging tables is loaded into the destination tables in a single atomic transaction. Let's illustrate this with an example. We change our pipeline to use the `merge` write disposition: @@ -270,8 +253,7 @@ load_info = pipeline.run(users) ``` Running this pipeline will create a schema in the destination database with the name `mydata_staging`. -If you inspect the tables in this schema, you will find `mydata_staging.users` table identical to the -`mydata.users` table in the previous example. +If you inspect the tables in this schema, you will find the `mydata_staging.users` table identical to the`mydata.users` table in the previous example. Here is what the tables may look like after running the pipeline: @@ -290,8 +272,7 @@ Here is what the tables may look like after running the pipeline: | 2 | Bob 2 | rX8ybgTeEmAmmA | 2345672350.98417 | | 3 | Charlie | h8lehZEvT3fASQ | 1234563456.12345 | -Notice that the `mydata.users` table now contains the data from both the previous pipeline run and -the current one. +Notice that the `mydata.users` table now contains the data from both the previous pipeline run and the current one. ## Versioned datasets @@ -322,4 +303,4 @@ load_info = pipeline.run(data, table_name="users") Every time you run this pipeline, a new schema will be created in the destination database with a datetime-based suffix. The data will be loaded into tables in this schema. For example, the first time you run the pipeline, the schema will be named -`mydata_20230912064403`, the second time it will be named `mydata_20230912064407`, and so on. +`mydata_20230912064403`, the second time it will be named `mydata_20230912064407`, and so on. \ No newline at end of file diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index ca39046d35..1093428b0f 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -385,7 +385,7 @@ class PostBodyPaginator(BasePaginator): # Add the cursor to the request body request.json["cursor"] = self.cursor - + client = RESTClient( base_url="https://api.example.com", paginator=PostBodyPaginator() @@ -407,7 +407,7 @@ The available authentication methods are defined in the `dlt.sources.helpers.res - [APIKeyAuth](#api-key-authentication) - [HttpBasicAuth](#http-basic-authentication) -For specific use cases, you can [implement custom authentication](#implementing-custom-authentication) by subclassing the `AuthConfigBase` class. +For specific use cases, you can [implement custom authentication](#implementing-custom-authentication) by subclassing the `AuthBase` class from the Requests library. 
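Before reaching for a custom class, the built-in helpers listed above cover the common cases. The following is a minimal, hedged sketch of attaching one of them to a client at construction time; it assumes `RESTClient` accepts an `auth` argument in its constructor and that `HttpBasicAuth` takes a username and password, and the base URL and credentials are placeholders.

```py
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.auth import HttpBasicAuth

# Auth configured once on the client is reused by subsequent
# get(), post(), and paginate() calls.
client = RESTClient(
    base_url="https://api.example.com",
    auth=HttpBasicAuth("my_user", "my_password"),  # type: ignore
)

for page in client.paginate("/protected/posts"):
    print(page)
```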
### Bearer token authentication @@ -479,12 +479,12 @@ response = client.get("/protected/resource") ### Implementing custom authentication -You can implement custom authentication by subclassing the `AuthConfigBase` class and implementing the `__call__` method: +You can implement custom authentication by subclassing the `AuthBase` class and implementing the `__call__` method: ```py -from dlt.sources.helpers.rest_client.auth import AuthConfigBase +from requests.auth import AuthBase -class CustomAuth(AuthConfigBase): +class CustomAuth(AuthBase): def __init__(self, token): self.token = token @@ -527,4 +527,70 @@ from dlt.sources.helpers.rest_client import paginate for page in paginate("https://api.example.com/posts"): print(page) -``` \ No newline at end of file +``` + +## Troubleshooting + +### `RESTClient.get()` and `RESTClient.post()` methods + +These methods work similarly to the [get()](https://docs.python-requests.org/en/latest/api/#requests.get) and [post()](https://docs.python-requests.org/en/latest/api/#requests.post) functions +from the Requests library. They return a [Response](https://docs.python-requests.org/en/latest/api/#requests.Response) object that contains the response data. +You can inspect the `Response` object to get the `response.status_code`, `response.headers`, and `response.content`. For example: + +```py +from dlt.sources.helpers.rest_client import RESTClient +from dlt.sources.helpers.rest_client.auth import BearerTokenAuth + +client = RESTClient(base_url="https://api.example.com") +response = client.get("/posts", auth=BearerTokenAuth(token="your_access_token")) # type: ignore + +print(response.status_code) +print(response.headers) +print(response.content) +``` + +### `RESTClient.paginate()` + +Debugging `paginate()` is trickier because it's a generator function that yields [`PageData`](#pagedata) objects. Here's several ways to debug the `paginate()` method: + +1. Enable [logging](../../running-in-production/running.md#set-the-log-level-and-format) to see detailed information about the HTTP requests: + +```sh +RUNTIME__LOG_LEVEL=INFO python my_script.py +``` + +2. Use the [`PageData`](#pagedata) instance to inspect the [request](https://docs.python-requests.org/en/latest/api/#requests.Request) +and [response](https://docs.python-requests.org/en/latest/api/#requests.Response) objects: + +```py +from dlt.sources.helpers.rest_client import RESTClient +from dlt.sources.helpers.rest_client.paginators import JSONResponsePaginator + +client = RESTClient( + base_url="https://api.example.com", + paginator=JSONResponsePaginator(next_url_path="pagination.next") +) + +for page in client.paginate("/posts"): + print(page.request) + print(page.response) +``` + +3. 
Use the `hooks` parameter to add custom response handlers to the `paginate()` method: + +```py +from dlt.sources.helpers.rest_client.auth import BearerTokenAuth + +def response_hook(response, **kwargs): + print(response.status_code) + print(f"Content: {response.content}") + print(f"Request: {response.request.body}") + # Or import pdb; pdb.set_trace() to debug + +for page in client.paginate( + "/posts", + auth=BearerTokenAuth(token="your_access_token"), # type: ignore + hooks={"response": [response_hook]} +): + print(page) +``` diff --git a/docs/website/docs/intro.md b/docs/website/docs/intro.md index 776329bcf4..0374802b7d 100644 --- a/docs/website/docs/intro.md +++ b/docs/website/docs/intro.md @@ -32,6 +32,10 @@ The library will create or update tables, infer data types, and handle nested da ]}> +:::tip +Looking to use a REST API as a source? Explore our new [REST API generic source](dlt-ecosystem/verified-sources/rest_api) for a declarative way to load data. +::: + diff --git a/docs/website/docs/running-in-production/running.md b/docs/website/docs/running-in-production/running.md index 253a27d942..9c52f58caa 100644 --- a/docs/website/docs/running-in-production/running.md +++ b/docs/website/docs/running-in-production/running.md @@ -108,6 +108,12 @@ behind. In `config.toml`: load.delete_completed_jobs=true ``` +Also, by default, `dlt` leaves data in staging dataset, used during merge and replace load for deduplication. In order to clear it, put the following line in `config.toml`: + +```toml +load.truncate_staging_dataset=true +``` + ## Using slack to send messages `dlt` provides basic support for sending slack messages. You can configure Slack incoming hook via diff --git a/docs/website/docs/tutorial/grouping-resources.md b/docs/website/docs/tutorial/grouping-resources.md index 3a05f7940c..3ba95b7971 100644 --- a/docs/website/docs/tutorial/grouping-resources.md +++ b/docs/website/docs/tutorial/grouping-resources.md @@ -14,6 +14,9 @@ This tutorial continues the [previous](load-data-from-an-api) part. We'll use th In the previous tutorial, we loaded issues from the GitHub API. Now we'll prepare to load comments from the API as well. Here's a sample [dlt resource](../general-usage/resource) that does that: ```py +import dlt +from dlt.sources.helpers.rest_client import paginate + @dlt.resource( table_name="comments", write_disposition="merge", @@ -22,17 +25,11 @@ In the previous tutorial, we loaded issues from the GitHub API. Now we'll prepar def get_comments( updated_at = dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") ): - url = "https://api.github.com/repos/dlt-hub/dlt/comments?per_page=100" - - while True: - response = requests.get(url) - response.raise_for_status() - yield response.json() - - # get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] + for page in paginate( + "https://api.github.com/repos/dlt-hub/dlt/comments", + params={"per_page": 100} + ): + yield page ``` We can load this resource separately from the issues resource, however loading both issues and comments in one go is more efficient. 
To do that, we'll use the `@dlt.source` decorator on a function that returns a list of resources: @@ -47,7 +44,7 @@ def github_source(): ```py import dlt -from dlt.sources.helpers import requests +from dlt.sources.helpers.rest_client import paginate @dlt.resource( table_name="issues", @@ -57,21 +54,17 @@ from dlt.sources.helpers import requests def get_issues( updated_at = dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") ): - url = ( - "https://api.github.com/repos/dlt-hub/dlt/issues" - f"?since={updated_at.last_value}&per_page=100" - "&sort=updated&directions=desc&state=open" - ) - - while True: - response = requests.get(url) - response.raise_for_status() - yield response.json() - - # Get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] + for page in paginate( + "https://api.github.com/repos/dlt-hub/dlt/issues", + params={ + "since": updated_at.last_value, + "per_page": 100, + "sort": "updated", + "directions": "desc", + "state": "open", + } + ): + yield page @dlt.resource( @@ -82,20 +75,14 @@ def get_issues( def get_comments( updated_at = dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") ): - url = ( - "https://api.github.com/repos/dlt-hub/dlt/comments" - "?per_page=100" - ) - - while True: - response = requests.get(url) - response.raise_for_status() - yield response.json() - - # Get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] + for page in paginate( + "https://api.github.com/repos/dlt-hub/dlt/comments", + params={ + "since": updated_at.last_value, + "per_page": 100, + } + ): + yield page @dlt.source @@ -124,18 +111,8 @@ from dlt.sources.helpers import requests BASE_GITHUB_URL = "https://api.github.com/repos/dlt-hub/dlt" def fetch_github_data(endpoint, params={}): - """Fetch data from GitHub API based on endpoint and params.""" url = f"{BASE_GITHUB_URL}/{endpoint}" - - while True: - response = requests.get(url, params=params) - response.raise_for_status() - yield response.json() - - # Get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] + return paginate(url, params=params) @dlt.source def github_source(): @@ -164,21 +141,16 @@ For the next step we'd want to get the [number of repository clones](https://doc Let's handle this by changing our `fetch_github_data()` first: ```py -def fetch_github_data(endpoint, params={}, access_token=None): - """Fetch data from GitHub API based on endpoint and params.""" - headers = {"Authorization": f"Bearer {access_token}"} if access_token else {} +from dlt.sources.helpers.rest_client.auth import BearerTokenAuth +def fetch_github_data(endpoint, params={}, access_token=None): url = f"{BASE_GITHUB_URL}/{endpoint}" + return paginate( + url, + params=params, + auth=BearerTokenAuth(token=access_token) if access_token else None, + ) - while True: - response = requests.get(url, params=params, headers=headers) - response.raise_for_status() - yield response.json() - - # Get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] @dlt.source def github_source(access_token): @@ -229,28 +201,7 @@ access_token = "ghp_A...3aRY" Now we can run the script and it will load the data from the `traffic/clones` endpoint: ```py -import dlt -from dlt.sources.helpers import requests - -BASE_GITHUB_URL = "https://api.github.com/repos/dlt-hub/dlt" - - -def fetch_github_data(endpoint, params={}, access_token=None): - """Fetch data from GitHub API based on endpoint 
and params.""" - headers = {"Authorization": f"Bearer {access_token}"} if access_token else {} - - url = f"{BASE_GITHUB_URL}/{endpoint}" - - while True: - response = requests.get(url, params=params, headers=headers) - response.raise_for_status() - yield response.json() - - # get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] - +... @dlt.source def github_source( @@ -287,19 +238,12 @@ BASE_GITHUB_URL = "https://api.github.com/repos/{repo_name}" def fetch_github_data(repo_name, endpoint, params={}, access_token=None): """Fetch data from GitHub API based on repo_name, endpoint, and params.""" - headers = {"Authorization": f"Bearer {access_token}"} if access_token else {} - url = BASE_GITHUB_URL.format(repo_name=repo_name) + f"/{endpoint}" - - while True: - response = requests.get(url, params=params, headers=headers) - response.raise_for_status() - yield response.json() - - # Get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] + return paginate( + url, + params=params, + auth=BearerTokenAuth(token=access_token) if access_token else None, + ) @dlt.source @@ -347,5 +291,6 @@ Interested in learning more? Here are some suggestions: - [Pass config and credentials into your sources and resources](../general-usage/credentials). - [Run in production: inspecting, tracing, retry policies and cleaning up](../running-in-production/running). - [Run resources in parallel, optimize buffers and local storage](../reference/performance.md) + - [Use REST API client helpers](../general-usage/http/rest-client.md) to simplify working with REST APIs. 3. Check out our [how-to guides](../walkthroughs) to get answers to some common questions. 4. Explore the [Examples](../examples) section to see how dlt can be used in real-world scenarios diff --git a/docs/website/docs/tutorial/load-data-from-an-api.md b/docs/website/docs/tutorial/load-data-from-an-api.md index 31a2c1592d..ec6136b6d3 100644 --- a/docs/website/docs/tutorial/load-data-from-an-api.md +++ b/docs/website/docs/tutorial/load-data-from-an-api.md @@ -44,7 +44,7 @@ dlt pipeline github_issues show ## Append or replace your data -Try running the pipeline again with `python github_issues.py`. You will notice that the **issues** table contains two copies of the same data. This happens because the default load mode is `append`. It is very useful, for example, when you have a new folder created daily with `json` file logs, and you want to ingest them. +Try running the pipeline again with `python github_issues.py`. You will notice that the **issues** table contains two copies of the same data. This happens because the default load mode is `append`. It is very useful, for example, when you have daily data updates and you want to ingest them. To get the latest data, we'd need to run the script again. But how to do that without duplicating the data? One option is to tell `dlt` to replace the data in existing tables in the destination by using `replace` write disposition. Change the `github_issues.py` script to the following: @@ -148,6 +148,55 @@ and `updated_at.last_value` to tell GitHub to return issues updated only **after [Learn more about merge write disposition](../general-usage/incremental-loading#merge-incremental_loading). +## Using pagination helper + +In the previous examples, we used the `requests` library to make HTTP requests to the GitHub API and handled pagination manually. 
`dlt` has the built-in [REST client](../general-usage/http/rest-client.md) that simplifies API requests. We'll pick the `paginate()` helper from it for the next example. The `paginate` function takes a URL and optional parameters (quite similar to `requests`) and returns a generator that yields pages of data. + +Here's how the updated script looks: + +```py +import dlt +from dlt.sources.helpers.rest_client import paginate + +@dlt.resource( + table_name="issues", + write_disposition="merge", + primary_key="id", +) +def get_issues( + updated_at=dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") +): + for page in paginate( + "https://api.github.com/repos/dlt-hub/dlt/issues", + params={ + "since": updated_at.last_value, + "per_page": 100, + "sort": "updated", + "direction": "desc", + "state": "open", + }, + ): + yield page + +pipeline = dlt.pipeline( + pipeline_name="github_issues_merge", + destination="duckdb", + dataset_name="github_data_merge", +) +load_info = pipeline.run(get_issues) +row_counts = pipeline.last_trace.last_normalize_info + +print(row_counts) +print("------") +print(load_info) +``` + +Let's zoom in on the changes: + +1. The `while` loop that handled pagination is replaced with reading pages from the `paginate()` generator. +2. `paginate()` takes the URL of the API endpoint and optional parameters. In this case, we pass the `since` parameter to get only issues updated after the last pipeline run. +3. We're not explicitly setting up pagination, `paginate()` handles it for us. Magic! Under the hood, `paginate()` analyzes the response and detects the pagination method used by the API. Read more about pagination in the [REST client documentation](../general-usage/http/rest-client.md#paginating-api-responses). + ## Next steps Continue your journey with the [Resource Grouping and Secrets](grouping-resources) tutorial. diff --git a/docs/website/docs/walkthroughs/adjust-a-schema.md b/docs/website/docs/walkthroughs/adjust-a-schema.md index cfe2d056b0..b0a9a9ce05 100644 --- a/docs/website/docs/walkthroughs/adjust-a-schema.md +++ b/docs/website/docs/walkthroughs/adjust-a-schema.md @@ -121,6 +121,32 @@ Do not rename the tables or columns in the yaml file. `dlt` infers those from th You can [adjust the schema](../general-usage/resource.md#adjust-schema) in Python before resource is loaded. ::: +### Reorder columns +To reorder the columns in your dataset, follow these steps: + +1. Initial Run: Execute the pipeline to obtain the import and export schemas. +1. Modify Export Schema: Adjust the column order as desired in the export schema. +1. Sync Import Schema: Ensure that these changes are mirrored in the import schema to maintain consistency. +1. Delete Dataset: Remove the existing dataset to prepare for the reload. +1. Reload Data: Reload the data. The dataset should now reflect the new column order as specified in the import YAML. + +These steps ensure that the column order in your dataset matches your specifications. + +**Another approach** to reorder columns is to use the `add_map` function. For instance, to rearrange ‘column1’, ‘column2’, and ‘column3’, you can proceed as follows: + +```py +# Define the data source and reorder columns using add_map +data_source = resource().add_map(lambda row: { + 'column3': row['column3'], + 'column1': row['column1'], + 'column2': row['column2'] +}) + +# Run the pipeline +load_info = pipeline.run(data_source) +``` + +In this example, the `add_map` function reorders columns by defining a new mapping. 
The lambda function specifies the desired order by rearranging the key-value pairs. When the pipeline runs, the data will load with the columns in the new order. ### Load data as json instead of generating child table or columns from flattened dicts diff --git a/docs/website/docs/walkthroughs/create-a-pipeline.md b/docs/website/docs/walkthroughs/create-a-pipeline.md index 1d5974efbe..cbbbd73fc3 100644 --- a/docs/website/docs/walkthroughs/create-a-pipeline.md +++ b/docs/website/docs/walkthroughs/create-a-pipeline.md @@ -1,31 +1,46 @@ --- title: Create a pipeline description: How to create a pipeline -keywords: [how to, create a pipeline] +keywords: [how to, create a pipeline, rest client] --- # Create a pipeline -Follow the steps below to create a [pipeline](../general-usage/glossary.md#pipeline) from the -WeatherAPI.com API to DuckDB from scratch. The same steps can be repeated for any source and -destination of your choice—use `dlt init ` and then build the pipeline for -that API instead. +This guide walks you through creating a pipeline that uses our [REST API Client](../general-usage/http/rest-client) +to connect to [DuckDB](../dlt-ecosystem/destinations/duckdb). +:::tip +We're using DuckDB as a destination here, but you can adapt the steps to any [source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/) and [destination](https://dlthub.com/docs/dlt-ecosystem/destinations/) by +using the [command](../reference/command-line-interface#dlt-init) `dlt init ` and tweaking the pipeline accordingly. +::: -Please make sure you have [installed `dlt`](../reference/installation.md) before following the +Please make sure you have [installed `dlt`](../reference/installation) before following the steps below. +## Task overview + +Imagine you want to analyze issues from a GitHub project locally. +To achieve this, you need to write code that accomplishes the following: + +1. Constructs a correct request. +2. Authenticates your request. +3. Fetches and handles paginated issue data. +4. Stores the data for analysis. + +This may sound complicated, but dlt provides a [REST API Client](../general-usage/http/rest-client) that allows you to focus more on your data rather than on managing API interactions. + + ## 1. Initialize project Create a new empty directory for your `dlt` project by running: ```sh -mkdir weatherapi_duckdb && cd weatherapi_duckdb +mkdir github_api_duckdb && cd github_api_duckdb ``` Start a `dlt` project with a pipeline template that loads data to DuckDB by running: ```sh -dlt init weatherapi duckdb +dlt init github_api duckdb ``` Install the dependencies necessary for DuckDB: @@ -34,114 +49,127 @@ Install the dependencies necessary for DuckDB: pip install -r requirements.txt ``` -## 2. Add WeatherAPI.com API credentials +## 2. Obtain and add API credentials from GitHub -You will need to [sign up for the WeatherAPI.com API](https://www.weatherapi.com/signup.aspx). +You will need to [sign in](https://github.com/login) to your GitHub account and create your access token via [Personal access tokens page](https://github.com/settings/tokens). -Once you do this, you should see your `API Key` at the top of your -[user page](https://www.weatherapi.com/my/). - -Copy the value of the API key into `.dlt/secrets.toml`: +Copy your new access token over to `.dlt/secrets.toml`: ```toml [sources] api_secret_key = '' ``` -The **secret name** corresponds to the **argument name** in the source function. 
Below `api_secret_key` [will get its value](../general-usage/credentials/configuration.md#general-usage-and-an-example) from `secrets.toml` when `weatherapi_source()` is called. + +This token will be used by `github_api_source()` to authenticate requests. + +The **secret name** corresponds to the **argument name** in the source function. +Below `api_secret_key` [will get its value](../general-usage/credentials/configuration#allow-dlt-to-pass-the-config-and-secrets-automatically) +from `secrets.toml` when `github_api_source()` is called. + + ```py @dlt.source -def weatherapi_source(api_secret_key=dlt.secrets.value): - ... +def github_api_source(api_secret_key: str = dlt.secrets.value): + return github_api_resource(api_secret_key=api_secret_key) ``` -Run the `weatherapi.py` pipeline script to test that authentication headers look fine: +Run the `github_api.py` pipeline script to test that authentication headers look fine: ```sh -python3 weatherapi.py +python github_api.py ``` Your API key should be printed out to stdout along with some test data. -## 3. Request data from the WeatherAPI.com API +## 3. Request project issues from the GitHub API -Replace the definition of the `weatherapi_resource` function definition in the `weatherapi.py` -pipeline script with a call to the WeatherAPI.com API: -```py -@dlt.resource(write_disposition="append") -def weatherapi_resource(api_secret_key=dlt.secrets.value): - url = "https://api.weatherapi.com/v1/current.json" - params = { - "q": "NYC", - "key": api_secret_key - } - response = requests.get(url, params=params) - response.raise_for_status() - yield response.json() -``` +:::tip +We will use the `dlt` repository (https://github.com/dlt-hub/dlt) as an example GitHub project; feel free to replace it with your own repository. +::: -Run the `weatherapi.py` pipeline script to test that the API call works: +Modify `github_api_resource` in `github_api.py` to request issues data from your GitHub project's API: -```sh -python3 weatherapi.py +```py +from dlt.sources.helpers.rest_client import paginate +from dlt.sources.helpers.rest_client.auth import BearerTokenAuth +from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator + +@dlt.resource(write_disposition="replace") +def github_api_resource(api_secret_key: str = dlt.secrets.value): + url = "https://api.github.com/repos/dlt-hub/dlt/issues" + + for page in paginate( + url, + auth=BearerTokenAuth(api_secret_key), # type: ignore + paginator=HeaderLinkPaginator(), + params={"state": "open"} + ): + yield page ``` -This should print out the weather in New York City right now. - ## 4.
Load the data -Remove the `exit()` call from the `main` function in `weatherapi.py`, so that running the -`python3 weatherapi.py` command will now also run the pipeline: +Uncomment the commented-out code in the `main` function in `github_api.py`, so that running the +`python github_api.py` command will now also run the pipeline: ```py if __name__=='__main__': - # configure the pipeline with your destination details pipeline = dlt.pipeline( - pipeline_name='weatherapi', + pipeline_name='github_api_pipeline', destination='duckdb', - dataset_name='weatherapi_data' + dataset_name='github_api_data' ) # print credentials by running the resource - data = list(weatherapi_resource()) + data = list(github_api_resource()) # print the data yielded from resource print(data) # run the pipeline with your parameters - load_info = pipeline.run(weatherapi_source()) + load_info = pipeline.run(github_api_source()) # pretty print the information on data that was loaded print(load_info) ``` -Run the `weatherapi.py` pipeline script to load data into DuckDB: + +Run the `github_api.py` pipeline script to test that the API call works: ```sh -python3 weatherapi.py +python github_api.py ``` -Then this command to see that the data loaded: +This should print out JSON data containing the issues in the GitHub project. + +It also prints the `load_info` object. + +Let's explore the loaded data with the [command](../reference/command-line-interface#show-tables-and-data-in-the-destination) `dlt pipeline show`. + +:::info +Make sure you have `streamlit` installed: `pip install streamlit` +::: ```sh -dlt pipeline weatherapi show +dlt pipeline github_api_pipeline show ``` This will open a Streamlit app that gives you an overview of the data loaded. ## 5. Next steps -Now that you have a working pipeline, you have options for what to learn next: +With a functioning pipeline, consider exploring: +- Our [REST Client](../general-usage/http/rest-client). - [Deploy this pipeline with GitHub Actions](deploy-a-pipeline/deploy-with-github-actions), so that the data is automatically loaded on a schedule. - Transform the [loaded data](../dlt-ecosystem/transformations) with dbt or in Pandas DataFrames. -- Learn how to [run](../running-in-production/running.md), - [monitor](../running-in-production/monitoring.md), and - [alert](../running-in-production/alerting.md) when you put your pipeline in production. +- Learn how to [run](../running-in-production/running), + [monitor](../running-in-production/monitoring), and + [alert](../running-in-production/alerting) when you put your pipeline in production. - Try loading data to a different destination like - [Google BigQuery](../dlt-ecosystem/destinations/bigquery.md), - [Amazon Redshift](../dlt-ecosystem/destinations/redshift.md), or - [Postgres](../dlt-ecosystem/destinations/postgres.md). + [Google BigQuery](../dlt-ecosystem/destinations/bigquery), + [Amazon Redshift](../dlt-ecosystem/destinations/redshift), or + [Postgres](../dlt-ecosystem/destinations/postgres).
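To dig into the loaded issues beyond the Streamlit preview from step 4, you can also query the DuckDB destination directly from Python. This is a hedged sketch: it reuses the pipeline and dataset names from this walkthrough and assumes the issues landed in a table named after the `github_api_resource` resource; adjust the table and column names to what `dlt pipeline github_api_pipeline show` reports.

```py
import dlt

# Re-attach to the pipeline created in this walkthrough by reusing its name.
pipeline = dlt.pipeline(
    pipeline_name="github_api_pipeline",
    destination="duckdb",
    dataset_name="github_api_data",
)

# Query the destination through the pipeline's SQL client. The table name
# assumes dlt's default of naming tables after the resource.
with pipeline.sql_client() as client:
    rows = client.execute_sql(
        "SELECT title, state FROM github_api_resource LIMIT 5"
    )
    for row in rows:
        print(row)
```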
diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index a3fe12c8fb..d3d7def8fc 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -84,6 +84,7 @@ const sidebars = { 'dlt-ecosystem/verified-sources/personio', 'dlt-ecosystem/verified-sources/pipedrive', 'dlt-ecosystem/verified-sources/rest_api', + 'dlt-ecosystem/verified-sources/openapi-generator', 'dlt-ecosystem/verified-sources/salesforce', 'dlt-ecosystem/verified-sources/scrapy', 'dlt-ecosystem/verified-sources/shopify', diff --git a/poetry.lock b/poetry.lock index dcab5e1730..6159f751c4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2548,106 +2548,58 @@ dates = ["pytz (>=2019.1)"] [[package]] name = "duckdb" -version = "0.9.2" -description = "DuckDB embedded database" -optional = false -python-versions = ">=3.7.0" -files = [ - {file = "duckdb-0.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:aadcea5160c586704c03a8a796c06a8afffbefefb1986601104a60cb0bfdb5ab"}, - {file = "duckdb-0.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:08215f17147ed83cbec972175d9882387366de2ed36c21cbe4add04b39a5bcb4"}, - {file = "duckdb-0.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ee6c2a8aba6850abef5e1be9dbc04b8e72a5b2c2b67f77892317a21fae868fe7"}, - {file = "duckdb-0.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ff49f3da9399900fd58b5acd0bb8bfad22c5147584ad2427a78d937e11ec9d0"}, - {file = "duckdb-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd5ac5baf8597efd2bfa75f984654afcabcd698342d59b0e265a0bc6f267b3f0"}, - {file = "duckdb-0.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:81c6df905589a1023a27e9712edb5b724566587ef280a0c66a7ec07c8083623b"}, - {file = "duckdb-0.9.2-cp310-cp310-win32.whl", hash = "sha256:a298cd1d821c81d0dec8a60878c4b38c1adea04a9675fb6306c8f9083bbf314d"}, - {file = "duckdb-0.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:492a69cd60b6cb4f671b51893884cdc5efc4c3b2eb76057a007d2a2295427173"}, - {file = "duckdb-0.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:061a9ea809811d6e3025c5de31bc40e0302cfb08c08feefa574a6491e882e7e8"}, - {file = "duckdb-0.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a43f93be768af39f604b7b9b48891f9177c9282a408051209101ff80f7450d8f"}, - {file = "duckdb-0.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ac29c8c8f56fff5a681f7bf61711ccb9325c5329e64f23cb7ff31781d7b50773"}, - {file = "duckdb-0.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b14d98d26bab139114f62ade81350a5342f60a168d94b27ed2c706838f949eda"}, - {file = "duckdb-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:796a995299878913e765b28cc2b14c8e44fae2f54ab41a9ee668c18449f5f833"}, - {file = "duckdb-0.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6cb64ccfb72c11ec9c41b3cb6181b6fd33deccceda530e94e1c362af5f810ba1"}, - {file = "duckdb-0.9.2-cp311-cp311-win32.whl", hash = "sha256:930740cb7b2cd9e79946e1d3a8f66e15dc5849d4eaeff75c8788d0983b9256a5"}, - {file = "duckdb-0.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:c28f13c45006fd525001b2011cdf91fa216530e9751779651e66edc0e446be50"}, - {file = "duckdb-0.9.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:fbce7bbcb4ba7d99fcec84cec08db40bc0dd9342c6c11930ce708817741faeeb"}, - {file = "duckdb-0.9.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15a82109a9e69b1891f0999749f9e3265f550032470f51432f944a37cfdc908b"}, - {file = 
"duckdb-0.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9490fb9a35eb74af40db5569d90df8a04a6f09ed9a8c9caa024998c40e2506aa"}, - {file = "duckdb-0.9.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:696d5c6dee86c1a491ea15b74aafe34ad2b62dcd46ad7e03b1d00111ca1a8c68"}, - {file = "duckdb-0.9.2-cp37-cp37m-win32.whl", hash = "sha256:4f0935300bdf8b7631ddfc838f36a858c1323696d8c8a2cecbd416bddf6b0631"}, - {file = "duckdb-0.9.2-cp37-cp37m-win_amd64.whl", hash = "sha256:0aab900f7510e4d2613263865570203ddfa2631858c7eb8cbed091af6ceb597f"}, - {file = "duckdb-0.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7d8130ed6a0c9421b135d0743705ea95b9a745852977717504e45722c112bf7a"}, - {file = "duckdb-0.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:974e5de0294f88a1a837378f1f83330395801e9246f4e88ed3bfc8ada65dcbee"}, - {file = "duckdb-0.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4fbc297b602ef17e579bb3190c94d19c5002422b55814421a0fc11299c0c1100"}, - {file = "duckdb-0.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1dd58a0d84a424924a35b3772419f8cd78a01c626be3147e4934d7a035a8ad68"}, - {file = "duckdb-0.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11a1194a582c80dfb57565daa06141727e415ff5d17e022dc5f31888a5423d33"}, - {file = "duckdb-0.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:be45d08541002a9338e568dca67ab4f20c0277f8f58a73dfc1435c5b4297c996"}, - {file = "duckdb-0.9.2-cp38-cp38-win32.whl", hash = "sha256:dd6f88aeb7fc0bfecaca633629ff5c986ac966fe3b7dcec0b2c48632fd550ba2"}, - {file = "duckdb-0.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:28100c4a6a04e69aa0f4a6670a6d3d67a65f0337246a0c1a429f3f28f3c40b9a"}, - {file = "duckdb-0.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7ae5bf0b6ad4278e46e933e51473b86b4b932dbc54ff097610e5b482dd125552"}, - {file = "duckdb-0.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e5d0bb845a80aa48ed1fd1d2d285dd352e96dc97f8efced2a7429437ccd1fe1f"}, - {file = "duckdb-0.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ce262d74a52500d10888110dfd6715989926ec936918c232dcbaddb78fc55b4"}, - {file = "duckdb-0.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6935240da090a7f7d2666f6d0a5e45ff85715244171ca4e6576060a7f4a1200e"}, - {file = "duckdb-0.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5cfb93e73911696a98b9479299d19cfbc21dd05bb7ab11a923a903f86b4d06e"}, - {file = "duckdb-0.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:64e3bc01751f31e7572d2716c3e8da8fe785f1cdc5be329100818d223002213f"}, - {file = "duckdb-0.9.2-cp39-cp39-win32.whl", hash = "sha256:6e5b80f46487636368e31b61461940e3999986359a78660a50dfdd17dd72017c"}, - {file = "duckdb-0.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:e6142a220180dbeea4f341708bd5f9501c5c962ce7ef47c1cadf5e8810b4cb13"}, - {file = "duckdb-0.9.2.tar.gz", hash = "sha256:3843afeab7c3fc4a4c0b53686a4cc1d9cdbdadcbb468d60fef910355ecafd447"}, -] - -[[package]] -name = "duckdb" -version = "0.10.0" +version = "0.10.3" description = "DuckDB in-process database" optional = false python-versions = ">=3.7.0" files = [ - {file = "duckdb-0.10.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bd0ffb3fddef0f72a150e4d76e10942a84a1a0447d10907df1621b90d6668060"}, - {file = "duckdb-0.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f3d709d5c7c1a12b5e10d0b05fa916c670cd2b50178e3696faa0cc16048a1745"}, - {file = "duckdb-0.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:9114aa22ec5d591a20ce5184be90f49d8e5b5348ceaab21e102c54560d07a5f8"}, - {file = "duckdb-0.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77a37877efadf39caf7cadde0f430fedf762751b9c54750c821e2f1316705a21"}, - {file = "duckdb-0.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87cbc9e1d9c3fc9f14307bea757f99f15f46843c0ab13a6061354410824ed41f"}, - {file = "duckdb-0.10.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f0bfec79fed387201550517d325dff4fad2705020bc139d936cab08b9e845662"}, - {file = "duckdb-0.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c5622134d2d9796b15e09de810e450859d4beb46d9b861357ec9ae40a61b775c"}, - {file = "duckdb-0.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:089ee8e831ccaef1b73fc89c43b661567175eed0115454880bafed5e35cda702"}, - {file = "duckdb-0.10.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a05af63747f1d7021995f0811c333dee7316cec3b06c0d3e4741b9bdb678dd21"}, - {file = "duckdb-0.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:072d6eba5d8a59e0069a8b5b4252fed8a21f9fe3f85a9129d186a39b3d0aea03"}, - {file = "duckdb-0.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a77b85668f59b919042832e4659538337f1c7f197123076c5311f1c9cf077df7"}, - {file = "duckdb-0.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96a666f1d2da65d03199a977aec246920920a5ea1da76b70ae02bd4fb1ffc48c"}, - {file = "duckdb-0.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ec76a4262b783628d26612d184834852d9c92fb203e91af789100c17e3d7173"}, - {file = "duckdb-0.10.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:009dd9d2cdbd3b061a9efbdfc79f2d1a8377bcf49f1e5f430138621f8c083a6c"}, - {file = "duckdb-0.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:878f06766088090dad4a2e5ee0081555242b2e8dcb29415ecc97e388cf0cf8d8"}, - {file = "duckdb-0.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:713ff0a1fb63a6d60f454acf67f31656549fb5d63f21ac68314e4f522daa1a89"}, - {file = "duckdb-0.10.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9c0ee450dfedfb52dd4957244e31820feef17228da31af6d052979450a80fd19"}, - {file = "duckdb-0.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ff79b2ea9994398b545c0d10601cd73565fbd09f8951b3d8003c7c5c0cebc7cb"}, - {file = "duckdb-0.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6bdf1aa71b924ef651062e6b8ff9981ad85bec89598294af8a072062c5717340"}, - {file = "duckdb-0.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0265bbc8216be3ced7b377ba8847128a3fc0ef99798a3c4557c1b88e3a01c23"}, - {file = "duckdb-0.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d418a315a07707a693bd985274c0f8c4dd77015d9ef5d8d3da4cc1942fd82e0"}, - {file = "duckdb-0.10.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2828475a292e68c71855190b818aded6bce7328f79e38c04a0c75f8f1c0ceef0"}, - {file = "duckdb-0.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:c3aaeaae2eba97035c65f31ffdb18202c951337bf2b3d53d77ce1da8ae2ecf51"}, - {file = "duckdb-0.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:c51790aaaea97d8e4a58a114c371ed8d2c4e1ca7cbf29e3bdab6d8ccfc5afc1e"}, - {file = "duckdb-0.10.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8af1ae7cc77a12206b6c47ade191882cc8f49f750bb3e72bb86ac1d4fa89926a"}, - {file = 
"duckdb-0.10.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa4f7e8e8dc0e376aeb280b83f2584d0e25ec38985c27d19f3107b2edc4f4a97"}, - {file = "duckdb-0.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28ae942a79fad913defa912b56483cd7827a4e7721f4ce4bc9025b746ecb3c89"}, - {file = "duckdb-0.10.0-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:01b57802898091455ca2a32c1335aac1e398da77c99e8a96a1e5de09f6a0add9"}, - {file = "duckdb-0.10.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:52e1ad4a55fa153d320c367046b9500578192e01c6d04308ba8b540441736f2c"}, - {file = "duckdb-0.10.0-cp37-cp37m-win_amd64.whl", hash = "sha256:904c47d04095af745e989c853f0bfc0776913dfc40dfbd2da7afdbbb5f67fed0"}, - {file = "duckdb-0.10.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:184ae7ea5874f3b8fa51ab0f1519bdd088a0b78c32080ee272b1d137e2c8fd9c"}, - {file = "duckdb-0.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bd33982ecc9bac727a032d6cedced9f19033cbad56647147408891eb51a6cb37"}, - {file = "duckdb-0.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f59bf0949899105dd5f8864cb48139bfb78454a8c017b8258ba2b5e90acf7afc"}, - {file = "duckdb-0.10.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:395f3b18948001e35dceb48a4423d574e38656606d033eef375408b539e7b076"}, - {file = "duckdb-0.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b8eb2b803be7ee1df70435c33b03a4598cdaf676cd67ad782b288dcff65d781"}, - {file = "duckdb-0.10.0-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:31b2ddd331801064326c8e3587a4db8a31d02aef11332c168f45b3bd92effb41"}, - {file = "duckdb-0.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c8b89e76a041424b8c2026c5dc1f74b53fbbc6c6f650d563259885ab2e7d093d"}, - {file = "duckdb-0.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:79084a82f16c0a54f6bfb7ded5600400c2daa90eb0d83337d81a56924eaee5d4"}, - {file = "duckdb-0.10.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:79799b3a270dcd9070f677ba510f1e66b112df3068425691bac97c5e278929c7"}, - {file = "duckdb-0.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8fc394bfe3434920cdbcfbdd0ac3ba40902faa1dbda088db0ba44003a45318a"}, - {file = "duckdb-0.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c116605551b4abf5786243a59bcef02bd69cc51837d0c57cafaa68cdc428aa0c"}, - {file = "duckdb-0.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3191170c3b0a43b0c12644800326f5afdea00d5a4621d59dbbd0c1059139e140"}, - {file = "duckdb-0.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fee69a50eb93c72dc77e7ab1fabe0c38d21a52c5da44a86aa217081e38f9f1bd"}, - {file = "duckdb-0.10.0-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c5f449e87dacb16b0d145dbe65fa6fdb5a55b2b6911a46d74876e445dd395bac"}, - {file = "duckdb-0.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4487d0df221b17ea4177ad08131bc606b35f25cfadf890987833055b9d10cdf6"}, - {file = "duckdb-0.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:c099ae2ff8fe939fda62da81704f91e2f92ac45e48dc0e37c679c9d243d01e65"}, - {file = "duckdb-0.10.0.tar.gz", hash = "sha256:c02bcc128002aa79e3c9d89b9de25e062d1096a8793bc0d7932317b7977f6845"}, + {file = "duckdb-0.10.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd25cc8d001c09a19340739ba59d33e12a81ab285b7a6bed37169655e1cefb31"}, + {file = "duckdb-0.10.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:2f9259c637b917ca0f4c63887e8d9b35ec248f5d987c886dfc4229d66a791009"}, + {file = "duckdb-0.10.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b48f5f1542f1e4b184e6b4fc188f497be8b9c48127867e7d9a5f4a3e334f88b0"}, + {file = "duckdb-0.10.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e327f7a3951ea154bb56e3fef7da889e790bd9a67ca3c36afc1beb17d3feb6d6"}, + {file = "duckdb-0.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d8b20ed67da004b4481973f4254fd79a0e5af957d2382eac8624b5c527ec48c"}, + {file = "duckdb-0.10.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d37680b8d7be04e4709db3a66c8b3eb7ceba2a5276574903528632f2b2cc2e60"}, + {file = "duckdb-0.10.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3d34b86d6a2a6dfe8bb757f90bfe7101a3bd9e3022bf19dbddfa4b32680d26a9"}, + {file = "duckdb-0.10.3-cp310-cp310-win_amd64.whl", hash = "sha256:73b1cb283ca0f6576dc18183fd315b4e487a545667ffebbf50b08eb4e8cdc143"}, + {file = "duckdb-0.10.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d917dde19fcec8cadcbef1f23946e85dee626ddc133e1e3f6551f15a61a03c61"}, + {file = "duckdb-0.10.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:46757e0cf5f44b4cb820c48a34f339a9ccf83b43d525d44947273a585a4ed822"}, + {file = "duckdb-0.10.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:338c14d8ac53ac4aa9ec03b6f1325ecfe609ceeb72565124d489cb07f8a1e4eb"}, + {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:651fcb429602b79a3cf76b662a39e93e9c3e6650f7018258f4af344c816dab72"}, + {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3ae3c73b98b6215dab93cc9bc936b94aed55b53c34ba01dec863c5cab9f8e25"}, + {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56429b2cfe70e367fb818c2be19f59ce2f6b080c8382c4d10b4f90ba81f774e9"}, + {file = "duckdb-0.10.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b46c02c2e39e3676b1bb0dc7720b8aa953734de4fd1b762e6d7375fbeb1b63af"}, + {file = "duckdb-0.10.3-cp311-cp311-win_amd64.whl", hash = "sha256:bcd460feef56575af2c2443d7394d405a164c409e9794a4d94cb5fdaa24a0ba4"}, + {file = "duckdb-0.10.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:e229a7c6361afbb0d0ab29b1b398c10921263c52957aefe3ace99b0426fdb91e"}, + {file = "duckdb-0.10.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:732b1d3b6b17bf2f32ea696b9afc9e033493c5a3b783c292ca4b0ee7cc7b0e66"}, + {file = "duckdb-0.10.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f5380d4db11fec5021389fb85d614680dc12757ef7c5881262742250e0b58c75"}, + {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:468a4e0c0b13c55f84972b1110060d1b0f854ffeb5900a178a775259ec1562db"}, + {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fa1e7ff8d18d71defa84e79f5c86aa25d3be80d7cb7bc259a322de6d7cc72da"}, + {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ed1063ed97c02e9cf2e7fd1d280de2d1e243d72268330f45344c69c7ce438a01"}, + {file = "duckdb-0.10.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:22f2aad5bb49c007f3bfcd3e81fdedbc16a2ae41f2915fc278724ca494128b0c"}, + {file = "duckdb-0.10.3-cp312-cp312-win_amd64.whl", hash = "sha256:8f9e2bb00a048eb70b73a494bdc868ce7549b342f7ffec88192a78e5a4e164bd"}, + {file = "duckdb-0.10.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = 
"sha256:a6c2fc49875b4b54e882d68703083ca6f84b27536d57d623fc872e2f502b1078"}, + {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a66c125d0c30af210f7ee599e7821c3d1a7e09208196dafbf997d4e0cfcb81ab"}, + {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d99dd7a1d901149c7a276440d6e737b2777e17d2046f5efb0c06ad3b8cb066a6"}, + {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5ec3bbdb209e6095d202202893763e26c17c88293b88ef986b619e6c8b6715bd"}, + {file = "duckdb-0.10.3-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:2b3dec4ef8ed355d7b7230b40950b30d0def2c387a2e8cd7efc80b9d14134ecf"}, + {file = "duckdb-0.10.3-cp37-cp37m-win_amd64.whl", hash = "sha256:04129f94fb49bba5eea22f941f0fb30337f069a04993048b59e2811f52d564bc"}, + {file = "duckdb-0.10.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:d75d67024fc22c8edfd47747c8550fb3c34fb1cbcbfd567e94939ffd9c9e3ca7"}, + {file = "duckdb-0.10.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f3796e9507c02d0ddbba2e84c994fae131da567ce3d9cbb4cbcd32fadc5fbb26"}, + {file = "duckdb-0.10.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:78e539d85ebd84e3e87ec44d28ad912ca4ca444fe705794e0de9be3dd5550c11"}, + {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a99b67ac674b4de32073e9bc604b9c2273d399325181ff50b436c6da17bf00a"}, + {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1209a354a763758c4017a1f6a9f9b154a83bed4458287af9f71d84664ddb86b6"}, + {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3b735cea64aab39b67c136ab3a571dbf834067f8472ba2f8bf0341bc91bea820"}, + {file = "duckdb-0.10.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:816ffb9f758ed98eb02199d9321d592d7a32a6cb6aa31930f4337eb22cfc64e2"}, + {file = "duckdb-0.10.3-cp38-cp38-win_amd64.whl", hash = "sha256:1631184b94c3dc38b13bce4045bf3ae7e1b0ecbfbb8771eb8d751d8ffe1b59b3"}, + {file = "duckdb-0.10.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:fb98c35fc8dd65043bc08a2414dd9f59c680d7e8656295b8969f3f2061f26c52"}, + {file = "duckdb-0.10.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7e75c9f5b6a92b2a6816605c001d30790f6d67ce627a2b848d4d6040686efdf9"}, + {file = "duckdb-0.10.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae786eddf1c2fd003466e13393b9348a44b6061af6fe7bcb380a64cac24e7df7"}, + {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9387da7b7973707b0dea2588749660dd5dd724273222680e985a2dd36787668"}, + {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:538f943bf9fa8a3a7c4fafa05f21a69539d2c8a68e557233cbe9d989ae232899"}, + {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6930608f35025a73eb94252964f9f19dd68cf2aaa471da3982cf6694866cfa63"}, + {file = "duckdb-0.10.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:03bc54a9cde5490918aad82d7d2a34290e3dfb78d5b889c6626625c0f141272a"}, + {file = "duckdb-0.10.3-cp39-cp39-win_amd64.whl", hash = "sha256:372b6e3901d85108cafe5df03c872dfb6f0dbff66165a0cf46c47246c1957aa0"}, + {file = "duckdb-0.10.3.tar.gz", hash = "sha256:c5bd84a92bc708d3a6adffe1f554b94c6e76c795826daaaf482afc3d9c636971"}, ] [[package]] @@ -9317,11 +9269,11 @@ clickhouse = ["adlfs", "clickhouse-connect", "clickhouse-driver", "gcsfs", "pyar databricks = 
["databricks-sql-connector"] dbt = ["dbt-athena-community", "dbt-bigquery", "dbt-core", "dbt-databricks", "dbt-duckdb", "dbt-redshift", "dbt-snowflake"] dremio = ["pyarrow"] -duckdb = ["duckdb", "duckdb"] +duckdb = ["duckdb"] filesystem = ["botocore", "s3fs"] gcp = ["gcsfs", "google-cloud-bigquery", "grpcio"] gs = ["gcsfs"] -motherduck = ["duckdb", "duckdb", "pyarrow"] +motherduck = ["duckdb", "pyarrow"] mssql = ["pyodbc"] parquet = ["pyarrow"] postgres = ["psycopg2-binary", "psycopg2cffi"] @@ -9335,4 +9287,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "c206bfd3eab8f0c9349398c3c0ed251490bab96254327cb800d45807f05d2997" +content-hash = "605b9b04ed3ae8b71c41eaf532d7bc8ce4f8135ef00593b5f01a82debc3e14c8" diff --git a/pyproject.toml b/pyproject.toml index 4bd62ce03b..cc18c37353 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,9 +1,9 @@ [tool.poetry] name = "dlt" -version = "0.4.11" +version = "0.4.12" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. "] -maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Ty Dunn "] +maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ] readme = "README.md" license = "Apache-2.0" homepage = "https://github.com/dlt-hub" @@ -13,6 +13,7 @@ classifiers = [ "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Topic :: Software Development :: Libraries", + "Typing :: Typed", "Operating System :: MacOS :: MacOS X", "Operating System :: POSIX :: Linux", "Operating System :: Microsoft :: Windows",] @@ -57,10 +58,12 @@ psycopg2cffi = {version = ">=2.9.0", optional = true, markers="platform_python_i grpcio = {version = ">=1.50.0", optional = true} google-cloud-bigquery = {version = ">=2.26.0", optional = true} pyarrow = {version = ">=12.0.0", optional = true} -duckdb = [ - {version = ">=0.6.1,<0.10.0", python = ">=3.8,<3.12", optional = true}, - {version = ">=0.10.0,<0.11.0", python = ">=3.12", optional = true} -] +duckdb = {version = ">=0.6.1,<0.11", optional = true} +# keep per-python version dependency as a reference +# duckdb = [ +# {version = ">=0.6.1,<0.10.0", python = ">=3.8,<3.12", optional = true}, +# {version = ">=0.10.0,<0.11.0", python = ">=3.12", optional = true} +# ] dbt-core = {version = ">=1.2.0", optional = true} dbt-redshift = {version = ">=1.2.0", optional = true} dbt-bigquery = {version = ">=1.2.0", optional = true} diff --git a/tests/common/configuration/test_configuration.py b/tests/common/configuration/test_configuration.py index 84b2d1893d..43ccdf856c 100644 --- a/tests/common/configuration/test_configuration.py +++ b/tests/common/configuration/test_configuration.py @@ -12,11 +12,12 @@ Optional, Type, Union, - TYPE_CHECKING, ) +from typing_extensions import Annotated from dlt.common import json, pendulum, Decimal, Wei from dlt.common.configuration.providers.provider import ConfigProvider +from dlt.common.configuration.specs.base_configuration import NotResolved, is_hint_not_resolved from dlt.common.configuration.specs.gcp_credentials import ( GcpServiceAccountCredentialsWithoutDefaults, ) @@ -917,6 +918,58 @@ def test_is_valid_hint() -> None: assert is_valid_hint(Wei) is True # any class type, except deriving from BaseConfiguration is wrong type assert is_valid_hint(ConfigFieldMissingException) is False + # but final and annotated types are not ok because they are not resolved + assert 
is_valid_hint(Final[ConfigFieldMissingException]) is True # type: ignore[arg-type] + assert is_valid_hint(Annotated[ConfigFieldMissingException, NotResolved()]) is True # type: ignore[arg-type] + assert is_valid_hint(Annotated[ConfigFieldMissingException, "REQ"]) is False # type: ignore[arg-type] + + +def test_is_not_resolved_hint() -> None: + assert is_hint_not_resolved(Final[ConfigFieldMissingException]) is True + assert is_hint_not_resolved(Annotated[ConfigFieldMissingException, NotResolved()]) is True + assert is_hint_not_resolved(Annotated[ConfigFieldMissingException, NotResolved(True)]) is True + assert is_hint_not_resolved(Annotated[ConfigFieldMissingException, NotResolved(False)]) is False + assert is_hint_not_resolved(Annotated[ConfigFieldMissingException, "REQ"]) is False + assert is_hint_not_resolved(str) is False + + +def test_not_resolved_hint() -> None: + class SentinelClass: + pass + + @configspec + class OptionalNotResolveConfiguration(BaseConfiguration): + trace: Final[Optional[SentinelClass]] = None + traces: Annotated[Optional[List[SentinelClass]], NotResolved()] = None + + c = resolve.resolve_configuration(OptionalNotResolveConfiguration()) + assert c.trace is None + assert c.traces is None + + s1 = SentinelClass() + s2 = SentinelClass() + + c = resolve.resolve_configuration(OptionalNotResolveConfiguration(s1, [s2])) + assert c.trace is s1 + assert c.traces[0] is s2 + + @configspec + class NotResolveConfiguration(BaseConfiguration): + trace: Final[SentinelClass] = None + traces: Annotated[List[SentinelClass], NotResolved()] = None + + with pytest.raises(ConfigFieldMissingException): + resolve.resolve_configuration(NotResolveConfiguration()) + + with pytest.raises(ConfigFieldMissingException): + resolve.resolve_configuration(NotResolveConfiguration(trace=s1)) + + with pytest.raises(ConfigFieldMissingException): + resolve.resolve_configuration(NotResolveConfiguration(traces=[s2])) + + c2 = resolve.resolve_configuration(NotResolveConfiguration(s1, [s2])) + assert c2.trace is s1 + assert c2.traces[0] is s2 def test_configspec_auto_base_config_derivation() -> None: diff --git a/tests/common/data_writers/utils.py b/tests/common/data_writers/utils.py index 2cb440bde1..e6e377b7d0 100644 --- a/tests/common/data_writers/utils.py +++ b/tests/common/data_writers/utils.py @@ -1,5 +1,5 @@ import os -from typing import Type +from typing import Type, Optional from dlt.common.data_writers.buffered import BufferedDataWriter from dlt.common.data_writers.writers import TWriter, ALL_WRITERS @@ -18,8 +18,8 @@ def get_writer( writer: Type[TWriter], buffer_max_items: int = 10, - file_max_items: int = 10, - file_max_bytes: int = None, + file_max_items: Optional[int] = 10, + file_max_bytes: Optional[int] = None, disable_compression: bool = False, caps: DestinationCapabilitiesContext = None, ) -> BufferedDataWriter[TWriter]: diff --git a/tests/extract/data_writers/test_buffered_writer.py b/tests/extract/data_writers/test_buffered_writer.py index 82b81a1cd7..b6da132de9 100644 --- a/tests/extract/data_writers/test_buffered_writer.py +++ b/tests/extract/data_writers/test_buffered_writer.py @@ -2,6 +2,7 @@ import pytest import time from typing import Iterator, Type +from uuid import uuid4 from dlt.common.data_writers.exceptions import BufferedDataWriterClosed from dlt.common.data_writers.writers import ( @@ -11,7 +12,7 @@ JsonlWriter, ALL_WRITERS, ) -from dlt.common.destination.capabilities import TLoaderFileFormat +from dlt.common.destination.capabilities import TLoaderFileFormat, 
DestinationCapabilitiesContext from dlt.common.schema.utils import new_column from dlt.common.storages.file_storage import FileStorage @@ -330,3 +331,38 @@ def test_special_write_rotates(disable_compression: bool, writer_type: Type[Data metrics = writer.import_file( "tests/extract/cases/imported.any", DataWriterMetrics("", 1, 231, 0, 0) ) + + +@pytest.mark.parametrize( + "disable_compression", [True, False], ids=["no_compression", "compression"] +) +@pytest.mark.parametrize("writer_type", ALL_OBJECT_WRITERS) +def test_rotation_on_destination_caps_recommended_file_size( + disable_compression: bool, writer_type: Type[DataWriter] +) -> None: + caps = DestinationCapabilitiesContext.generic_capabilities() + caps.recommended_file_size = int(250 * 1024) + columns = {"id": new_column("id", "text")} + with get_writer( + writer_type, + disable_compression=disable_compression, + buffer_max_items=100, + file_max_items=None, + file_max_bytes=None, + caps=caps, + ) as writer: + for i in range(8): + # Data chunk approximately 40kb serialized + items = [{"id": str(uuid4())} for _ in range(1000)] + writer.write_data_item(items, columns) + if i < 5: + assert not writer.closed_files + + if i > 5: + # We should have written atleast 250kb by now and have rotated the file + assert len(writer.closed_files) == 1 + + # Check the files that were written are all within the recommended size + 1 chunk + assert len(writer.closed_files) == 2 + for file in writer.closed_files: + assert file.file_size < caps.recommended_file_size + 1024 * 50 diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index 5e85552d73..c6a675a8d3 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -880,6 +880,17 @@ def rv_resource(name: str): assert list(r) == [1, 2, 3] +def test_standalone_resource_returning_resource_exception() -> None: + @dlt.resource(standalone=True) + def rv_resource(uniq_name: str = dlt.config.value): + return dlt.resource([1, 2, 3], name=uniq_name, primary_key="value") + + # pass through of the exception in `rv_resource` when it returns, not yields + with pytest.raises(ConfigFieldMissingException) as conf_ex: + rv_resource() + assert conf_ex.value.fields == ["uniq_name"] + + def test_resource_rename_credentials_separation(): os.environ["SOURCES__TEST_DECORATORS__STANDALONE_SIGNATURE__SECRET_END"] = "5" assert list(standalone_signature(1)) == [1, 2, 3, 4] diff --git a/tests/helpers/airflow_tests/test_airflow_wrapper.py b/tests/helpers/airflow_tests/test_airflow_wrapper.py index 845800e47f..533d16c998 100644 --- a/tests/helpers/airflow_tests/test_airflow_wrapper.py +++ b/tests/helpers/airflow_tests/test_airflow_wrapper.py @@ -384,7 +384,17 @@ def dag_parallel(): with mock.patch("dlt.helpers.airflow_helper.logger.warn") as warn_mock: dag_def = dag_parallel() dag_def.test() - warn_mock.assert_called_once() + warn_mock.assert_has_calls( + [ + mock.call( + "The resource resource2 in task" + " mock_data_incremental_source_resource1-resource2 is using incremental loading" + " and may modify the state. Resources that modify the state should not run in" + " parallel within the single pipeline as the state will not be correctly" + " merged. Please use 'serialize' or 'parallel-isolated' modes instead." 
+ ) + ] + ) def test_parallel_isolated_run(): diff --git a/tests/helpers/dbt_tests/test_runner_dbt_versions.py b/tests/helpers/dbt_tests/test_runner_dbt_versions.py index 5b9b07fcc5..a82345d732 100644 --- a/tests/helpers/dbt_tests/test_runner_dbt_versions.py +++ b/tests/helpers/dbt_tests/test_runner_dbt_versions.py @@ -43,16 +43,13 @@ def client() -> Iterator[PostgresClient]: PACKAGE_PARAMS = [ - # ("postgres", "1.1.3"), - # ("postgres", "1.2.4"), - # ("postgres", "1.3.2"), - # ("postgres", "1.4.0"), ("postgres", "1.5.2"), ("postgres", "1.6.13"), + ("postgres", "1.8.1"), ("postgres", None), - # ("snowflake", "1.4.0"), ("snowflake", "1.5.2"), ("snowflake", "1.6.13"), + ("snowflake", "1.8.1"), ("snowflake", None), ] PACKAGE_IDS = [ @@ -82,10 +79,10 @@ def test_infer_venv_deps() -> None: # provide version ranges requirements = _create_dbt_deps(["duckdb"], dbt_version=">3") # special duckdb dependency - assert requirements[:-1] == ["dbt-core>3", "dbt-duckdb", "duckdb==0.9.2"] + assert requirements[:-1] == ["dbt-core>3", "dbt-duckdb", "duckdb==0.10.3"] # we do not validate version ranges, pip will do it and fail when creating venv requirements = _create_dbt_deps(["motherduck"], dbt_version="y") - assert requirements[:-1] == ["dbt-corey", "dbt-duckdb", "duckdb==0.9.2"] + assert requirements[:-1] == ["dbt-corey", "dbt-duckdb", "duckdb==0.10.3"] def test_default_profile_name() -> None: diff --git a/tests/load/athena_iceberg/__init__.py b/tests/load/athena_iceberg/__init__.py index e69de29bb2..56e5d539c2 100644 --- a/tests/load/athena_iceberg/__init__.py +++ b/tests/load/athena_iceberg/__init__.py @@ -0,0 +1,4 @@ +from tests.utils import skip_if_not_active + + +skip_if_not_active("athena") diff --git a/tests/load/athena_iceberg/test_athena_adapter.py b/tests/load/athena_iceberg/test_athena_adapter.py new file mode 100644 index 0000000000..3144eb9cc9 --- /dev/null +++ b/tests/load/athena_iceberg/test_athena_adapter.py @@ -0,0 +1,69 @@ +import pytest + +import dlt +from dlt.destinations import filesystem +from dlt.destinations.impl.athena.athena_adapter import athena_adapter, athena_partition + +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + + +def test_iceberg_partition_hints(): + """Create a table with athena partition hints and check that the SQL is generated correctly.""" + + @dlt.resource(table_format="iceberg") + def partitioned_table(): + yield { + "product_id": 1, + "name": "product 1", + "created_at": "2021-01-01T00:00:00Z", + "category": "category 1", + "price": 100.0, + "quantity": 10, + } + + @dlt.resource(table_format="iceberg") + def not_partitioned_table(): + yield {"a": 1, "b": 2} + + athena_adapter( + partitioned_table, + partition=[ + "category", + athena_partition.month("created_at"), + athena_partition.bucket(10, "product_id"), + athena_partition.truncate(2, "name"), + ], + ) + + pipeline = dlt.pipeline( + "athena_test", + destination="athena", + staging=filesystem("s3://not-a-real-bucket"), + full_refresh=True, + ) + + pipeline.extract([partitioned_table, not_partitioned_table]) + pipeline.normalize() + + with pipeline._sql_job_client(pipeline.default_schema) as client: + sql_partitioned = client._get_table_update_sql( + "partitioned_table", + list(pipeline.default_schema.tables["partitioned_table"]["columns"].values()), + False, + )[0] + sql_not_partitioned = client._get_table_update_sql( + "not_partitioned_table", + list(pipeline.default_schema.tables["not_partitioned_table"]["columns"].values()), + False, + )[0] + + # Partition clause is 
generated with original order + expected_clause = ( + "PARTITIONED BY (`category`, month(`created_at`), bucket(10, `product_id`), truncate(2," + " `name`))" + ) + assert expected_clause in sql_partitioned + + # No partition clause otherwise + assert "PARTITIONED BY" not in sql_not_partitioned diff --git a/tests/load/athena_iceberg/test_athena_iceberg.py b/tests/load/athena_iceberg/test_athena_iceberg.py index dbcdc5c23e..d3bb9eb5f5 100644 --- a/tests/load/athena_iceberg/test_athena_iceberg.py +++ b/tests/load/athena_iceberg/test_athena_iceberg.py @@ -11,14 +11,11 @@ from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration -from tests.utils import skip_if_not_active from dlt.destinations.exceptions import DatabaseTerminalException # mark all tests as essential, do not remove pytestmark = pytest.mark.essential -skip_if_not_active("athena") - def test_iceberg() -> None: """ diff --git a/tests/load/filesystem/test_azure_credentials.py b/tests/load/filesystem/test_azure_credentials.py index 467ba55a4f..4ee2ec46db 100644 --- a/tests/load/filesystem/test_azure_credentials.py +++ b/tests/load/filesystem/test_azure_credentials.py @@ -1,15 +1,24 @@ -from typing import Dict +from typing import Dict, Optional from urllib.parse import parse_qs +from uuid import uuid4 import pytest +import dlt from dlt.common import pendulum from dlt.common.time import ensure_pendulum_datetime from dlt.common.configuration import resolve_configuration, ConfigFieldMissingException -from dlt.common.configuration.specs import AzureCredentials -from tests.load.utils import ALL_FILESYSTEM_DRIVERS +from dlt.common.configuration.specs import ( + AzureCredentials, + AzureServicePrincipalCredentials, + AzureServicePrincipalCredentialsWithoutDefaults, + AzureCredentialsWithoutDefaults, +) +from dlt.common.storages.configuration import FilesystemConfiguration +from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AZ_BUCKET from tests.common.configuration.utils import environment from tests.utils import preserve_environ, autouse_test_storage +from dlt.common.storages.fsspec_filesystem import fsspec_from_config # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -18,6 +27,27 @@ pytest.skip("az filesystem driver not configured", allow_module_level=True) +@pytest.fixture +def az_service_principal_config() -> Optional[FilesystemConfiguration]: + """FS config with alternate azure credentials format if available in environment + + Working credentials of this type may be created as an app in Entra, which has + R/W/E access to the bucket (via ACL of particular container) + + """ + credentials = AzureServicePrincipalCredentialsWithoutDefaults( + azure_tenant_id=dlt.config.get("tests.az_sp_tenant_id", str), + azure_client_id=dlt.config.get("tests.az_sp_client_id", str), + azure_client_secret=dlt.config.get("tests.az_sp_client_secret", str), # type: ignore[arg-type] + azure_storage_account_name=dlt.config.get("tests.az_sp_storage_account_name", str), + ) + # + credentials = resolve_configuration(credentials, sections=("destination", "fsazureprincipal")) + cfg = FilesystemConfiguration(bucket_url=AZ_BUCKET, credentials=credentials) + + return resolve_configuration(cfg) + + def test_azure_credentials_from_account_key(environment: Dict[str, str]) -> None: environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY"] = "QWERTYUIOPASDFGHJKLZXCVBNM1234567890" @@ -95,3 +125,77 @@ def 
test_azure_credentials_from_default(environment: Dict[str, str]) -> None: "sas_token": None, "anon": False, } + + +def test_azure_service_principal_credentials(environment: Dict[str, str]) -> None: + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" + environment["CREDENTIALS__AZURE_CLIENT_ID"] = "fake_client_id" + environment["CREDENTIALS__AZURE_CLIENT_SECRET"] = "fake_client_secret" + environment["CREDENTIALS__AZURE_TENANT_ID"] = "fake_tenant_id" + + config = resolve_configuration(AzureServicePrincipalCredentials()) + + assert config.azure_client_id == environment["CREDENTIALS__AZURE_CLIENT_ID"] + assert config.azure_client_secret == environment["CREDENTIALS__AZURE_CLIENT_SECRET"] + assert config.azure_tenant_id == environment["CREDENTIALS__AZURE_TENANT_ID"] + + assert config.to_adlfs_credentials() == { + "account_name": environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"], + "client_id": environment["CREDENTIALS__AZURE_CLIENT_ID"], + "client_secret": environment["CREDENTIALS__AZURE_CLIENT_SECRET"], + "tenant_id": environment["CREDENTIALS__AZURE_TENANT_ID"], + } + + +def test_azure_filesystem_configuration_service_principal(environment: Dict[str, str]) -> None: + """Filesystem config resolves correct credentials type""" + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" + environment["CREDENTIALS__AZURE_CLIENT_ID"] = "fake_client_id" + environment["CREDENTIALS__AZURE_CLIENT_SECRET"] = "asdsadas" + environment["CREDENTIALS__AZURE_TENANT_ID"] = str(uuid4()) + + config = FilesystemConfiguration(bucket_url="az://my-bucket") + + resolved_config = resolve_configuration(config) + + assert isinstance(resolved_config.credentials, AzureServicePrincipalCredentialsWithoutDefaults) + + fs, bucket = fsspec_from_config(resolved_config) + + assert fs.tenant_id == environment["CREDENTIALS__AZURE_TENANT_ID"] + assert fs.client_id == environment["CREDENTIALS__AZURE_CLIENT_ID"] + assert fs.client_secret == environment["CREDENTIALS__AZURE_CLIENT_SECRET"] + + +def test_azure_filesystem_configuration_sas_token(environment: Dict[str, str]) -> None: + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" + environment["CREDENTIALS__AZURE_STORAGE_SAS_TOKEN"] = ( + "sp=rwdlacx&se=2021-01-01T00:00:00Z&sv=2019-12-12&sr=c&sig=1234567890" + ) + + config = FilesystemConfiguration(bucket_url="az://my-bucket") + + resolved_config = resolve_configuration(config) + + assert isinstance(resolved_config.credentials, AzureCredentialsWithoutDefaults) + + fs, bucket = fsspec_from_config(resolved_config) + + assert fs.sas_token == "?" 
+ environment["CREDENTIALS__AZURE_STORAGE_SAS_TOKEN"] + assert fs.account_name == environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] + + +def test_azure_service_principal_fs_operations( + az_service_principal_config: Optional[FilesystemConfiguration], +) -> None: + """Test connecting to azure filesystem with service principal credentials""" + config = az_service_principal_config + fs, bucket = fsspec_from_config(config) + + fn = uuid4().hex + # Try some file ops to see if the credentials work + fs.touch(f"{bucket}/{fn}/{fn}") + files = fs.ls(f"{bucket}/{fn}") + assert f"{bucket}/{fn}/{fn}" in files + fs.delete(f"{bucket}/{fn}/{fn}") + fs.rmdir(f"{bucket}/{fn}") diff --git a/tests/load/filesystem/test_filesystem_client.py b/tests/load/filesystem/test_filesystem_client.py index ca962adb16..4519f1ea83 100644 --- a/tests/load/filesystem/test_filesystem_client.py +++ b/tests/load/filesystem/test_filesystem_client.py @@ -1,6 +1,7 @@ import posixpath import os from unittest import mock +from pathlib import Path import pytest @@ -117,16 +118,18 @@ def test_replace_write_disposition(layout: str, default_buckets_env: str) -> Non client, _, root_path, load_id1 = load_info layout = client.config.layout # this path will be kept after replace - job_2_load_1_path = posixpath.join( - root_path, - create_path( - layout, - NORMALIZED_FILES[1], - client.schema.name, - load_id1, - load_package_timestamp=timestamp, - extra_placeholders=client.config.extra_placeholders, - ), + job_2_load_1_path = Path( + posixpath.join( + root_path, + create_path( + layout, + NORMALIZED_FILES[1], + client.schema.name, + load_id1, + load_package_timestamp=timestamp, + extra_placeholders=client.config.extra_placeholders, + ), + ) ) with perform_load( @@ -135,16 +138,18 @@ def test_replace_write_disposition(layout: str, default_buckets_env: str) -> Non client, _, root_path, load_id2 = load_info # this one we expect to be replaced with - job_1_load_2_path = posixpath.join( - root_path, - create_path( - layout, - NORMALIZED_FILES[0], - client.schema.name, - load_id2, - load_package_timestamp=timestamp, - extra_placeholders=client.config.extra_placeholders, - ), + job_1_load_2_path = Path( + posixpath.join( + root_path, + create_path( + layout, + NORMALIZED_FILES[0], + client.schema.name, + load_id2, + load_package_timestamp=timestamp, + extra_placeholders=client.config.extra_placeholders, + ), + ) ) # First file from load1 remains, second file is replaced by load2 @@ -159,7 +164,7 @@ def test_replace_write_disposition(layout: str, default_buckets_env: str) -> Non for f in files: if f == INIT_FILE_NAME: continue - paths.append(posixpath.join(basedir, f)) + paths.append(Path(posixpath.join(basedir, f))) ls = set(paths) assert ls == {job_2_load_1_path, job_1_load_2_path} @@ -210,7 +215,7 @@ def test_append_write_disposition(layout: str, default_buckets_env: str) -> None ) for job in jobs2 ] - expected_files = sorted([posixpath.join(root_path, fn) for fn in expected_files]) + expected_files = sorted([Path(posixpath.join(root_path, fn)) for fn in expected_files]) # type: ignore[misc] paths = [] for basedir, _dirs, files in client.fs_client.walk( @@ -222,5 +227,5 @@ def test_append_write_disposition(layout: str, default_buckets_env: str) -> None for f in files: if f == INIT_FILE_NAME: continue - paths.append(posixpath.join(basedir, f)) + paths.append(Path(posixpath.join(basedir, f))) assert list(sorted(paths)) == expected_files diff --git a/tests/load/filesystem/test_filesystem_common.py 
b/tests/load/filesystem/test_filesystem_common.py index 3677765c9f..c069f88a15 100644 --- a/tests/load/filesystem/test_filesystem_common.py +++ b/tests/load/filesystem/test_filesystem_common.py @@ -12,10 +12,7 @@ from dlt.common import json, pendulum from dlt.common.configuration import resolve from dlt.common.configuration.inject import with_config -from dlt.common.configuration.specs import ( - AzureCredentials, - AzureCredentialsWithoutDefaults, -) +from dlt.common.configuration.specs import AnyAzureCredentials from dlt.common.storages import fsspec_from_config, FilesystemConfiguration from dlt.common.storages.fsspec_filesystem import MTIME_DISPATCH, glob_files from dlt.common.utils import custom_environ, uniq_id @@ -43,10 +40,7 @@ def test_filesystem_configuration() -> None: config = FilesystemConfiguration(bucket_url="az://root") assert config.protocol == "az" # print(config.resolve_credentials_type()) - assert ( - config.resolve_credentials_type() - == Union[AzureCredentialsWithoutDefaults, AzureCredentials] - ) + assert config.resolve_credentials_type() == AnyAzureCredentials assert dict(config) == { "read_only": False, "bucket_url": "az://root", diff --git a/tests/load/pipeline/test_athena.py b/tests/load/pipeline/test_athena.py index 8c034a066b..a5bb6efc0d 100644 --- a/tests/load/pipeline/test_athena.py +++ b/tests/load/pipeline/test_athena.py @@ -9,6 +9,8 @@ from tests.pipeline.utils import assert_load_info, load_table_counts from tests.pipeline.utils import load_table_counts from dlt.destinations.exceptions import CantExtractTablePrefix +from dlt.destinations.impl.athena.athena_adapter import athena_partition, athena_adapter +from dlt.destinations.fs_client import FSClientBase from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration from tests.load.utils import ( @@ -231,3 +233,69 @@ def test_athena_file_layouts(destination_config: DestinationTestConfiguration, l pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] ) assert table_counts == {"items1": 3, "items2": 7} + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["athena"], force_iceberg=True), + ids=lambda x: x.name, +) +def test_athena_partitioned_iceberg_table(destination_config: DestinationTestConfiguration): + """Load an iceberg table with partition hints and verifiy partitions are created correctly.""" + pipeline = destination_config.setup_pipeline("athena_" + uniq_id(), full_refresh=True) + + data_items = [ + (1, "A", datetime.date.fromisoformat("2021-01-01")), + (2, "A", datetime.date.fromisoformat("2021-01-02")), + (3, "A", datetime.date.fromisoformat("2021-01-03")), + (4, "A", datetime.date.fromisoformat("2021-02-01")), + (5, "A", datetime.date.fromisoformat("2021-02-02")), + (6, "B", datetime.date.fromisoformat("2021-01-01")), + (7, "B", datetime.date.fromisoformat("2021-01-02")), + (8, "B", datetime.date.fromisoformat("2021-01-03")), + (9, "B", datetime.date.fromisoformat("2021-02-01")), + (10, "B", datetime.date.fromisoformat("2021-03-02")), + ] + + @dlt.resource(table_format="iceberg") + def partitioned_table(): + yield [{"id": i, "category": c, "created_at": d} for i, c, d in data_items] + + athena_adapter( + partitioned_table, + partition=[ + "category", + athena_partition.month("created_at"), + ], + ) + + info = pipeline.run(partitioned_table) + assert_load_info(info) + + # Get partitions from metadata + with pipeline.sql_client() as sql_client: + tbl_name = 
sql_client.make_qualified_table_name("partitioned_table$partitions") + rows = sql_client.execute_sql(f"SELECT partition FROM {tbl_name}") + partition_keys = {r[0] for r in rows} + + data_rows = sql_client.execute_sql( + "SELECT id, category, created_at FROM" + f" {sql_client.make_qualified_table_name('partitioned_table')}" + ) + # data_rows = [(i, c, d.toisoformat()) for i, c, d in data_rows] + + # All data is in table + assert len(data_rows) == len(data_items) + assert set(data_rows) == set(data_items) + + # Compare with expected partitions + # Months are number of months since epoch + expected_partitions = { + "{category=A, created_at_month=612}", + "{category=A, created_at_month=613}", + "{category=B, created_at_month=612}", + "{category=B, created_at_month=613}", + "{category=B, created_at_month=614}", + } + + assert partition_keys == expected_partitions diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 7680bc6e90..5f24daf57f 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -301,7 +301,7 @@ def count(*args, **kwargs) -> Any: for file in files: if ".jsonl" in file: - expected_files.add(posixpath.join(basedir, file)) + expected_files.add(Path(posixpath.join(basedir, file))) for load_package in load_info.load_packages: for load_info in load_package.jobs["completed_jobs"]: # type: ignore[assignment] @@ -321,7 +321,7 @@ def count(*args, **kwargs) -> Any: full_path = posixpath.join(client.dataset_path, path) # type: ignore[attr-defined] assert client.fs_client.exists(full_path) # type: ignore[attr-defined] if ".jsonl" in full_path: - known_files.add(full_path) + known_files.add(Path(full_path)) assert expected_files == known_files assert known_files diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index a498b570a0..d98f335d16 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -10,6 +10,7 @@ from dlt.common.pipeline import SupportsPipeline from dlt.common.destination import Destination from dlt.common.destination.exceptions import DestinationHasFailedJobs +from dlt.common.destination.reference import WithStagingDataset from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.schema.schema import Schema from dlt.common.schema.typing import VERSION_TABLE_NAME @@ -896,6 +897,7 @@ def test_pipeline_upfront_tables_two_loads( # use staging tables for replace os.environ["DESTINATION__REPLACE_STRATEGY"] = replace_strategy + os.environ["TRUNCATE_STAGING_DATASET"] = "True" pipeline = destination_config.setup_pipeline( "test_pipeline_upfront_tables_two_loads", @@ -1001,6 +1003,21 @@ def table_3(make_data=False): is True ) + job_client, _ = pipeline._get_destination_clients(schema) + + if destination_config.staging and isinstance(job_client, WithStagingDataset): + for i in range(1, 4): + with pipeline.sql_client() as client: + table_name = f"table_{i}" + + if job_client.should_load_data_to_staging_dataset( + job_client.schema.tables[table_name] + ): + with client.with_staging_dataset(staging=True): + tab_name = client.make_qualified_table_name(table_name) + with client.execute_query(f"SELECT * FROM {tab_name}") as cur: + assert len(cur.fetchall()) == 0 + # @pytest.mark.skip(reason="Finalize the test: compare some_data values to values from database") # @pytest.mark.parametrize( diff --git a/tests/load/utils.py b/tests/load/utils.py index 
81107e83d9..e6b860c723 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -180,6 +180,7 @@ def destinations_configs( file_format: Union[TLoaderFileFormat, Sequence[TLoaderFileFormat]] = None, supports_merge: Optional[bool] = None, supports_dbt: Optional[bool] = None, + force_iceberg: Optional[bool] = None, ) -> List[DestinationTestConfiguration]: # sanity check for item in subset: @@ -495,6 +496,11 @@ def destinations_configs( conf for conf in destination_configs if conf.name not in EXCLUDED_DESTINATION_CONFIGURATIONS ] + if force_iceberg is not None: + destination_configs = [ + conf for conf in destination_configs if conf.force_iceberg is force_iceberg + ] + return destination_configs @@ -574,8 +580,8 @@ def yield_client( destination = Destination.from_reference(destination_type) # create initial config dest_config: DestinationClientDwhConfiguration = None - dest_config = destination.spec() # type: ignore[assignment] - dest_config.dataset_name = dataset_name # type: ignore[misc] + dest_config = destination.spec() # type: ignore + dest_config.dataset_name = dataset_name if default_config_values is not None: # apply the values to credentials, if dict is provided it will be used as default @@ -597,7 +603,7 @@ def yield_client( staging_config = DestinationClientStagingConfiguration( bucket_url=AWS_BUCKET, )._bind_dataset_name(dataset_name=dest_config.dataset_name) - staging_config.destination_type = "filesystem" # type: ignore[misc] + staging_config.destination_type = "filesystem" staging_config.resolve() dest_config.staging_config = staging_config # type: ignore[attr-defined] diff --git a/tests/load/weaviate/test_weaviate_client.py b/tests/load/weaviate/test_weaviate_client.py index 11d3f13db9..8c3344f152 100644 --- a/tests/load/weaviate/test_weaviate_client.py +++ b/tests/load/weaviate/test_weaviate_client.py @@ -37,10 +37,10 @@ def drop_weaviate_schema() -> Iterator[None]: def get_client_instance(schema: Schema) -> WeaviateClient: - dest = weaviate(dataset_name="ClientTest" + uniq_id()) - return dest.client(schema, dest.spec()) - # with Container().injectable_context(ConfigSectionContext(sections=('destination', 'weaviate'))): - # return dest.client(schema, config) + dest = weaviate() + return dest.client( + schema, dest.spec()._bind_dataset_name(dataset_name="ClientTest" + uniq_id()) + ) @pytest.fixture(scope="function") diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index a828de40fd..1c4383405b 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -5,9 +5,9 @@ import logging import os import random +import threading from time import sleep from typing import Any, Tuple, cast -import threading from tenacity import retry_if_exception, Retrying, stop_after_attempt import pytest @@ -2230,3 +2230,33 @@ def stateful_resource(): assert len(fs_client.list_table_files("_dlt_loads")) == 2 assert len(fs_client.list_table_files("_dlt_version")) == 1 assert len(fs_client.list_table_files("_dlt_pipeline_state")) == 1 + + +@pytest.mark.parametrize("truncate", (True, False)) +def test_staging_dataset_truncate(truncate) -> None: + dlt.config["truncate_staging_dataset"] = truncate + + @dlt.resource(write_disposition="merge", merge_key="id") + def test_data(): + yield [{"field": 1, "id": 1}, {"field": 2, "id": 2}, {"field": 3, "id": 3}] + + pipeline = dlt.pipeline( + pipeline_name="test_staging_cleared", + destination="duckdb", + full_refresh=True, + ) + + info = pipeline.run(test_data, table_name="staging_cleared") + 
assert_load_info(info) + + with pipeline.sql_client() as client: + with client.execute_query( + f"SELECT * FROM {pipeline.dataset_name}_staging.staging_cleared" + ) as cur: + if truncate: + assert len(cur.fetchall()) == 0 + else: + assert len(cur.fetchall()) == 3 + + with client.execute_query(f"SELECT * FROM {pipeline.dataset_name}.staging_cleared") as cur: + assert len(cur.fetchall()) == 3 diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index 50defa8edb..79a57d0e82 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -1,8 +1,11 @@ import os import pytest from typing import Any, cast +from dlt.common import logger +from requests import PreparedRequest, Request, Response +from requests.auth import AuthBase from dlt.common.typing import TSecretStrValue -from dlt.sources.helpers.requests import Response, Request +from dlt.sources.helpers.requests import Client from dlt.sources.helpers.rest_client import RESTClient from dlt.sources.helpers.rest_client.client import Hooks from dlt.sources.helpers.rest_client.paginators import JSONResponsePaginator @@ -33,6 +36,7 @@ def rest_client() -> RESTClient: return RESTClient( base_url="https://api.example.com", headers={"Accept": "application/json"}, + session=Client().session, ) @@ -57,7 +61,6 @@ def test_page_context(self, rest_client: RESTClient) -> None: for page in rest_client.paginate( "/posts", paginator=JSONResponsePaginator(next_url_path="next_page"), - auth=AuthConfigBase(), ): # response that produced data assert isinstance(page.response, Response) @@ -167,6 +170,7 @@ def test_oauth_jwt_auth_success(self, rest_client: RESTClient): auth_endpoint="https://api.example.com/oauth/token", scopes=["read", "write"], headers={"Content-Type": "application/json"}, + session=Client().session, ) response = rest_client.get( @@ -183,3 +187,57 @@ def test_oauth_jwt_auth_success(self, rest_client: RESTClient): ) assert_pagination(list(pages_iter)) + + def test_custom_session_client(self, mocker): + mocked_warning = mocker.patch.object(logger, "warning") + RESTClient( + base_url="https://api.example.com", + headers={"Accept": "application/json"}, + session=Client(raise_for_status=True).session, + ) + assert ( + mocked_warning.call_args[0][0] + == "The session provided has raise_for_status enabled. This may cause unexpected" + " behavior." 
+        )
+
+    def test_custom_auth_success(self, rest_client: RESTClient):
+        class CustomAuthConfigBase(AuthConfigBase):
+            def __init__(self, token: str):
+                self.token = token
+
+            def __call__(self, request: PreparedRequest) -> PreparedRequest:
+                request.headers["Authorization"] = f"Bearer {self.token}"
+                return request
+
+        class CustomAuthAuthBase(AuthBase):
+            def __init__(self, token: str):
+                self.token = token
+
+            def __call__(self, request: PreparedRequest) -> PreparedRequest:
+                request.headers["Authorization"] = f"Bearer {self.token}"
+                return request
+
+        auth_list = [
+            CustomAuthConfigBase("test-token"),
+            CustomAuthAuthBase("test-token"),
+        ]
+
+        for auth in auth_list:
+            response = rest_client.get(
+                "/protected/posts/bearer-token",
+                auth=auth,
+            )
+
+            assert response.status_code == 200
+            assert response.json()["data"][0] == {"id": 0, "title": "Post 0"}
+
+            pages_iter = rest_client.paginate(
+                "/protected/posts/bearer-token",
+                auth=auth,
+            )
+
+            pages_list = list(pages_iter)
+            assert_pagination(pages_list)
+
+            assert pages_list[0].response.request.headers["Authorization"] == "Bearer test-token"
diff --git a/tests/sources/helpers/rest_client/test_detector.py b/tests/sources/helpers/rest_client/test_detector.py
index f01f9409a1..6511b472fb 100644
--- a/tests/sources/helpers/rest_client/test_detector.py
+++ b/tests/sources/helpers/rest_client/test_detector.py
@@ -406,16 +406,20 @@ def test_find_paginator(test_case) -> None:
     [
         "/users/{user_id}",
         "/api/v1/products/{product_id}/",
-        # those are not valid paths
-        # "/api/v1/products/{product_id}//",
-        # "/api/v1/products/{product_id}?param1=value1",
-        # "/api/v1/products/{product_id}#section",
-        # "/api/v1/products/{product_id}/#section",
+        "/api/v1/products/{product_id}//",
+        "/api/v1/products/{product_id}?param1=value1",
+        "/api/v1/products/{product_id}#section",
+        "/api/v1/products/{product_id}.json",
+        "/api/v1/products/{product_id}.json/",
+        "/api/v1/products/{product_id}_data",
+        "/api/v1/products/{product_id}_data?param=true",
         "/users/{user_id}/posts/{post_id}",
         "/users/{user_id}/posts/{post_id}/comments/{comment_id}",
         "{entity}",
         "/{entity}",
         "/{user_123}",
+        "/users/{user-id}",
+        "/users/{123}",
     ],
 )
 def test_single_entity_path_valid(path):
@@ -430,8 +434,7 @@ def test_single_entity_path_valid(path):
         "/users/{user_id}/details",
         "/",
         "/{}",
-        "/users/{123}",
-        "/users/{user-id}",
+        "/api/v1/products/{product_id}/#section",
         "/users/{user id}",
         "/users/{user_id}/{",  # Invalid ending
     ],
diff --git a/tests/sources/helpers/test_requests.py b/tests/sources/helpers/test_requests.py
index aefdf23e77..70776a50ee 100644
--- a/tests/sources/helpers/test_requests.py
+++ b/tests/sources/helpers/test_requests.py
@@ -1,4 +1,4 @@
-from typing import Iterator, Type
+from typing import Any, Dict, Iterator, List, Type
 from unittest import mock
 import os
 import random
@@ -29,7 +29,7 @@ def mock_sleep() -> Iterator[mock.MagicMock]:
 
 
 def test_default_session_retry_settings() -> None:
-    retry: Retrying = Client().session.request.retry  # type: ignore
+    retry: Retrying = Client().session.send.retry  # type: ignore
     assert retry.stop.max_attempt_number == 5  # type: ignore
     assert isinstance(retry.retry, retry_any)
     retries = retry.retry.retries
@@ -51,7 +51,7 @@ def custom_retry_cond(response, exception):
         respect_retry_after_header=False,
     ).session
 
-    retry: Retrying = session.request.retry  # type: ignore
+    retry: Retrying = session.send.retry  # type: ignore
     assert retry.stop.max_attempt_number == 14  # type: ignore
     assert isinstance(retry.retry, retry_any)
     retries = retry.retry.retries
@@ -63,11 +63,12 @@ def custom_retry_cond(response, exception):
 def test_retry_on_status_all_fails(mock_sleep: mock.MagicMock) -> None:
     session = Client().session
     url = "https://example.com/data"
+    m = requests_mock.Adapter()
+    session.mount("https://", m)
+    m.register_uri("GET", url, status_code=503)
 
-    with requests_mock.mock(session=session) as m:
-        m.get(url, status_code=503)
-        with pytest.raises(requests.HTTPError):
-            session.get(url)
+    with pytest.raises(requests.HTTPError):
+        session.get(url)
 
     assert m.call_count == RunConfiguration.request_max_attempts
 
@@ -76,6 +77,8 @@ def test_retry_on_status_success_after_2(mock_sleep: mock.MagicMock) -> None:
     """Test successful request after 2 retries"""
     session = Client().session
     url = "https://example.com/data"
+    m = requests_mock.Adapter()
+    session.mount("https://", m)
 
     responses = [
         dict(text="error", status_code=503),
@@ -83,9 +86,8 @@ def test_retry_on_status_success_after_2(mock_sleep: mock.MagicMock) -> None:
         dict(text="error", status_code=200),
     ]
 
-    with requests_mock.mock(session=session) as m:
-        m.get(url, responses)
-        resp = session.get(url)
+    m.register_uri("GET", url, responses)
+    resp = session.get(url)
 
     assert resp.status_code == 200
     assert m.call_count == 3
@@ -94,11 +96,12 @@ def test_retry_on_status_success_after_2(mock_sleep: mock.MagicMock) -> None:
 def test_retry_on_status_without_raise_for_status(mock_sleep: mock.MagicMock) -> None:
     url = "https://example.com/data"
     session = Client(raise_for_status=False).session
+    m = requests_mock.Adapter()
+    session.mount("https://", m)
 
-    with requests_mock.mock(session=session) as m:
-        m.get(url, status_code=503)
-        response = session.get(url)
-        assert response.status_code == 503
+    m.register_uri("GET", url, status_code=503)
+    response = session.get(url)
+    assert response.status_code == 503
 
     assert m.call_count == RunConfiguration.request_max_attempts
 
@@ -106,18 +109,19 @@ def test_retry_on_status_without_raise_for_status(mock_sleep: mock.MagicMock) ->
 def test_hooks_with_raise_for_statue() -> None:
     url = "https://example.com/data"
     session = Client(raise_for_status=True).session
+    m = requests_mock.Adapter()
+    session.mount("https://", m)
 
     def _no_content(resp: requests.Response, *args, **kwargs) -> requests.Response:
         resp.status_code = 204
         resp._content = b"[]"
         return resp
 
-    with requests_mock.mock(session=session) as m:
-        m.get(url, status_code=503)
-        response = session.get(url, hooks={"response": _no_content})
-        # we simulate empty response
-        assert response.status_code == 204
-        assert response.json() == []
+    m.register_uri("GET", url, status_code=503)
+    response = session.get(url, hooks={"response": _no_content})
+    # we simulate empty response
+    assert response.status_code == 204
+    assert response.json() == []
 
     assert m.call_count == 1
 
@@ -130,12 +134,13 @@ def test_retry_on_exception_all_fails(
     exception_class: Type[Exception], mock_sleep: mock.MagicMock
 ) -> None:
     session = Client().session
+    m = requests_mock.Adapter()
+    session.mount("https://", m)
     url = "https://example.com/data"
 
-    with requests_mock.mock(session=session) as m:
-        m.get(url, exc=exception_class)
-        with pytest.raises(exception_class):
-            session.get(url)
+    m.register_uri("GET", url, exc=exception_class)
+    with pytest.raises(exception_class):
+        session.get(url)
 
     assert m.call_count == RunConfiguration.request_max_attempts
 
@@ -145,12 +150,13 @@ def retry_on(response: requests.Response, exception: BaseException) -> bool:
         return response.text == "error"
 
     session = Client(retry_condition=retry_on).session
+    m = requests_mock.Adapter()
+    session.mount("https://", m)
     url = "https://example.com/data"
 
-    with requests_mock.mock(session=session) as m:
-        m.get(url, text="error")
-        response = session.get(url)
-        assert response.content == b"error"
+    m.register_uri("GET", url, text="error")
+    response = session.get(url)
+    assert response.content == b"error"
 
     assert m.call_count == RunConfiguration.request_max_attempts
 
@@ -160,12 +166,12 @@ def retry_on(response: requests.Response, exception: BaseException) -> bool:
         return response.text == "error"
 
     session = Client(retry_condition=retry_on).session
+    m = requests_mock.Adapter()
+    session.mount("https://", m)
     url = "https://example.com/data"
-    responses = [dict(text="error"), dict(text="error"), dict(text="success")]
 
-    with requests_mock.mock(session=session) as m:
-        m.get(url, responses)
-        resp = session.get(url)
+    m.register_uri("GET", url, [dict(text="error"), dict(text="error"), dict(text="success")])
+    resp = session.get(url)
 
     assert resp.text == "success"
     assert m.call_count == 3
@@ -174,14 +180,16 @@ def retry_on(response: requests.Response, exception: BaseException) -> bool:
 def test_wait_retry_after_int(mock_sleep: mock.MagicMock) -> None:
     session = Client(request_backoff_factor=0).session
     url = "https://example.com/data"
-    responses = [
+    m = requests_mock.Adapter()
+    session.mount("https://", m)
+    m.register_uri("GET", url, text="error")
+    responses: List[Dict[str, Any]] = [
         dict(text="error", headers={"retry-after": "4"}, status_code=429),
         dict(text="success"),
     ]
 
-    with requests_mock.mock(session=session) as m:
-        m.get(url, responses)
-        session.get(url)
+    m.register_uri("GET", url, responses)
+    session.get(url)
 
     mock_sleep.assert_called_once()
     assert 4 <= mock_sleep.call_args[0][0] <= 5  # Adds jitter up to 1s
@@ -206,7 +214,7 @@ def test_init_default_client(existing_session: bool) -> None:
 
     session = default_client.session
     assert session.timeout == cfg["RUNTIME__REQUEST_TIMEOUT"]
-    retry = session.request.retry  # type: ignore[attr-defined]
+    retry = session.send.retry  # type: ignore[attr-defined]
     assert retry.wait.multiplier == cfg["RUNTIME__REQUEST_BACKOFF_FACTOR"]
     assert retry.stop.max_attempt_number == cfg["RUNTIME__REQUEST_MAX_ATTEMPTS"]
    assert retry.wait.max == cfg["RUNTIME__REQUEST_MAX_RETRY_DELAY"]
@@ -226,7 +234,7 @@ def test_client_instance_with_config(existing_session: bool) -> None:
 
     session = client.session
     assert session.timeout == cfg["RUNTIME__REQUEST_TIMEOUT"]
-    retry = session.request.retry  # type: ignore[attr-defined]
+    retry = session.send.retry  # type: ignore[attr-defined]
     assert retry.wait.multiplier == cfg["RUNTIME__REQUEST_BACKOFF_FACTOR"]
     assert retry.stop.max_attempt_number == cfg["RUNTIME__REQUEST_MAX_ATTEMPTS"]
     assert retry.wait.max == cfg["RUNTIME__REQUEST_MAX_RETRY_DELAY"]
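
Note for reviewers (not part of the patch itself): the migrated `test_requests.py` tests all use the same `requests_mock` pattern of mounting an `Adapter` on the session and registering responses on it, instead of the previous `requests_mock.mock(session=session)` context manager. The sketch below shows that pattern in isolation; the plain `requests.Session()` and the `https://example.com/data` URL are placeholders for illustration only, whereas the tests above use `Client().session`.

```python
import requests
import requests_mock

# Placeholder session for illustration; the dlt tests above use Client().session.
session = requests.Session()

# Mount a mock transport adapter on the session for every https:// URL.
adapter = requests_mock.Adapter()
session.mount("https://", adapter)

# Register canned responses: a response list yields one entry per request, in order.
url = "https://example.com/data"
adapter.register_uri(
    "GET",
    url,
    [dict(text="error", status_code=503), dict(text="ok", status_code=200)],
)

first = session.get(url)  # served from the first registered response
second = session.get(url)  # served from the second registered response
assert first.status_code == 503
assert second.status_code == 200
assert adapter.call_count == 2  # the adapter records every request it served
```

Because the mounted adapter sits at the transport layer, anything wrapped around `Session.send` still runs; that is presumably why these tests can keep asserting on `session.send.retry` while responses are mocked.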