-
Notifications
You must be signed in to change notification settings - Fork 184
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support partitioning hints for athena iceberg
- Loading branch information
Showing
7 changed files
with
301 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
from typing import Any, Optional, Dict, Protocol, Sequence, Union, Final | ||
|
||
from dateutil import parser | ||
|
||
from dlt.common.pendulum import timezone | ||
from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns, TColumnSchema | ||
from dlt.destinations.utils import ensure_resource | ||
from dlt.extract import DltResource | ||
from dlt.extract.items import TTableHintTemplate | ||
|
||
|
||
# Custom table-hint key under which the list of partition transform strings is
# stored on the resource; the Athena client reads it back when it renders the
# iceberg `PARTITIONED BY (...)` clause. The `x-` prefix marks it as a
# destination-specific extension hint.
PARTITION_HINT: Final[str] = "x-athena-partition"
|
||
|
||
class athena_partition:
    """Builders for Athena iceberg partition transform strings.

    Each method returns the textual transform expression expected by the
    `PARTITIONED BY` clause, e.g. ``athena_partition.bucket(16, "id")``
    yields ``bucket(16, id)``.
    """

    @staticmethod
    def year(column_name: str) -> str:
        """Partition on the year component of a date/timestamp column."""
        return "year({})".format(column_name)

    @staticmethod
    def month(column_name: str) -> str:
        """Partition on the month component of a date/timestamp column."""
        return "month({})".format(column_name)

    @staticmethod
    def day(column_name: str) -> str:
        """Partition on the day component of a date/timestamp column."""
        return "day({})".format(column_name)

    @staticmethod
    def hour(column_name: str) -> str:
        """Partition on the hour component of a date/timestamp column."""
        return "hour({})".format(column_name)

    @staticmethod
    def bucket(n: int, column_name: str) -> str:
        """Partition by hashing the column value into ``n`` buckets."""
        return "bucket({}, {})".format(n, column_name)

    @staticmethod
    def truncate(length: int, column_name: str) -> str:
        """Partition by the column value truncated to ``length``."""
        return "truncate({}, {})".format(length, column_name)
|
||
|
||
def athena_adapter(
    data: Any,
    partition: Optional[Union[str, Sequence[str]]] = None,
) -> DltResource:
    """
    Prepares data for loading into Athena

    Args:
        data: The data to be transformed.
            This can be raw data or an instance of DltResource.
            If raw data is provided, the function will wrap it into a `DltResource` object.
        partition: Column name(s) or partition transform string(s) to partition the table by.
            A single string is treated as a one-element list.
    Returns:
        A `DltResource` object that is ready to be loaded into Athena.
    Raises:
        ValueError: If any hint is invalid or none are specified.
    Examples:
        >>> data = [{"name": "Marcel", "department": "Engineering", "date_hired": "2024-01-30"}]
        >>> athena_adapter(data, partition=["department", athena_partition.year("date_hired"), athena_partition.bucket(8, "name")])
        [DltResource with hints applied]
    """
    resource = ensure_resource(data)

    # Fail fast: `partition` is currently the only hint this adapter supports,
    # so calling it without one is always a mistake.
    if not partition:
        raise ValueError("A value for `partition` must be specified.")

    if isinstance(partition, str):
        partition = [partition]

    # Note: PARTITIONED BY clause identifiers are not allowed to be quoted. They are added as-is.
    additional_table_hints: Dict[str, TTableHintTemplate[Any]] = {
        PARTITION_HINT: list(partition)
    }
    resource.apply_hints(additional_table_hints=additional_table_hints)
    return resource
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
import pytest | ||
|
||
import dlt | ||
from dlt.destinations import filesystem | ||
from dlt.destinations.impl.athena.athena_adapter import athena_adapter, athena_partition | ||
from tests.load.utils import destinations_configs, DestinationTestConfiguration | ||
|
||
|
||
def test_iceberg_partition_hints():
    """Create a table with athena partition hints and check that the SQL is generated correctly."""

    @dlt.resource(table_format="iceberg")
    def partitioned_table():
        yield {
            "product_id": 1,
            "name": "product 1",
            "created_at": "2021-01-01T00:00:00Z",
            "category": "category 1",
            "price": 100.0,
            "quantity": 10,
        }

    @dlt.resource(table_format="iceberg")
    def not_partitioned_table():
        yield {"a": 1, "b": 2}

    # Mix a plain column name with each supported transform helper.
    hints = [
        "category",
        athena_partition.month("created_at"),
        athena_partition.bucket(10, "product_id"),
        athena_partition.truncate(2, "name"),
    ]
    athena_adapter(partitioned_table, partition=hints)

    pipeline = dlt.pipeline(
        "athena_test",
        destination="athena",
        staging=filesystem("s3://not-a-real-bucket"),
        full_refresh=True,
    )

    pipeline.extract([partitioned_table, not_partitioned_table])
    pipeline.normalize()

    schema = pipeline.default_schema
    with pipeline._sql_job_client(schema) as client:

        def render_create_sql(table_name: str) -> str:
            # First statement of the generated table-update DDL for `table_name`.
            columns = list(schema.tables[table_name]["columns"].values())
            return client._get_table_update_sql(table_name, columns, False)[0]

        sql_partitioned = render_create_sql("partitioned_table")
        sql_not_partitioned = render_create_sql("not_partitioned_table")

    # Partition clause is generated with original order
    expected_clause = (
        "PARTITIONED BY (category, month(created_at), bucket(10, product_id), truncate(2, name))"
    )
    assert expected_clause in sql_partitioned

    # No partition clause otherwise
    assert "PARTITIONED BY" not in sql_not_partitioned
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters