athena iceberg #659
Changes from 13 commits
```diff
@@ -16,21 +16,21 @@
 from dlt.common.utils import without_none
 from dlt.common.data_types import TDataType
 from dlt.common.schema import TColumnSchema, Schema
-from dlt.common.schema.typing import TTableSchema, TColumnType
+from dlt.common.schema.typing import TTableSchema, TColumnType, TWriteDisposition
 from dlt.common.schema.utils import table_schema_has_type
 from dlt.common.destination import DestinationCapabilitiesContext
-from dlt.common.destination.reference import LoadJob
-from dlt.common.destination.reference import TLoadJobState
+from dlt.common.destination.reference import LoadJob, FollowupJob
+from dlt.common.destination.reference import TLoadJobState, NewLoadJob
 from dlt.common.storages import FileStorage
 from dlt.common.data_writers.escape import escape_bigquery_identifier

+from dlt.destinations.sql_jobs import SqlStagingCopyJob

 from dlt.destinations.typing import DBApi, DBTransaction
 from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation, LoadJobTerminalException
 from dlt.destinations.athena import capabilities
 from dlt.destinations.sql_client import SqlClientBase, DBApiCursorImpl, raise_database_error, raise_open_connection_error
 from dlt.destinations.typing import DBApiCursor
-from dlt.destinations.job_client_impl import SqlJobClientBase, StorageSchemaInfo
+from dlt.destinations.job_client_impl import SqlJobClientWithStaging
 from dlt.destinations.athena.configuration import AthenaClientConfiguration
 from dlt.destinations.type_mapping import TypeMapper
 from dlt.destinations import path_utils
```
```diff
@@ -69,13 +69,18 @@ class AthenaTypeMapper(TypeMapper):
         "int": "bigint",
     }

+    def __init__(self, capabilities: DestinationCapabilitiesContext, iceberg_mode: bool):
+        super().__init__(capabilities)
+        self.iceberg_mode = iceberg_mode
+
     def to_db_integer_type(self, precision: Optional[int]) -> str:
         if precision is None:
             return "bigint"
+        # iceberg does not support smallint and tinyint
         if precision <= 8:
-            return "tinyint"
+            return "int" if self.iceberg_mode else "tinyint"
         elif precision <= 16:
-            return "smallint"
+            return "int" if self.iceberg_mode else "smallint"
         elif precision <= 32:
             return "int"
         return "bigint"
```

Review thread on this hunk:

* FYI: TIMESTAMP is precision 6 on iceberg, 3 on parquet.

* That's why the JobClient should create/modify the table schema: you can adjust the precision there and do not have to hack the type mapper...

* Wouldn't it be cleanest to have a subclass for iceberg and set that before the table SQL is generated? I don't feel that changing the type mapper is hacking at all; that is exactly what it is there for: changing the mapping of types depending on the database / table format you are storing into.

* Actually, we could extend the type mapper to carry the info about which table_format is currently being processed. That might be nice?
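The last comment in the thread points at a cleaner design: pass the table format into the type mapper per call instead of constructing the mapper with a global `iceberg_mode`. A minimal sketch of that idea (the `table_format` parameter and the standalone class are illustrative assumptions, not code from this PR):

```python
from typing import Optional


class FormatAwareTypeMapperSketch:
    """Sketch of a type mapper that is told the table format per call."""

    def to_db_integer_type(self, precision: Optional[int], table_format: Optional[str] = None) -> str:
        # hypothetical per-call flag; iceberg has no tinyint/smallint,
        # so narrow integers widen to int
        is_iceberg = table_format == "iceberg"
        if precision is None:
            return "bigint"
        if precision <= 8:
            return "int" if is_iceberg else "tinyint"
        if precision <= 16:
            return "int" if is_iceberg else "smallint"
        if precision <= 32:
            return "int"
        return "bigint"
```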
```diff
@@ -135,6 +140,11 @@ def exception(self) -> str:
         # this part of code should be never reached
         raise NotImplementedError()

+class DoNothingFollowupJob(DoNothingJob, FollowupJob):
+    """The second most lazy class of dlt"""
+    pass
+
+
 class AthenaSQLClient(SqlClientBase[Connection]):

     capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities()
```
```diff
@@ -276,7 +286,7 @@ def has_dataset(self) -> bool:
         return len(rows) > 0


-class AthenaClient(SqlJobClientBase):
+class AthenaClient(SqlJobClientWithStaging):

     capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities()
```
```diff
@@ -293,11 +303,14 @@ def __init__(self, schema: Schema, config: AthenaClientConfiguration) -> None:
         super().__init__(schema, config, sql_client)
         self.sql_client: AthenaSQLClient = sql_client  # type: ignore
         self.config: AthenaClientConfiguration = config
-        self.type_mapper = AthenaTypeMapper(self.capabilities)
+        self.iceberg_mode = not (not self.config.iceberg_bucket_url)
+        self.type_mapper = AthenaTypeMapper(self.capabilities, self.iceberg_mode)

     def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None:
-        # never truncate tables in athena
-        super().initialize_storage([])
+        # only truncate tables in iceberg mode
+        if not self.iceberg_mode or self.in_staging_mode:
+            truncate_tables = []
+        super().initialize_storage(truncate_tables)

     def _from_db_type(self, hive_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType:
         return self.type_mapper.from_db_type(hive_t, precision, scale)
```
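One readability note on the hunk above: `not (not self.config.iceberg_bucket_url)` is just a truthiness-to-bool conversion, so iceberg mode is enabled exactly when an iceberg bucket URL is configured. A small sketch of the equivalent logic (the dataclass stands in for the real `AthenaClientConfiguration`; only the `iceberg_bucket_url` field name is taken from the diff):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class AthenaConfigSketch:
    iceberg_bucket_url: Optional[str] = None  # field name from the diff


def is_iceberg_mode(config: AthenaConfigSketch) -> bool:
    # equivalent to `not (not config.iceberg_bucket_url)` in the diff
    return bool(config.iceberg_bucket_url)


assert is_iceberg_mode(AthenaConfigSketch("s3://my-bucket/iceberg_data"))
assert not is_iceberg_mode(AthenaConfigSketch())
```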
```diff
@@ -307,12 +320,19 @@ def _get_column_def_sql(self, c: TColumnSchema) -> str:

     def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool) -> List[str]:

+        create_data_iceberg_tables = self.iceberg_mode and not self.in_staging_mode
+
         bucket = self.config.staging_config.bucket_url
         dataset = self.sql_client.dataset_name
+        if create_data_iceberg_tables:
+            bucket = self.config.iceberg_bucket_url
+            # TODO: we need to strip the staging layout from the table name, find a better way!
+            dataset = self.sql_client.dataset_name.replace("_staging", "")
         sql: List[str] = []

         # for the system tables we need to create empty iceberg tables to be able to run DELETE and UPDATE queries
-        is_iceberg = self.schema.tables[table_name].get("write_disposition", None) == "skip"
+        # or if we are in iceberg mode, we create iceberg tables for all tables
+        is_iceberg = create_data_iceberg_tables or (self.schema.tables[table_name].get("write_disposition", None) == "skip")
         columns = ", ".join([self._get_column_def_sql(c) for c in new_columns])

         # this will fail if the table prefix is not properly defined
```
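For context on what `is_iceberg` ultimately selects: Athena distinguishes Iceberg tables from plain external tables at CREATE time via `TBLPROPERTIES ('table_type'='ICEBERG')`. The PR does not show the DDL generation itself, so the following is only a rough sketch of the two flavors (the helper, table, and bucket names are illustrative):

```python
def create_table_ddl_sketch(table_name: str, columns_sql: str, location: str, is_iceberg: bool) -> str:
    # Rough sketch of the two DDL flavors Athena accepts; the real
    # _get_table_update_sql also handles ALTERs, partitioning and
    # identifier escaping.
    if is_iceberg:
        return (
            f"CREATE TABLE {table_name} ({columns_sql})\n"
            f"LOCATION '{location}'\n"
            "TBLPROPERTIES ('table_type'='ICEBERG')"
        )
    return (
        f"CREATE EXTERNAL TABLE {table_name} ({columns_sql})\n"
        "STORED AS PARQUET\n"
        f"LOCATION '{location}'"
    )


print(create_table_ddl_sketch("events", "id bigint, ts timestamp", "s3://my-bucket/events", True))
```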
```diff
@@ -345,9 +365,38 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) ->
         )
         job = super().start_file_load(table, file_path, load_id)
         if not job:
-            job = DoNothingJob(file_path)
+            job = DoNothingFollowupJob(file_path) if self.iceberg_mode else DoNothingJob(file_path)
         return job

+    def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]:
+        """Creates a list of followup jobs for merge write disposition and staging replace strategies"""
+        jobs = super().create_table_chain_completed_followup_jobs(table_chain)
+
+        # add some additional jobs
+        write_disposition = table_chain[0]["write_disposition"]
+        if write_disposition == "append":
+            jobs.append(self._create_staging_copy_job(table_chain, False))
+        elif write_disposition == "replace" and self.config.replace_strategy == "truncate-and-insert":
+            jobs.append(self._create_staging_copy_job(table_chain, False))
+        return jobs
+
+    def _create_staging_copy_job(self, table_chain: Sequence[TTableSchema], replace: bool) -> NewLoadJob:
+        """update destination tables from staging tables"""
+        if self.iceberg_mode:
+            return SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": replace})
+        return super()._create_staging_copy_job(table_chain, replace=replace)
+
+    def get_stage_dispositions(self) -> List[TWriteDisposition]:
+        # in iceberg mode, we always use staging tables
+        if self.iceberg_mode:
+            return ["append", "replace", "merge"]
+        return super().get_stage_dispositions()
+
+    def get_truncate_staging_destination_table_dispositions(self) -> List[TWriteDisposition]:
+        if self.iceberg_mode:
+            return ["append", "replace", "merge"]
+        return []
+
     @staticmethod
     def is_dbapi_exception(ex: Exception) -> bool:
         return isinstance(ex, Error)
```
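The net effect of these overrides in iceberg mode: every disposition (`append`, `replace`, `merge`) is routed through staging tables, files load into the staging dataset, and `SqlStagingCopyJob` then moves rows into the Iceberg tables with plain SQL. A simplified sketch of the copy statements such a job would emit (the helper and identifier handling are illustrative; the real `SqlStagingCopyJob.from_table_chain` operates on whole table chains via the sql_client):

```python
from typing import List


def staging_copy_sql_sketch(table_name: str, dataset: str, staging_dataset: str, replace: bool) -> List[str]:
    # Iceberg tables support DELETE, which is what makes this kind of
    # in-place copy possible on Athena in the first place.
    sql: List[str] = []
    if replace:
        sql.append(f"DELETE FROM {dataset}.{table_name}")
    sql.append(
        f"INSERT INTO {dataset}.{table_name} "
        f"SELECT * FROM {staging_dataset}.{table_name}"
    )
    return sql


print(staging_copy_sql_sketch("events", "prod", "prod_staging", replace=True))
```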
Review comment:

* I do not think we need `iceberg_mode`; you just set it up per table.
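For reference, the per-table direction suggested here matches how dlt later exposed Iceberg on Athena: a `table_format` hint on the resource rather than a destination-wide mode. A sketch of that usage (illustrating the suggestion; this API is not part of the PR diff above):

```python
import dlt


@dlt.resource(table_format="iceberg")  # per-table choice instead of a global iceberg_mode
def events():
    yield {"id": 1, "ts": "2023-09-01T00:00:00Z"}
```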