From 358806fcc697a86cf68d51e1664fd8eeafb44eb7 Mon Sep 17 00:00:00 2001
From: Marcel Coetzee
Date: Mon, 15 Jan 2024 23:21:12 +0200
Subject: [PATCH] Update BigQuery documentation and comments #852

Signed-off-by: Marcel Coetzee
---
 dlt/destinations/impl/bigquery/bigquery.py      |  6 +++---
 .../docs/dlt-ecosystem/destinations/bigquery.md | 12 +++++++++---
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py
index 0ca3e6c45c..254184b96d 100644
--- a/dlt/destinations/impl/bigquery/bigquery.py
+++ b/dlt/destinations/impl/bigquery/bigquery.py
@@ -46,7 +46,7 @@ class BigQueryTypeMapper(TypeMapper):
         "timestamp": "TIMESTAMP",
         "bigint": "INTEGER",
         "binary": "BYTES",
-        "wei": "BIGNUMERIC",  # non parametrised should hold wei values
+        "wei": "BIGNUMERIC",  # non-parametrized should hold wei values
         "time": "TIME",
     }
 
@@ -106,7 +106,7 @@ def state(self) -> TLoadJobState:
                 f"Got reason {reason} for job {self.file_name}, job considered still"
                 f" running. ({self.bq_load_job.error_result})"
             )
-            # the status of the job could not be obtained, job still running
+            # the status of the job couldn't be obtained, job still running
             return "running"
         else:
             # retry on all other reasons, including `backendError` which requires retry when the job is done
@@ -283,7 +283,7 @@ def _get_table_update_sql(
             sql[0] = (
                 f"{sql[0]}\nPARTITION BY DATE({self.capabilities.escape_identifier(c['name'])})"
             )
-            # BigQuery supports partitioning only when bigint represents a UNIX timestamp.
+            # Automatic partitioning of an INT64 type requires us to be prescriptive - we treat the column as a UNIX timestamp.
             # This is due to the bounds requirement of GENERATE_ARRAY function for partitioning.
             # The 10,000 partitions limit makes it infeasible to cover the entire `bigint` range.
             # The array bounds, with daily partitions (86400 seconds in a day), are somewhat arbitrarily chosen.
diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md
index dcb35569a0..b4487d21cf 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md
@@ -121,16 +121,22 @@ When staging is enabled:
 
 BigQuery supports the following [column hints](https://dlthub.com/docs/general-usage/schema#tables-and-columns):
 
-* `partition` - creates a partition with a day granularity on decorated column (`PARTITION BY DATE` or `RANGE_BUCKET`).
+* `partition` - creates a partition with a day granularity on the decorated column (`PARTITION BY DATE`).
   May be used with `datetime`, `date` data types and `bigint` **only if** it contains valid UNIX timestamps.
   Only one column per table is supported and only when a new table is created.
+  For more information on BigQuery partitioning, read the [official docs](https://cloud.google.com/bigquery/docs/partitioned-tables).
+
+  > ❗ `bigint` maps to BigQuery’s **INT64** data type.
+  > Automatic partitioning requires converting an INT64 column to a UNIX timestamp, which `GENERATE_ARRAY` doesn’t natively support.
+  > With a 10,000 partition limit, we can’t cover the full INT64 range.
+  > Instead, we set 86,400-second boundaries to enable daily partitioning.
+  > This captures typical values, but extremely large/small outliers go to an `__UNPARTITIONED__` catch-all partition.
+
 * `cluster` - creates a cluster column(s). Many columns per table are supported and only when a new table is created.
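For illustration, here is a minimal sketch of attaching these hints through the `columns` argument of `dlt.resource`. The resource name, column names, and sample row are made up for the example, and the hints only take effect when the table is first created:

```py
import dlt

@dlt.resource(
    write_disposition="append",
    columns={
        # partition the table by day on this timestamp column
        "created_at": {"data_type": "timestamp", "partition": True},
        # cluster the table on this column
        "customer_id": {"data_type": "bigint", "cluster": True},
    },
)
def events():
    # Illustrative row only; a real resource would yield your own data.
    yield {"created_at": "2024-01-15T23:21:12+02:00", "customer_id": 42, "value": 1.0}

pipeline = dlt.pipeline(
    pipeline_name="bigquery_hints_example",
    destination="bigquery",
    dataset_name="hints_example",
)
pipeline.run(events())
```

When the `events` table is created in BigQuery, it should come out partitioned by day on `created_at` and clustered on `customer_id`.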
 ## Staging Support
 
 BigQuery supports gcs as a file staging destination. dlt will upload files in the parquet format to gcs and ask BigQuery to copy their data directly into the database. Please refer to the [Google Storage filesystem documentation](./filesystem.md#google-storage) to learn how to set up your gcs bucket with the bucket_url and credentials. If you use the same service account for gcs and your BigQuery deployment, you do not need to provide additional authentication for BigQuery to be able to read from your bucket.
 
-```toml
-```
 
 As an alternative to parquet files, you can also specify jsonl as the staging file format. To do so, set the `loader_file_format` argument of the pipeline's `run` method to `jsonl`.
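For illustration, a minimal sketch of a staged load follows. It assumes the gcs bucket_url and credentials for the `filesystem` staging destination are already configured (for example in `.dlt/config.toml` and `.dlt/secrets.toml`, as described in the filesystem documentation linked above); the pipeline, dataset, and table names are made up:

```py
import dlt

pipeline = dlt.pipeline(
    pipeline_name="bigquery_staging_example",
    destination="bigquery",
    staging="filesystem",  # stage load files in the configured gcs bucket
    dataset_name="staging_example",
)

data = [{"id": 1, "name": "alice"}, {"id": 2, "name": "bob"}]  # illustrative rows

# parquet is the default staging file format; pass loader_file_format="jsonl"
# to have dlt write jsonl files to the bucket instead.
load_info = pipeline.run(data, table_name="users", loader_file_format="jsonl")
print(load_info)
```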