From 34e77acfe27810d2f93250b65df1d69f00c6ac61 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Wed, 15 May 2024 04:19:25 +0000 Subject: [PATCH 1/5] Added info about how to reorder the columns --- .../docs/walkthroughs/adjust-a-schema.md | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/docs/website/docs/walkthroughs/adjust-a-schema.md b/docs/website/docs/walkthroughs/adjust-a-schema.md index cfe2d056b0..35f4cb56f8 100644 --- a/docs/website/docs/walkthroughs/adjust-a-schema.md +++ b/docs/website/docs/walkthroughs/adjust-a-schema.md @@ -121,6 +121,32 @@ Do not rename the tables or columns in the yaml file. `dlt` infers those from th You can [adjust the schema](../general-usage/resource.md#adjust-schema) in Python before resource is loaded. ::: +### Reorder columns +To reorder the columns in your dataset, follow these steps: + +1. Initial Run: Execute the pipeline to obtain the import and export schemas. +1. Modify Export Schema: Adjust the column order as desired in the export schema. +1. Sync Import Schema: Ensure that these changes are mirrored in the import schema to maintain consistency. +1. Delete Dataset: Remove the existing dataset to prepare for the reload. +1. Reload Data: Reload the data. The dataset should now reflect the new column order as specified in the import yaml. + +These steps ensure that the column order in your dataset matches your specifications. + +**Another approach** to reorder columns is to use the `add_map` function. For instance, to rearrange ‘column1’, ‘column2’, and ‘column3’, you can proceed as follows: + +```py +# Define the data source and reorder columns using add_map +data_source = resource().add_map(lambda row: { + 'column3': row['column3'], + 'column1': row['column1'], + 'column2': row['column2'] +}) + +# Run the pipeline +load_info = pipeline.run(data_source) +``` + +In this example, the `add_map` function reorders columns by defining a new mapping. 
The lambda function specifies the desired order by rearranging the key-value pairs. When the pipeline runs, the data will load with the columns in the new order. ### Load data as json instead of generating child table or columns from flattened dicts From 2931d32e972a9d5d9fdf92530398a0e2cb378115 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Thu, 16 May 2024 04:21:48 +0000 Subject: [PATCH 2/5] Updated rest_api.md with configuration examples --- .../verified-sources/rest_api.md | 83 +++++++++++-------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 1f79055d06..9c24bc1b9a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -174,13 +174,13 @@ The configuration object passed to the REST API Generic Source has three main el ```py config: RESTAPIConfig = { "client": { - ... + # ... }, "resource_defaults": { - ... + # ... }, "resources": [ - ... + # ... ], } ``` @@ -203,7 +203,9 @@ For example, you can set the primary key, write disposition, and other default s ```py config = { "client": { - ... + "api_key": "your_api_key_here", + "base_url": "https://api.example.com", + # Add other client configurations here }, "resource_defaults": { "primary_key": "id", @@ -216,7 +218,7 @@ config = { }, "resources": [ "resource1", - "resource2": { + { "name": "resource2_name", "write_disposition": "append", "endpoint": { @@ -309,7 +311,7 @@ To specify the pagination configuration, use the `paginator` field in the [clien ```py { - ... + # ... "paginator": { "type": "json_links", "next_url_path": "paging.next", @@ -321,7 +323,7 @@ Or using the paginator instance: ```py { - ... + # ... 
"paginator": JSONResponsePaginator( next_url_path="paging.next" ), @@ -394,11 +396,11 @@ One of the most common method is token-based authentication. To authenticate wit ```py { "client": { - ... + # ... "auth": { "token": dlt.secrets["your_api_token"], }, - ... + # ... }, } ``` @@ -424,7 +426,7 @@ To specify the authentication configuration, use the `auth` field in the [client "type": "bearer", "token": dlt.secrets["your_api_token"], }, - ... + # ... }, } ``` @@ -438,7 +440,7 @@ config = { "client": { "auth": BearTokenAuth(dlt.secrets["your_api_token"]), }, - ... + # ... } ``` @@ -455,7 +457,7 @@ In the GitHub example, the `issue_comments` resource depends on the `issues` res "name": "issues", "endpoint": { "path": "issues", - ... + # ... }, }, { @@ -495,13 +497,15 @@ The `issue_comments` resource will make requests to the following endpoints: The syntax for the `resolve` field in parameter configuration is: ```py -"": { - "type": "resolve", - "resource": "", - "field": "", -} +({ + "{parameter_name}" : + { + "type": "resolve", + "resource": "{parent_resource_name}", + "field": "{parent_resource_field_name}", + } +}) ``` - Under the hood, dlt handles this by using a [transformer resource](../../general-usage/resource.md#process-resources-with-dlttransformer). #### Include fields from the parent resource @@ -512,7 +516,7 @@ You can include data from the parent resource in the child resource by using the { "name": "issue_comments", "endpoint": { - ... + # ... }, "include_from_parent": ["id", "title", "created_at"], } @@ -530,21 +534,26 @@ When the API endpoint supports incremental loading, you can configure the source 1. 
Defining a special parameter in the `params` section of the [endpoint configuration](#endpoint-configuration): ```py - "": { - "type": "incremental", - "cursor_path": "", - "initial_value": "", - }, + + ({ + "": { + "type": "incremental", + "cursor_path": "", + "initial_value": "", + } + }) ``` For example, in the `issues` resource configuration in the GitHub example, we have: ```py - "since": { - "type": "incremental", - "cursor_path": "updated_at", - "initial_value": "2024-01-25T11:21:28Z", - }, + ({ + "since": { + "type": "incremental", + "cursor_path": "updated_at", + "initial_value": "2024-01-25T11:21:28Z", + } + }) ``` This configuration tells the source to create an incremental object that will keep track of the `updated_at` field in the response and use it as a value for the `since` parameter in subsequent requests. @@ -552,13 +561,15 @@ When the API endpoint supports incremental loading, you can configure the source 2. Specifying the `incremental` field in the [endpoint configuration](#endpoint-configuration): ```py - "incremental": { - "start_param": "", - "end_param": "", - "cursor_path": "", - "initial_value": "", - "end_value": "", - }, + ({ + "incremental": { + "start_param": "", + "end_param": "", + "cursor_path": "", + "initial_value": "", + "end_value": "", + } + }) ``` This configuration is more flexible and allows you to specify the start and end conditions for the incremental loading. 
From 621ee54fdcb4f723020d4347ec1ee59a670b6da4 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Thu, 16 May 2024 17:06:47 +0200 Subject: [PATCH 3/5] Update docs/website/docs/walkthroughs/adjust-a-schema.md --- docs/website/docs/walkthroughs/adjust-a-schema.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/walkthroughs/adjust-a-schema.md b/docs/website/docs/walkthroughs/adjust-a-schema.md index 35f4cb56f8..b0a9a9ce05 100644 --- a/docs/website/docs/walkthroughs/adjust-a-schema.md +++ b/docs/website/docs/walkthroughs/adjust-a-schema.md @@ -128,7 +128,7 @@ To reorder the columns in your dataset, follow these steps: 1. Modify Export Schema: Adjust the column order as desired in the export schema. 1. Sync Import Schema: Ensure that these changes are mirrored in the import schema to maintain consistency. 1. Delete Dataset: Remove the existing dataset to prepare for the reload. -1. Reload Data: Reload the data. The dataset should now reflect the new column order as specified in the import yaml. +1. Reload Data: Reload the data. The dataset should now reflect the new column order as specified in the import YAML. These steps ensure that the column order in your dataset matches your specifications. 
From 397eeae53cab1c6ed6e675b9fefed5e9e612f0d9 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Wed, 22 May 2024 03:49:25 +0000 Subject: [PATCH 4/5] Updated ../website/docs/dlt-ecosystem/verified-sources/rest_api.md --- docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 9c24bc1b9a..1367b96bd4 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -534,7 +534,6 @@ When the API endpoint supports incremental loading, you can configure the source 1. Defining a special parameter in the `params` section of the [endpoint configuration](#endpoint-configuration): ```py - ({ "": { "type": "incremental", From 2419fb3d69d72eb5ca638d118aec4f2a683e9e24 Mon Sep 17 00:00:00 2001 From: AstrakhantsevaAA Date: Wed, 22 May 2024 18:24:45 +0200 Subject: [PATCH 5/5] fix naming convention for bigquery custom destination --- .../custom_destination_bigquery.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py index ea60b9b00d..e890469263 100644 --- a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py +++ b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py @@ -5,13 +5,13 @@ keywords: [destination, credentials, example, bigquery, custom destination] --- -In this example, you'll find a Python script that demonstrates how to load to bigquey with the custom destination. +In this example, you'll find a Python script that demonstrates how to load to BigQuery with the custom destination. 
We'll learn how to: -- use [built-in credentials](../general-usage/credentials/config_specs#gcp-credentials) -- use the [custom destination](../dlt-ecosystem/destinations/destination.md) -- Use pyarrow tables to create complex column types on bigquery -- Use bigquery `autodetect=True` for schema inference from parquet files +- Use [built-in credentials](../general-usage/credentials/config_specs#gcp-credentials). +- Use the [custom destination](../dlt-ecosystem/destinations/destination.md). +- Use pyarrow tables to create complex column types on BigQuery. +- Use BigQuery `autodetect=True` for schema inference from parquet files. """ @@ -38,7 +38,7 @@ def resource(url: str): # load pyarrow table with pandas table = pa.Table.from_pandas(pd.read_csv(url)) - # we add a list type column to demontrate bigquery lists + # we add a list type column to demonstrate BigQuery lists table = table.append_column( "tags", pa.array( @@ -57,12 +57,12 @@ def resource(url: str): yield table -# dlt biquery custom destination +# dlt BigQuery custom destination # we can use the dlt provided credentials class # to retrieve the gcp credentials from the secrets -@dlt.destination(name="bigquery", loader_file_format="parquet", batch_size=0) +@dlt.destination(name="bigquery", loader_file_format="parquet", batch_size=0, naming_convention="snake_case") def bigquery_insert( - items, table, credentials: GcpServiceAccountCredentials = dlt.secrets.value + items, table=BIGQUERY_TABLE_ID, credentials: GcpServiceAccountCredentials = dlt.secrets.value ) -> None: client = bigquery.Client( credentials.project_id, credentials.to_native_credentials(), location="US" @@ -74,7 +74,7 @@ def bigquery_insert( ) # since we have set the batch_size to 0, we get a filepath and can load the file directly with open(items, "rb") as f: - load_job = client.load_table_from_file(f, BIGQUERY_TABLE_ID, job_config=job_config) + load_job = client.load_table_from_file(f, table, job_config=job_config) load_job.result() # Waits for
the job to complete.