From 087a7ad8a89ea0c2d93ba80ebce26f405a297ccf Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Fri, 21 Jun 2024 06:31:01 +0000 Subject: [PATCH 1/5] Added how to retrieve secrets using google secret manager --- .../docs/walkthroughs/add_credentials.md | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/docs/website/docs/walkthroughs/add_credentials.md b/docs/website/docs/walkthroughs/add_credentials.md index 586d1c2a93..71ac0cccbd 100644 --- a/docs/website/docs/walkthroughs/add_credentials.md +++ b/docs/website/docs/walkthroughs/add_credentials.md @@ -74,3 +74,72 @@ DESTINATION__BIGQUERY__CREDENTIALS__PRIVATE_KEY DESTINATION__BIGQUERY__CREDENTIALS__CLIENT_EMAIL DESTINATION__BIGQUERY__LOCATION ``` + +## Retrieving credentials from Google Cloud Secret Manager +To retrieve secrets from Google Cloud Secret Manager using Python, and convert them into a dictionary format, you'll need to follow these steps. First, ensure that you have the necessary permissions to access the secrets on Google Cloud, and have the `google-cloud-secret-manager` library installed. If not, you can install it using pip: + +```sh +pip install google-cloud-secret-manager +``` +[Google Docs](https://cloud.google.com/secret-manager/docs/reference/libraries) + +Here's how you can retrieve secrets and convert them into a dictionary: + +1. **Set up the Secret Manager client**: Create a client that will interact with the Secret Manager API. +2. **Access the secret**: Use the client to access the secret's latest version. +3. **Convert to a dictionary**: If the secret is stored in a structured format (like JSON), parse it into a Python dictionary. + +Assume we store secrets in JSON format: +```json +{"api_token": "ghp_Kskdgf98dugjf98ghd...."} +``` + +In the script `dlt_with_google_secrets_pipeline.py` you can find an example how to use Google Secrets in `dlt` pipelines. 
+ +### Points to Note: + +- **Permissions**: Ensure the service account or user credentials you are using have the necessary permissions to access the Secret Manager and the specific secrets. +- **Secret Format**: This example assumes that the secret is stored in a JSON string format. If your secret is in a different format, you will need to adjust the parsing method accordingly. +- **Google Cloud Authentication**: Make sure your environment is authenticated with Google Cloud. This can typically be done by setting credentials in `.dlt/secrets.toml` or setting the `GOOGLE_SECRETS__CREDENTIALS` environment variable to the path of your service account key file or the dict of credentials as a string. + +With this setup, you can effectively retrieve secrets stored in Google Cloud Secret Manager and use them in your `dlt` pipelines as dictionaries. + +You can use this function to retrieve the secrets stored in the secret manager: + +```py +import json as json_lib # Rename the json import to avoid name conflict + +import dlt +from dlt.sources.helpers import requests +from dlt.common.configuration.inject import with_config +from dlt.common.configuration.specs import GcpServiceAccountCredentials +from google.cloud import secretmanager + + +@with_config(sections=("google_secrets",)) +def get_secret_dict(secret_id: str, credentials: GcpServiceAccountCredentials = dlt.secrets.value) -> dict: + """ + Retrieve a secret from Google Cloud Secret Manager and convert it to a dictionary. 
+ """ + # Create the Secret Manager client with provided credentials + client = secretmanager.SecretManagerServiceClient(credentials=credentials.to_native_credentials()) + + # Build the resource name of the secret version + name = f"projects/{credentials.project_id}/secrets/{secret_id}/versions/latest" + + # Access the secret version + response = client.access_secret_version(request={"name": name}) + + # Decode the payload to a string and convert it to a dictionary + secret_string = response.payload.data.decode("UTF-8") + secret_dict = json_lib.loads(secret_string) + + return secret_dict + +# Retrieve secret data as a dictionary for use in other functions. +secret_data = get_secret_dict("temp-secret") + +# Fetch the list of repositories from the specified organization using the provided API token. +# The API token is retrieved from the secret data dictionary. +data = get_repositories(api_token=secret_data["api_token"], organization="dlt-hub") +``` \ No newline at end of file From dc65951aa9762daec15e98d1ba1587322dc64414 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Fri, 21 Jun 2024 07:35:59 +0000 Subject: [PATCH 2/5] Added some minor corrections to snowflake docs --- docs/website/docs/dlt-ecosystem/destinations/snowflake.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index c9d70f65fe..513c951f78 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -50,14 +50,14 @@ The instructions below assume that you use the default account setup that you ge --create database with standard settings CREATE DATABASE dlt_data; -- create new user - set your password here -CREATE USER loader WITH PASSWORD='' +CREATE USER loader WITH PASSWORD=''; -- we assign all permission to a role CREATE ROLE DLT_LOADER_ROLE; 
GRANT ROLE DLT_LOADER_ROLE TO USER loader; -- give database access to new role GRANT USAGE ON DATABASE dlt_data TO DLT_LOADER_ROLE; -- allow `dlt` to create new schemas -GRANT CREATE SCHEMA ON DATABASE dlt_data TO ROLE DLT_LOADER_ROLE +GRANT CREATE SCHEMA ON DATABASE dlt_data TO ROLE DLT_LOADER_ROLE; -- allow access to a warehouse named COMPUTE_WH GRANT USAGE ON WAREHOUSE COMPUTE_WH TO DLT_LOADER_ROLE; -- grant access to all future schemas and tables in the database From 13c95bd9aff2d9a79f8ad59da3d1e021cd077522 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Fri, 21 Jun 2024 13:07:38 +0000 Subject: [PATCH 3/5] Updated as per comments --- .../docs/walkthroughs/add_credentials.md | 54 +++++++++++++------ 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/docs/website/docs/walkthroughs/add_credentials.md b/docs/website/docs/walkthroughs/add_credentials.md index 71ac0cccbd..093bfdda96 100644 --- a/docs/website/docs/walkthroughs/add_credentials.md +++ b/docs/website/docs/walkthroughs/add_credentials.md @@ -81,7 +81,8 @@ To retrieve secrets from Google Cloud Secret Manager using Python, and convert t ```sh pip install google-cloud-secret-manager ``` -[Google Docs](https://cloud.google.com/secret-manager/docs/reference/libraries) + +[Google Cloud Documentation: Secret Manager client libraries.](https://cloud.google.com/secret-manager/docs/reference/libraries) Here's how you can retrieve secrets and convert them into a dictionary: @@ -94,16 +95,6 @@ Assume we store secrets in JSON format: {"api_token": "ghp_Kskdgf98dugjf98ghd...."} ``` -In the script `dlt_with_google_secrets_pipeline.py` you can find an example how to use Google Secrets in `dlt` pipelines. - -### Points to Note: - -- **Permissions**: Ensure the service account or user credentials you are using have the necessary permissions to access the Secret Manager and the specific secrets. 
-- **Secret Format**: This example assumes that the secret is stored in a JSON string format. If your secret is in a different format, you will need to adjust the parsing method accordingly. -- **Google Cloud Authentication**: Make sure your environment is authenticated with Google Cloud. This can typically be done by setting credentials in `.dlt/secrets.toml` or setting the `GOOGLE_SECRETS__CREDENTIALS` environment variable to the path of your service account key file or the dict of credentials as a string. - -With this setup, you can effectively retrieve secrets stored in Google Cloud Secret Manager and use them in your `dlt` pipelines as dictionaries. - You can use this function to retrieve the secrets stored in the secret manager: ```py @@ -137,9 +128,42 @@ def get_secret_dict(secret_id: str, credentials: GcpServiceAccountCredentials = return secret_dict # Retrieve secret data as a dictionary for use in other functions. -secret_data = get_secret_dict("temp-secret") +secret_data = get_secret_dict("temp-secret") + +# Set up the request URL and headers +url = f"https://api.github.com/orgs/dlt-hub/repos" +headers = { + "Authorization": f"token {secret_data["api_token"]}", + "Accept": "application/vnd.github+json", +} + +# Make a request to the GitHub API to get the list of repositories +response = requests.get(url, headers=headers) + +# Set up the DLT pipeline +pipeline = dlt.pipeline( + pipeline_name="quick_start", destination="duckdb", dataset_name="mydata" + ) -# Fetch the list of repositories from the specified organization using the provided API token. -# The API token is retrieved from the secret data dictionary. -data = get_repositories(api_token=secret_data["api_token"], organization="dlt-hub") +# Run the pipeline +load_info = pipeline.run(response.json()) + +print(load_info) +``` + +In the script `dlt_with_google_secrets_pipeline.py` you can find an example how to use Google Secrets in `dlt` pipelines. 
+ +### Points to Note: + +- **Permissions**: Ensure the service account or user credentials you are using have the necessary permissions to access the Secret Manager and the specific secrets. +- **Secret Format**: This example assumes that the secret is stored in a JSON string format. If your secret is in a different format, you will need to adjust the parsing method accordingly. +- **Google Cloud Authentication**: Make sure your environment is authenticated with Google Cloud. This can typically be done by setting credentials in `.dlt/secrets.toml` or setting the `GOOGLE_SECRETS__CREDENTIALS` environment variable to the path of your service account key file or the dict of credentials as a string. + +You can set `.dlt/secrets.toml` as: + +```toml +[google_secrets.credentials] +"project_id" = "" +"private_key" = "-----BEGIN PRIVATE KEY-----\n....\n-----END PRIVATE KEY-----\n" +"client_email" = "....gserviceaccount.com" ``` \ No newline at end of file From 74ec761c8b8d3da6a5840aba170f485ce19da845 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Fri, 21 Jun 2024 13:36:02 +0000 Subject: [PATCH 4/5] Fixing linting error --- .../docs/walkthroughs/add_credentials.md | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/website/docs/walkthroughs/add_credentials.md b/docs/website/docs/walkthroughs/add_credentials.md index 093bfdda96..1aa744e8d2 100644 --- a/docs/website/docs/walkthroughs/add_credentials.md +++ b/docs/website/docs/walkthroughs/add_credentials.md @@ -106,7 +106,6 @@ from dlt.common.configuration.inject import with_config from dlt.common.configuration.specs import GcpServiceAccountCredentials from google.cloud import secretmanager - @with_config(sections=("google_secrets",)) def get_secret_dict(secret_id: str, credentials: GcpServiceAccountCredentials = dlt.secrets.value) -> dict: """ @@ -128,27 +127,28 @@ def get_secret_dict(secret_id: str, credentials: 
GcpServiceAccountCredentials = return secret_dict # Retrieve secret data as a dictionary for use in other functions. -secret_data = get_secret_dict("temp-secret") +secret_data = get_secret_dict("temp-secret") # Set up the request URL and headers -url = f"https://api.github.com/orgs/dlt-hub/repos" -headers = { - "Authorization": f"token {secret_data["api_token"]}", - "Accept": "application/vnd.github+json", -} +url = "https://api.github.com/orgs/dlt-hub/repos" +headers = { + "Authorization": f"token {secret_data['api_token']}", # Use the API token from the secret data + "Accept": "application/vnd.github+json", # Set the Accept header for GitHub API +} # Make a request to the GitHub API to get the list of repositories -response = requests.get(url, headers=headers) +response = requests.get(url, headers=headers) # Set up the DLT pipeline -pipeline = dlt.pipeline( - pipeline_name="quick_start", destination="duckdb", dataset_name="mydata" - ) +pipeline = dlt.pipeline( + pipeline_name="quick_start", destination="duckdb", dataset_name="mydata" +) -# Run the pipeline -load_info = pipeline.run(response.json()) +# Run the pipeline with the data from the GitHub API response +load_info = pipeline.run(response.json()) -print(load_info) +# Print the load information to check the results +print(load_info) ``` In the script `dlt_with_google_secrets_pipeline.py` you can find an example how to use Google Secrets in `dlt` pipelines. 
From 68e32421afddae563a7e0579b6720362db158b77 Mon Sep 17 00:00:00 2001 From: Alena Date: Fri, 21 Jun 2024 17:05:02 +0200 Subject: [PATCH 5/5] small correction --- .../docs/walkthroughs/add_credentials.md | 35 +++++++++---------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/docs/website/docs/walkthroughs/add_credentials.md b/docs/website/docs/walkthroughs/add_credentials.md index 1aa744e8d2..5b4f241d56 100644 --- a/docs/website/docs/walkthroughs/add_credentials.md +++ b/docs/website/docs/walkthroughs/add_credentials.md @@ -90,12 +90,22 @@ Here's how you can retrieve secrets and convert them into a dictionary: 2. **Access the secret**: Use the client to access the secret's latest version. 3. **Convert to a dictionary**: If the secret is stored in a structured format (like JSON), parse it into a Python dictionary. -Assume we store secrets in JSON format: +Assume the secret is stored in JSON format under the name "temp-secret": ```json {"api_token": "ghp_Kskdgf98dugjf98ghd...."} ``` -You can use this function to retrieve the secrets stored in the secret manager: + +Set `.dlt/secrets.toml` as: + +```toml +[google_secrets.credentials] +"project_id" = "" +"private_key" = "-----BEGIN PRIVATE KEY-----\n....\n-----END PRIVATE KEY-----\n" +"client_email" = "....gserviceaccount.com" +``` +or set the `GOOGLE_SECRETS__CREDENTIALS` environment variable to the path of your service account key file. 
+ +Retrieve the secrets stored in the Secret Manager as follows: ```py import json as json_lib # Rename the json import to avoid name conflict @@ -113,13 +123,13 @@ def get_secret_dict(secret_id: str, credentials: GcpServiceAccountCredentials = """ # Create the Secret Manager client with provided credentials client = secretmanager.SecretManagerServiceClient(credentials=credentials.to_native_credentials()) - + # Build the resource name of the secret version name = f"projects/{credentials.project_id}/secrets/{secret_id}/versions/latest" # Access the secret version response = client.access_secret_version(request={"name": name}) - + # Decode the payload to a string and convert it to a dictionary secret_string = response.payload.data.decode("UTF-8") secret_dict = json_lib.loads(secret_string) @@ -143,27 +153,14 @@ response = requests.get(url, headers=headers) pipeline = dlt.pipeline( pipeline_name="quick_start", destination="duckdb", dataset_name="mydata" ) - # Run the pipeline with the data from the GitHub API response load_info = pipeline.run(response.json()) - # Print the load information to check the results print(load_info) ``` -In the script `dlt_with_google_secrets_pipeline.py` you can find an example how to use Google Secrets in `dlt` pipelines. - ### Points to Note: - **Permissions**: Ensure the service account or user credentials you are using have the necessary permissions to access the Secret Manager and the specific secrets. -- **Secret Format**: This example assumes that the secret is stored in a JSON string format. If your secret is in a different format, you will need to adjust the parsing method accordingly. -- **Google Cloud Authentication**: Make sure your environment is authenticated with Google Cloud. This can typically be done by setting credentials in `.dlt/secrets.toml` or setting the `GOOGLE_SECRETS__CREDENTIALS` environment variable to the path of your service account key file or the dict of credentials as a string. 
- -You can set `.dlt/secrets.toml` as: - -```toml -[google_secrets.credentials] -"project_id" = "" -"private_key" = "-----BEGIN PRIVATE KEY-----\n....\n-----END PRIVATE KEY-----\n" -"client_email" = "....gserviceaccount.com" -``` \ No newline at end of file +- **Secret format**: This example assumes that the secret is stored as a JSON string. If your secret is in a different format, you will need to adjust the parsing method accordingly. +- **Google Cloud authentication**: Make sure your environment is authenticated with Google Cloud. This can typically be done by setting credentials in `.dlt/secrets.toml`, or by setting the `GOOGLE_SECRETS__CREDENTIALS` environment variable either to the path of your service account key file or to the credentials dictionary given as a string. \ No newline at end of file