diff --git a/.github/ISSUE_TEMPLATE/documentation_request.yml b/.github/ISSUE_TEMPLATE/documentation_request.yml new file mode 100644 index 0000000000..fe6f8fc4b1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation_request.yml @@ -0,0 +1,28 @@ +--- +name: Documentation Request +description: Suggest improvements or additions to dlt documentation +labels: ["kind:documentation"] +body: + - type: markdown + attributes: + value: | + Thanks for contributing to the dlt documentation! + For significant updates or new content, open an issue to discuss your ideas. This helps us + coordinate and integrate your suggestions effectively. For simple edits like typos, you can + directly submit a [pull request](https://github.com/dlt-hub/dlt/pulls). + - type: textarea + attributes: + label: Documentation description + description: Describe the documentation improvements or additions you'd like to see. + placeholder: > + What specific documentation changes are you suggesting? + - type: dropdown + attributes: + label: Are you a dlt user? + description: Please tell us how you use dlt and how this impacts your need for documentation. + multiple: false + options: + - "Yes, I'm already a dlt user." + - "Yes, I run dlt in production." + - "Yes, I use it for fun." + - "I'm considering using dlt, but the documentation is unclear." diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 96dae8044c..35ccb71ab5 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -25,7 +25,7 @@ jobs: defaults: run: shell: bash - runs-on: ${{ matrix.os }} + runs-on: ${{ matrix.os }} steps: @@ -42,7 +42,7 @@ jobs: with: virtualenvs-create: true virtualenvs-in-project: true - installer-parallel: true + installer-parallel: true - name: Load cached venv id: cached-poetry-dependencies @@ -57,7 +57,7 @@ jobs: - name: Run make lint run: | - export PATH=$PATH:"/c/Program Files/usr/bin" # needed for Windows + export PATH=$PATH:"/c/Program Files/usr/bin" # needed for Windows make lint # - name: print envs diff --git a/.github/workflows/test_dbt_runner.yml b/.github/workflows/test_dbt_runner.yml index 1803a53fc1..5ae791c979 100644 --- a/.github/workflows/test_dbt_runner.yml +++ b/.github/workflows/test_dbt_runner.yml @@ -9,17 +9,8 @@ on: workflow_dispatch: env: - # all credentials must be present to be passed to dbt runner - DESTINATION__POSTGRES__CREDENTIALS: postgresql://loader@dlttests.cwz0jfxu0m7m.eu-central-1.rds.amazonaws.com:5432/dlt_data - DESTINATION__REDSHIFT__CREDENTIALS: postgresql://loader@3.73.90.3:5439/dlt_ci - DESTINATION__SNOWFLAKE__CREDENTIALS: snowflake://loader@kgiotue-wn98412/dlt_data?warehouse=COMPUTE_WH&role=DLT_LOADER_ROLE - DESTINATION__CREDENTIALS__PASSWORD: ${{ secrets.PG_PASSWORD }} - - DESTINATION__CREDENTIALS__PROJECT_ID: chat-analytics-rasa-ci - DESTINATION__CREDENTIALS__CLIENT_EMAIL: chat-analytics-loader@chat-analytics-rasa-ci.iam.gserviceaccount.com - DESTINATION__BIGQUERY__CREDENTIALS__PRIVATE_KEY: ${{ secrets.BQ_CRED_PRIVATE_KEY }} - DESTINATION__CREDENTIALS__TOKEN_URI: https://oauth2.googleapis.com/token + DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} RUNTIME__LOG_LEVEL: ERROR jobs: @@ -70,6 +61,9 @@ jobs: # install dlt with postgres support run: poetry install --no-interaction -E postgres -E dbt --with sentry-sdk + - name: create secrets.toml + run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml + - run: | poetry run pytest tests/helpers/dbt_tests -k '(not venv)' if: runner.os != 'Windows' diff --git a/.github/workflows/test_destination_athena.yml 
b/.github/workflows/test_destination_athena.yml index 2d57f26a51..e9e17edefe 100644 --- a/.github/workflows/test_destination_athena.yml +++ b/.github/workflows/test_destination_athena.yml @@ -9,12 +9,7 @@ on: workflow_dispatch: env: - DESTINATION__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID: AKIAT4QMVMC4LGORLZOK - DESTINATION__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - DESTINATION__ATHENA__CREDENTIALS__AWS_ACCESS_KEY_ID: AKIAT4QMVMC4LGORLZOK - DESTINATION__ATHENA__CREDENTIALS__AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - DESTINATION__ATHENA__CREDENTIALS__REGION_NAME: eu-central-1 - DESTINATION__ATHENA__QUERY_RESULT_BUCKET: s3://dlt-athena-output + DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752 RUNTIME__LOG_LEVEL: ERROR @@ -72,6 +67,9 @@ jobs: # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline + - name: create secrets.toml + run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml + - run: | poetry run pytest tests/load if: runner.os != 'Windows' diff --git a/.github/workflows/test_destination_athena_iceberg.yml b/.github/workflows/test_destination_athena_iceberg.yml index d8d8521063..fa45b1b49b 100644 --- a/.github/workflows/test_destination_athena_iceberg.yml +++ b/.github/workflows/test_destination_athena_iceberg.yml @@ -9,12 +9,7 @@ on: workflow_dispatch: env: - DESTINATION__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID: AKIAT4QMVMC4LGORLZOK - DESTINATION__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - DESTINATION__ATHENA__CREDENTIALS__AWS_ACCESS_KEY_ID: AKIAT4QMVMC4LGORLZOK - DESTINATION__ATHENA__CREDENTIALS__AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - DESTINATION__ATHENA__CREDENTIALS__REGION_NAME: eu-central-1 - DESTINATION__ATHENA__QUERY_RESULT_BUCKET: s3://dlt-athena-output + DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752 RUNTIME__LOG_LEVEL: ERROR @@ -70,7 +65,10 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E --with sentry-sdk --with pipeline + + - name: create secrets.toml + run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml - run: | poetry run pytest tests/load diff --git a/.github/workflows/test_destination_bigquery.yml b/.github/workflows/test_destination_bigquery.yml index 45dc78a085..00027768a5 100644 --- a/.github/workflows/test_destination_bigquery.yml +++ b/.github/workflows/test_destination_bigquery.yml @@ -9,17 +9,7 @@ on: workflow_dispatch: env: - CREDENTIALS__PROJECT_ID: chat-analytics-rasa-ci - CREDENTIALS__CLIENT_EMAIL: chat-analytics-loader@chat-analytics-rasa-ci.iam.gserviceaccount.com - CREDENTIALS__PRIVATE_KEY: ${{ secrets.BQ_CRED_PRIVATE_KEY }} - CREDENTIALS__TOKEN_URI: https://oauth2.googleapis.com/token - CREDENTIALS__CLIENT_ID: 929384042504-3mtjaj1s7vuvf53j88mgdq4te7akkjm3.apps.googleusercontent.com - CREDENTIALS__CLIENT_SECRET: ${{ secrets.CREDENTIALS__CLIENT_SECRET }} - CREDENTIALS__REFRESH_TOKEN: ${{ secrets.CREDENTIALS__REFRESH_TOKEN }} - - # needed for bigquery staging tests - # DESTINATION__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID: 
AKIAT4QMVMC4LGORLZOK - # DESTINATION__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752 RUNTIME__LOG_LEVEL: ERROR @@ -81,8 +71,8 @@ jobs: # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline - # - name: Install self - # run: poetry install --no-interaction + - name: create secrets.toml + run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml - run: | poetry run pytest tests/helpers/providers tests/load diff --git a/.github/workflows/test_destination_weaviate.yml b/.github/workflows/test_destination_databricks.yml similarity index 69% rename from .github/workflows/test_destination_weaviate.yml rename to .github/workflows/test_destination_databricks.yml index c771a28204..f301a1b9ed 100644 --- a/.github/workflows/test_destination_weaviate.yml +++ b/.github/workflows/test_destination_databricks.yml @@ -1,4 +1,5 @@ -name: test weaviate + +name: test databricks on: pull_request: @@ -8,14 +9,12 @@ on: workflow_dispatch: env: - DESTINATION__WEAVIATE__CREDENTIALS__URL: ${{ secrets.DESTINATION__WEAVIATE__CREDENTIALS__URL }} - DESTINATION__WEAVIATE__CREDENTIALS__API_KEY: ${{ secrets.DESTINATION__WEAVIATE__CREDENTIALS__API_KEY }} - DESTINATION__WEAVIATE__CREDENTIALS__ADDITIONAL_HEADERS: ${{ secrets.DESTINATION__WEAVIATE__CREDENTIALS__ADDITIONAL_HEADERS }} + DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752 RUNTIME__LOG_LEVEL: ERROR - ACTIVE_DESTINATIONS: "[\"weaviate\"]" + ACTIVE_DESTINATIONS: "[\"databricks\"]" ALL_FILESYSTEM_DRIVERS: "[\"memory\"]" jobs: @@ -24,20 +23,21 @@ jobs: if: ${{ !github.event.pull_request.head.repo.fork }} run_loader: - name: Tests Weaviate loader + name: Tests Databricks loader needs: get_docs_changes - if: !always() - # if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' + if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' strategy: fail-fast: false matrix: - os: ["ubuntu-latest", "macos-latest", "windows-latest"] + os: ["ubuntu-latest"] + # os: ["ubuntu-latest", "macos-latest", "windows-latest"] defaults: run: shell: bash runs-on: ${{ matrix.os }} steps: + - name: Check out uses: actions/checkout@master @@ -61,19 +61,23 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E weaviate -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E databricks -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + + - name: create secrets.toml + run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml + - run: | - poetry run pytest tests/load/ + poetry run pytest tests/load if: runner.os != 'Windows' name: Run tests Linux/MAC - run: | - poetry run pytest tests/load/ + poetry run pytest tests/load if: runner.os == 'Windows' name: Run tests Windows shell: cmd matrix_job_required_check: - name: Weaviate loader tests + name: Databricks loader tests needs: run_loader runs-on: ubuntu-latest if: always() diff --git a/.github/workflows/test_destination_mssql.yml b/.github/workflows/test_destination_mssql.yml index 6eb4427bbf..d1da25c067 100644 --- 
a/.github/workflows/test_destination_mssql.yml +++ b/.github/workflows/test_destination_mssql.yml @@ -9,8 +9,8 @@ on: workflow_dispatch: env: - DESTINATION__MSSQL__CREDENTIALS: mssql://dlt_root@dlt-ci-mssql.database.windows.net:1433/dlt_ci - DESTINATION__MSSQL__CREDENTIALS__PASSWORD: ${{ secrets.MSSQL_PASSWORD }} + + DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752 RUNTIME__LOG_LEVEL: ERROR @@ -67,12 +67,15 @@ jobs: - name: Install dependencies run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + - name: create secrets.toml + run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml + - run: | - poetry run pytest tests/load --ignore tests/load/pipeline/test_dbt_helper.py + poetry run pytest tests/load if: runner.os != 'Windows' name: Run tests Linux/MAC - run: | - poetry run pytest tests/load --ignore tests/load/pipeline/test_dbt_helper.py + poetry run pytest tests/load if: runner.os == 'Windows' name: Run tests Windows shell: cmd diff --git a/.github/workflows/test_destination_qdrant.yml b/.github/workflows/test_destination_qdrant.yml index 0ce3e3a3f9..758c18b56b 100644 --- a/.github/workflows/test_destination_qdrant.yml +++ b/.github/workflows/test_destination_qdrant.yml @@ -8,8 +8,7 @@ on: workflow_dispatch: env: - DESTINATION__QDRANT__CREDENTIALS__LOCATION: ${{ secrets.DESTINATION__QDRANT__CREDENTIALS__LOCATION }} - DESTINATION__QDRANT__CREDENTIALS__API_KEY: ${{ secrets.DESTINATION__QDRANT__CREDENTIALS__API_KEY }} + DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752 RUNTIME__LOG_LEVEL: ERROR @@ -58,6 +57,9 @@ jobs: path: .venv key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp + - name: create secrets.toml + run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml + - name: Install dependencies run: poetry install --no-interaction -E qdrant -E parquet --with sentry-sdk --with pipeline - run: | diff --git a/.github/workflows/test_destination_snowflake.yml b/.github/workflows/test_destination_snowflake.yml index afc4263daf..979ea3e917 100644 --- a/.github/workflows/test_destination_snowflake.yml +++ b/.github/workflows/test_destination_snowflake.yml @@ -9,17 +9,7 @@ on: workflow_dispatch: env: - DESTINATION__SNOWFLAKE__CREDENTIALS: snowflake://loader@kgiotue-wn98412/dlt_data?warehouse=COMPUTE_WH&role=DLT_LOADER_ROLE - CREDENTIALS__PASSWORD: ${{ secrets.PG_PASSWORD }} - - # needed for snowflake staging tests - DESTINATION__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID: AKIAT4QMVMC4LGORLZOK - DESTINATION__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - DESTINATION__FILESYSTEM__CREDENTIALS__PROJECT_ID: chat-analytics-rasa-ci - DESTINATION__FILESYSTEM__CREDENTIALS__CLIENT_EMAIL: chat-analytics-loader@chat-analytics-rasa-ci.iam.gserviceaccount.com - DESTINATION__FILESYSTEM__CREDENTIALS__PRIVATE_KEY: ${{ secrets.BQ_CRED_PRIVATE_KEY }} - DESTINATION__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME: dltdata - DESTINATION__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY: ${{ secrets.AZURE_STORAGE_ACCOUNT_KEY }} + DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752 RUNTIME__LOG_LEVEL: ERROR @@ -73,6 +63,9 @@ jobs: - 
name: Install dependencies run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + - name: create secrets.toml + run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml + - run: | poetry run pytest tests/load if: runner.os != 'Windows' diff --git a/.github/workflows/test_destination_synapse.yml b/.github/workflows/test_destination_synapse.yml index d0f364c382..ecd890d32a 100644 --- a/.github/workflows/test_destination_synapse.yml +++ b/.github/workflows/test_destination_synapse.yml @@ -5,12 +5,10 @@ on: branches: - master - devel - workflow_dispatch: env: - DESTINATION__SYNAPSE__CREDENTIALS: ${{ secrets.SYNAPSE_CREDENTIALS }} - DESTINATION__SYNAPSE__CREDENTIALS__PASSWORD: ${{ secrets.SYNAPSE_PASSWORD }} + DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} RUNTIME__SENTRY_DSN: https://cf6086f7d263462088b9fb9f9947caee@o4505514867163136.ingest.sentry.io/4505516212682752 RUNTIME__LOG_LEVEL: ERROR @@ -19,19 +17,14 @@ env: ALL_FILESYSTEM_DRIVERS: "[\"memory\"]" jobs: - - build: - runs-on: ubuntu-latest - - steps: - - name: Check source branch name - run: | - if [[ "${{ github.head_ref }}" != "synapse" ]]; then - exit 1 - fi + get_docs_changes: + uses: ./.github/workflows/get_docs_changes.yml + if: ${{ !github.event.pull_request.head.repo.fork }} run_loader: name: Tests Synapse loader + needs: get_docs_changes + if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' strategy: fail-fast: false matrix: @@ -70,14 +63,17 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E synapse -E s3 -E gs -E az --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E synapse -E parquet --with sentry-sdk --with pipeline + + - name: create secrets.toml + run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml - run: | - poetry run pytest tests/load --ignore tests/load/pipeline/test_dbt_helper.py + poetry run pytest tests/load if: runner.os != 'Windows' name: Run tests Linux/MAC - run: | - poetry run pytest tests/load --ignore tests/load/pipeline/test_dbt_helper.py + poetry run pytest tests/load if: runner.os == 'Windows' name: Run tests Windows shell: cmd diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index 6aaf440dcd..c60d870b05 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -9,30 +9,14 @@ on: workflow_dispatch: env: - DESTINATION__POSTGRES__CREDENTIALS: postgresql://loader@dlttests.cwz0jfxu0m7m.eu-central-1.rds.amazonaws.com:5432/dlt_data - DESTINATION__DUCKDB__CREDENTIALS: duckdb:///_storage/test_quack.duckdb - DESTINATION__REDSHIFT__CREDENTIALS: postgresql://loader@3.73.90.3:5439/dlt_ci - DESTINATION__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID: AKIAT4QMVMC4LGORLZOK - DESTINATION__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - DESTINATION__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME: dltdata - DESTINATION__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY: ${{ secrets.AZURE_STORAGE_ACCOUNT_KEY }} + + DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} # For s3 compatible tests TESTS__R2_AWS_ACCESS_KEY_ID: a4950a5003b26f5a71ac97ef3848ff4c TESTS__R2_AWS_SECRET_ACCESS_KEY: ${{ secrets.CLOUDFLARE_R2_SECRET_ACCESS_KEY }} TESTS__R2_ENDPOINT_URL: https://9830548e4e4b582989be0811f2a0a97f.r2.cloudflarestorage.com - # 
DESTINATION__ATHENA__CREDENTIALS__AWS_ACCESS_KEY_ID: AKIAT4QMVMC4LGORLZOK - # DESTINATION__ATHENA__CREDENTIALS__AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - # DESTINATION__ATHENA__CREDENTIALS__REGION_NAME: eu-central-1 - # DESTINATION__ATHENA__QUERY_RESULT_BUCKET: s3://dlt-athena-output - - # password is the same so it will be shared - CREDENTIALS__PROJECT_ID: chat-analytics-rasa-ci - CREDENTIALS__CLIENT_EMAIL: chat-analytics-loader@chat-analytics-rasa-ci.iam.gserviceaccount.com - CREDENTIALS__PRIVATE_KEY: ${{ secrets.BQ_CRED_PRIVATE_KEY }} - CREDENTIALS__PASSWORD: ${{ secrets.PG_PASSWORD }} - RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752 RUNTIME__LOG_LEVEL: ERROR RUNTIME__DLTHUB_TELEMETRY_SEGMENT_WRITE_KEY: TLJiyRkGVZGCi2TtjClamXpFcxAA1rSB @@ -89,8 +73,8 @@ jobs: # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli --with sentry-sdk --with pipeline - # - name: Install self - # run: poetry install --no-interaction + - name: create secrets.toml + run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml - run: | poetry run pytest tests/load diff --git a/.github/workflows/test_doc_snippets.yml b/.github/workflows/test_doc_snippets.yml index 29e7db0c4d..80b77ce6c9 100644 --- a/.github/workflows/test_doc_snippets.yml +++ b/.github/workflows/test_doc_snippets.yml @@ -11,22 +11,13 @@ on: env: DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} - DESTINATION__DUCKDB__CREDENTIALS: duckdb:///_storage/test_quack.duckdb - RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752 RUNTIME__LOG_LEVEL: ERROR RUNTIME__DLTHUB_TELEMETRY_SEGMENT_WRITE_KEY: TLJiyRkGVZGCi2TtjClamXpFcxAA1rSB - DESTINATION__WEAVIATE__VECTORIZER: text2vec-contextionary - DESTINATION__WEAVIATE__MODULE_CONFIG: "{\"text2vec-contextionary\": {\"vectorizeClassName\": false, \"vectorizePropertyName\": true}}" - # Slack hook for chess in production example RUNTIME__SLACK_INCOMING_HOOK: ${{ secrets.RUNTIME__SLACK_INCOMING_HOOK }} - # Qdrant credentials - DESTINATION__QDRANT__CREDENTIALS__LOCATION: ${{ secrets.DESTINATION__QDRANT__CREDENTIALS__LOCATION }} - DESTINATION__QDRANT__CREDENTIALS__API_KEY: ${{ secrets.DESTINATION__QDRANT__CREDENTIALS__API_KEY }} - jobs: run_lint: diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 42c3c2d13a..a02957b69d 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -11,7 +11,7 @@ on: workflow_dispatch: env: - DESTINATION__DUCKDB__CREDENTIALS: duckdb:///_storage/test_quack.duckdb + DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752 RUNTIME__LOG_LEVEL: ERROR @@ -19,9 +19,6 @@ env: ACTIVE_DESTINATIONS: "[\"duckdb\", \"postgres\", \"filesystem\", \"weaviate\"]" ALL_FILESYSTEM_DRIVERS: "[\"memory\", \"file\"]" - DESTINATION__WEAVIATE__VECTORIZER: text2vec-contextionary - DESTINATION__WEAVIATE__MODULE_CONFIG: "{\"text2vec-contextionary\": {\"vectorizeClassName\": false, \"vectorizePropertyName\": true}}" - jobs: get_docs_changes: uses: ./.github/workflows/get_docs_changes.yml @@ -86,6 +83,9 @@ jobs: - name: Install dependencies run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate --with sentry-sdk --with 
pipeline + - name: create secrets.toml + run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml + - run: poetry run pytest tests/load && poetry run pytest tests/cli name: Run tests Linux env: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 29601f50ce..27156461de 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -105,7 +105,7 @@ Before publishing a new release, make sure to bump the project's version accordi 2. Use `poetry version patch` to increase the **patch** version 3. Run `make build-library` to apply the changes to the project. 4. Create a new branch, and submit the PR to **devel**. Go through standard process to merge it. -5. Create a merge PR from `devel` to `master` and merge it. +5. Create a merge PR from `devel` to `master` and merge it with a merge commit. ### Hotfix release 1. Check out the **master** branch diff --git a/Makefile b/Makefile index c1cb9bec98..bd425b0e42 100644 --- a/Makefile +++ b/Makefile @@ -94,3 +94,4 @@ test-build-images: build-library docker build -f deploy/dlt/Dockerfile.airflow --build-arg=COMMIT_SHA="$(shell git log -1 --pretty=%h)" --build-arg=IMAGE_VERSION="$(shell poetry version -s)" . docker build -f deploy/dlt/Dockerfile --build-arg=COMMIT_SHA="$(shell git log -1 --pretty=%h)" --build-arg=IMAGE_VERSION="$(shell poetry version -s)" . + diff --git a/dlt/cli/_dlt.py b/dlt/cli/_dlt.py index 1d5f7ce932..9894227046 100644 --- a/dlt/cli/_dlt.py +++ b/dlt/cli/_dlt.py @@ -498,7 +498,10 @@ def main() -> int: ) pipe_cmd_schema = pipeline_subparsers.add_parser("schema", help="Displays default schema") pipe_cmd_schema.add_argument( - "--format", choices=["json", "yaml"], default="yaml", help="Display schema in this format" + "--format", + choices=["json", "yaml"], + default="yaml", + help="Display schema in this format", ) pipe_cmd_schema.add_argument( "--remove-defaults", action="store_true", help="Does not show default hint values" diff --git a/dlt/cli/pipeline_command.py b/dlt/cli/pipeline_command.py index 91f64763d3..9981fa8493 100644 --- a/dlt/cli/pipeline_command.py +++ b/dlt/cli/pipeline_command.py @@ -263,7 +263,13 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: fmt.warning("Pipeline does not have a default schema") else: fmt.echo("Found schema with name %s" % fmt.bold(p.default_schema_name)) - schema_str = p.default_schema.to_pretty_yaml(remove_defaults=True) + format_ = command_kwargs.get("format") + remove_defaults_ = command_kwargs.get("remove_defaults") + s = p.default_schema + if format_ == "json": + schema_str = json.dumps(s.to_dict(remove_defaults=remove_defaults_), pretty=True) + else: + schema_str = s.to_pretty_yaml(remove_defaults=remove_defaults_) fmt.echo(schema_str) if operation == "drop": diff --git a/dlt/common/configuration/specs/aws_credentials.py b/dlt/common/configuration/specs/aws_credentials.py index f6df1d8cce..ee7360e2cb 100644 --- a/dlt/common/configuration/specs/aws_credentials.py +++ b/dlt/common/configuration/specs/aws_credentials.py @@ -38,6 +38,13 @@ def to_native_representation(self) -> Dict[str, Optional[str]]: """Return a dict that can be passed as kwargs to boto3 session""" return dict(self) + def to_session_credentials(self) -> Dict[str, str]: + return dict( + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + aws_session_token=self.aws_session_token, + ) + @configspec class AwsCredentials(AwsCredentialsWithoutDefaults, CredentialsWithDefault): @@ -47,6 +54,23 @@ def on_partial(self) -> None: if self._from_session(session) and not 
self.is_partial(): self.resolve() + def to_session_credentials(self) -> Dict[str, str]: + """Return configured or new aws session token""" + if self.aws_session_token and self.aws_access_key_id and self.aws_secret_access_key: + return dict( + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + aws_session_token=self.aws_session_token, + ) + sess = self._to_botocore_session() + client = sess.create_client("sts") + token = client.get_session_token() + return dict( + aws_access_key_id=token["Credentials"]["AccessKeyId"], + aws_secret_access_key=token["Credentials"]["SecretAccessKey"], + aws_session_token=token["Credentials"]["SessionToken"], + ) + def _to_botocore_session(self) -> Any: try: import botocore.session diff --git a/dlt/common/configuration/specs/gcp_credentials.py b/dlt/common/configuration/specs/gcp_credentials.py index f00ec62651..431f35c8d0 100644 --- a/dlt/common/configuration/specs/gcp_credentials.py +++ b/dlt/common/configuration/specs/gcp_credentials.py @@ -1,5 +1,5 @@ import sys -from typing import Any, Final, List, Tuple, Union +from typing import Any, Final, List, Tuple, Union, Dict from dlt.common import json, pendulum from dlt.common.configuration.specs.api_credentials import OAuth2Credentials @@ -27,7 +27,9 @@ class GcpCredentials(CredentialsConfiguration): project_id: str = None - location: str = ( # DEPRECATED! and present only for backward compatibility. please set bigquery location in BigQuery configuration + location: ( + str + ) = ( # DEPRECATED! and present only for backward compatibility. please set bigquery location in BigQuery configuration "US" ) @@ -48,6 +50,20 @@ def _from_info_dict(self, info: StrAny) -> None: def __str__(self) -> str: return f"{self.project_id}" + def to_gcs_credentials(self) -> Dict[str, Any]: + """ + Dict of keyword arguments can be passed to gcsfs. + Delegates default GCS credential handling to gcsfs. + """ + return { + "project": self.project_id, + "token": ( + None + if isinstance(self, CredentialsWithDefault) and self.has_default_credentials() + else dict(self) + ), + } + @configspec class GcpServiceAccountCredentialsWithoutDefaults(GcpCredentials): diff --git a/dlt/common/data_writers/escape.py b/dlt/common/data_writers/escape.py index 5bf8f29ccb..5460657253 100644 --- a/dlt/common/data_writers/escape.py +++ b/dlt/common/data_writers/escape.py @@ -98,8 +98,14 @@ def escape_mssql_literal(v: Any) -> Any: json.dumps(v), prefix="N'", escape_dict=MS_SQL_ESCAPE_DICT, escape_re=MS_SQL_ESCAPE_RE ) if isinstance(v, bytes): - base_64_string = base64.b64encode(v).decode("ascii") - return f"""CAST('' AS XML).value('xs:base64Binary("{base_64_string}")', 'VARBINARY(MAX)')""" + from dlt.destinations.impl.mssql.mssql import VARBINARY_MAX_N + + if len(v) <= VARBINARY_MAX_N: + n = str(len(v)) + else: + n = "MAX" + return f"CONVERT(VARBINARY({n}), '{v.hex()}', 2)" + if isinstance(v, bool): return str(int(v)) if v is None: @@ -124,3 +130,24 @@ def escape_snowflake_identifier(v: str) -> str: # Snowcase uppercase all identifiers unless quoted. 
Match this here so queries on information schema work without issue # See also https://docs.snowflake.com/en/sql-reference/identifiers-syntax#double-quoted-identifiers return escape_postgres_identifier(v.upper()) + + +escape_databricks_identifier = escape_bigquery_identifier + + +DATABRICKS_ESCAPE_DICT = {"'": "\\'", "\\": "\\\\", "\n": "\\n", "\r": "\\r"} + + +def escape_databricks_literal(v: Any) -> Any: + if isinstance(v, str): + return _escape_extended(v, prefix="'", escape_dict=DATABRICKS_ESCAPE_DICT) + if isinstance(v, (datetime, date, time)): + return f"'{v.isoformat()}'" + if isinstance(v, (list, dict)): + return _escape_extended(json.dumps(v), prefix="'", escape_dict=DATABRICKS_ESCAPE_DICT) + if isinstance(v, bytes): + return f"X'{v.hex()}'" + if v is None: + return "NULL" + + return str(v) diff --git a/dlt/common/data_writers/writers.py b/dlt/common/data_writers/writers.py index 0f9ff09259..0f3640da1e 100644 --- a/dlt/common/data_writers/writers.py +++ b/dlt/common/data_writers/writers.py @@ -175,18 +175,29 @@ def write_header(self, columns_schema: TTableSchemaColumns) -> None: # do not write INSERT INTO command, this must be added together with table name by the loader self._f.write("INSERT INTO {}(") self._f.write(",".join(map(self._caps.escape_identifier, headers))) - self._f.write(")\nVALUES\n") + if self._caps.insert_values_writer_type == "default": + self._f.write(")\nVALUES\n") + elif self._caps.insert_values_writer_type == "select_union": + self._f.write(")\n") def write_data(self, rows: Sequence[Any]) -> None: super().write_data(rows) - def write_row(row: StrAny) -> None: + def write_row(row: StrAny, last_row: bool = False) -> None: output = ["NULL"] * len(self._headers_lookup) for n, v in row.items(): output[self._headers_lookup[n]] = self._caps.escape_literal(v) - self._f.write("(") - self._f.write(",".join(output)) - self._f.write(")") + if self._caps.insert_values_writer_type == "default": + self._f.write("(") + self._f.write(",".join(output)) + self._f.write(")") + if not last_row: + self._f.write(",\n") + elif self._caps.insert_values_writer_type == "select_union": + self._f.write("SELECT ") + self._f.write(",".join(output)) + if not last_row: + self._f.write("\nUNION ALL\n") # if next chunk add separator if self._chunks_written > 0: @@ -195,10 +206,9 @@ def write_row(row: StrAny) -> None: # write rows for row in rows[:-1]: write_row(row) - self._f.write(",\n") # write last row without separator so we can write footer eventually - write_row(rows[-1]) + write_row(rows[-1], last_row=True) self._chunks_written += 1 def write_footer(self) -> None: diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index 2596b2bf99..a78a31fdf3 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -52,6 +52,10 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): schema_supports_numeric_precision: bool = True timestamp_precision: int = 6 max_rows_per_insert: Optional[int] = None + insert_values_writer_type: str = "default" + supports_multiple_statements: bool = True + supports_clone_table: bool = False + """Destination supports CREATE TABLE ... CLONE ... 
statements""" # do not allow to create default value, destination caps must be always explicitly inserted into container can_create_default: ClassVar[bool] = False @@ -77,4 +81,5 @@ def generic_capabilities( caps.is_max_text_data_type_length_in_bytes = True caps.supports_ddl_transactions = True caps.supports_transactions = True + caps.supports_multiple_statements = True return caps diff --git a/dlt/common/libs/pydantic.py b/dlt/common/libs/pydantic.py index 58829f0592..872f352178 100644 --- a/dlt/common/libs/pydantic.py +++ b/dlt/common/libs/pydantic.py @@ -14,10 +14,11 @@ ) from typing_extensions import Annotated, get_args, get_origin +from dlt.common.data_types import py_type_to_sc_type from dlt.common.exceptions import MissingDependencyException from dlt.common.schema import DataValidationError from dlt.common.schema.typing import TSchemaEvolutionMode, TTableSchemaColumns -from dlt.common.data_types import py_type_to_sc_type +from dlt.common.normalizers.naming.snake_case import NamingConvention as SnakeCaseNamingConvention from dlt.common.typing import ( TDataItem, TDataItems, @@ -52,6 +53,9 @@ _TPydanticModel = TypeVar("_TPydanticModel", bound=BaseModel) +snake_case_naming_convention = SnakeCaseNamingConvention() + + class ListModel(BaseModel, Generic[_TPydanticModel]): items: List[_TPydanticModel] @@ -71,7 +75,7 @@ class DltConfig(TypedDict, total=False): def pydantic_to_table_schema_columns( - model: Union[BaseModel, Type[BaseModel]] + model: Union[BaseModel, Type[BaseModel]], ) -> TTableSchemaColumns: """Convert a pydantic model to a table schema columns dict @@ -111,24 +115,47 @@ def pydantic_to_table_schema_columns( if is_list_generic_type(inner_type): inner_type = list - elif is_dict_generic_type(inner_type) or issubclass(inner_type, BaseModel): + elif is_dict_generic_type(inner_type): inner_type = dict + is_inner_type_pydantic_model = False name = field.alias or field_name try: data_type = py_type_to_sc_type(inner_type) except TypeError: - # try to coerce unknown type to text - data_type = "text" - - if data_type == "complex" and skip_complex_types: + if issubclass(inner_type, BaseModel): + data_type = "complex" + is_inner_type_pydantic_model = True + else: + # try to coerce unknown type to text + data_type = "text" + + if is_inner_type_pydantic_model and not skip_complex_types: + result[name] = { + "name": name, + "data_type": "complex", + "nullable": nullable, + } + elif is_inner_type_pydantic_model: + # This case is for a single field schema/model + # we need to generate snake_case field names + # and return flattened field schemas + schema_hints = pydantic_to_table_schema_columns(field.annotation) + + for field_name, hints in schema_hints.items(): + schema_key = snake_case_naming_convention.make_path(name, field_name) + result[schema_key] = { + **hints, + "name": snake_case_naming_convention.make_path(name, hints["name"]), + } + elif data_type == "complex" and skip_complex_types: continue - - result[name] = { - "name": name, - "data_type": data_type, - "nullable": nullable, - } + else: + result[name] = { + "name": name, + "data_type": data_type, + "nullable": nullable, + } return result @@ -261,7 +288,8 @@ def create_list_model( # TODO: use LenientList to create list model that automatically discards invalid items # https://github.com/pydantic/pydantic/issues/2274 and https://gist.github.com/dmontagu/7f0cef76e5e0e04198dd608ad7219573 return create_model( - "List" + __name__, items=(List[model], ...) 
# type: ignore[return-value,valid-type] ) diff --git a/dlt/common/runtime/collector.py b/dlt/common/runtime/collector.py index eec379564c..e478d713b2 100644 --- a/dlt/common/runtime/collector.py +++ b/dlt/common/runtime/collector.py @@ -194,8 +194,8 @@ def dump_counters(self) -> None: elapsed_time = current_time - info.start_time items_per_second = (count / elapsed_time) if elapsed_time > 0 else 0 - progress = f"{count}/{info.total}" if info.total is not None else f"{count}" - percentage = f"({count / info.total * 100:.1f}%)" if info.total is not None else "" + progress = f"{count}/{info.total}" if info.total else f"{count}" + percentage = f"({count / info.total * 100:.1f}%)" if info.total else "" elapsed_time_str = f"{elapsed_time:.2f}s" items_per_second_str = f"{items_per_second:.2f}/s" message = f"[{self.messages[name]}]" if self.messages[name] is not None else "" diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index e95699b91e..ccfc038085 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -546,12 +546,20 @@ def data_tables(self, include_incomplete: bool = False) -> List[TTableSchema]: ) ] + def data_table_names(self) -> List[str]: + """Returns list of data table names. Excludes dlt table names.""" + return [t["name"] for t in self.data_tables()] + def dlt_tables(self) -> List[TTableSchema]: """Gets dlt tables""" return [ t for t in self._schema_tables.values() if t["name"].startswith(self._dlt_tables_prefix) ] + def dlt_table_names(self) -> List[str]: + """Returns list of dlt table names.""" + return [t["name"] for t in self.dlt_tables()] + def get_preferred_type(self, col_name: str) -> Optional[TDataType]: return next((m[1] for m in self._compiled_preferred_types if m[0].search(col_name)), None) diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 9a27cbe4bb..e1ff17115d 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -7,6 +7,7 @@ Optional, Sequence, Set, + Tuple, Type, TypedDict, NewType, diff --git a/dlt/common/source.py b/dlt/common/source.py index 249d54b4c5..ea2a25f1d7 100644 --- a/dlt/common/source.py +++ b/dlt/common/source.py @@ -34,7 +34,10 @@ def unset_current_pipe_name() -> None: def get_current_pipe_name() -> str: - """Gets pipe name associated with current thread""" + """When executed from within a dlt.resource decorated function, gets pipe name associated with current thread. + + Pipe name is the same as resource name for all currently known cases. In some multithreading cases, pipe name may not be available. 
+ """ name = _CURRENT_PIPE_NAME.get(threading.get_ident()) if name is None: raise ResourceNameNotAvailable() diff --git a/dlt/common/storages/configuration.py b/dlt/common/storages/configuration.py index 83e7e88189..2279698dda 100644 --- a/dlt/common/storages/configuration.py +++ b/dlt/common/storages/configuration.py @@ -1,9 +1,10 @@ import os -from urllib.parse import urlparse from typing import TYPE_CHECKING, Any, Literal, Optional, Type, get_args, ClassVar, Dict, Union +from urllib.parse import urlparse -from dlt.common.configuration.specs import BaseConfiguration, configspec, CredentialsConfiguration from dlt.common.configuration import configspec, resolve_type +from dlt.common.configuration.exceptions import ConfigurationValueError +from dlt.common.configuration.specs import CredentialsConfiguration from dlt.common.configuration.specs import ( GcpServiceAccountCredentials, AwsCredentials, @@ -12,8 +13,9 @@ AzureCredentialsWithoutDefaults, BaseConfiguration, ) +from dlt.common.typing import DictStrAny from dlt.common.utils import digest128 -from dlt.common.configuration.exceptions import ConfigurationValueError + TSchemaFileFormat = Literal["json", "yaml"] SchemaFileExtensions = get_args(TSchemaFileFormat) @@ -92,9 +94,13 @@ class FilesystemConfiguration(BaseConfiguration): } bucket_url: str = None - # should be an union of all possible credentials as found in PROTOCOL_CREDENTIALS + + # should be a union of all possible credentials as found in PROTOCOL_CREDENTIALS credentials: FileSystemCredentials + kwargs: Optional[DictStrAny] = None + client_kwargs: Optional[DictStrAny] = None + @property def protocol(self) -> str: """`bucket_url` protocol""" @@ -112,7 +118,7 @@ def on_resolved(self) -> None: "File path or netloc missing. Field bucket_url of FilesystemClientConfiguration" " must contain valid url with a path or host:password component." ) - # this is just a path in local file system + # this is just a path in a local file system if url.path == self.bucket_url: url = url._replace(scheme="file") self.bucket_url = url.geturl() @@ -124,9 +130,7 @@ def resolve_credentials_type(self) -> Type[CredentialsConfiguration]: def fingerprint(self) -> str: """Returns a fingerprint of bucket_url""" - if self.bucket_url: - return digest128(self.bucket_url) - return "" + return digest128(self.bucket_url) if self.bucket_url else "" def __str__(self) -> str: """Return displayable destination location""" @@ -141,4 +145,15 @@ def __str__(self) -> str: if TYPE_CHECKING: - def __init__(self, bucket_url: str, credentials: FileSystemCredentials = None) -> None: ... + def __init__( + self, + bucket_url: str, + credentials: FileSystemCredentials = None, + kwargs: Optional[DictStrAny] = None, + client_kwargs: Optional[DictStrAny] = None, + ) -> None: + self.bucket_url = bucket_url + self.credentials = credentials + self.kwargs = kwargs + self.client_kwargs = client_kwargs + ... 
diff --git a/dlt/common/storages/fsspec_filesystem.py b/dlt/common/storages/fsspec_filesystem.py index 18c1837e00..865c728a84 100644 --- a/dlt/common/storages/fsspec_filesystem.py +++ b/dlt/common/storages/fsspec_filesystem.py @@ -1,27 +1,39 @@ import io +import gzip import mimetypes -import posixpath import pathlib -from urllib.parse import urlparse +import posixpath from io import BytesIO -from typing import cast, Tuple, TypedDict, Optional, Union, Iterator, Any, IO +from typing import ( + Literal, + cast, + Tuple, + TypedDict, + Optional, + Union, + Iterator, + Any, + IO, + Dict, + Callable, + Sequence, +) +from urllib.parse import urlparse -from fsspec.core import url_to_fs from fsspec import AbstractFileSystem +from fsspec.core import url_to_fs +from dlt import version from dlt.common import pendulum -from dlt.common.exceptions import MissingDependencyException -from dlt.common.time import ensure_pendulum_datetime -from dlt.common.typing import DictStrAny from dlt.common.configuration.specs import ( - CredentialsWithDefault, GcpCredentials, AwsCredentials, AzureCredentials, ) +from dlt.common.exceptions import MissingDependencyException from dlt.common.storages.configuration import FileSystemCredentials, FilesystemConfiguration - -from dlt import version +from dlt.common.time import ensure_pendulum_datetime +from dlt.common.typing import DictStrAny class FileItem(TypedDict, total=False): @@ -30,6 +42,7 @@ class FileItem(TypedDict, total=False): file_url: str file_name: str mime_type: str + encoding: Optional[str] modification_date: pendulum.DateTime size_in_bytes: int file_content: Optional[bytes] @@ -50,20 +63,62 @@ class FileItem(TypedDict, total=False): MTIME_DISPATCH["s3a"] = MTIME_DISPATCH["s3"] MTIME_DISPATCH["abfs"] = MTIME_DISPATCH["az"] +# Map of protocol to a filesystem type +CREDENTIALS_DISPATCH: Dict[str, Callable[[FilesystemConfiguration], DictStrAny]] = { + "s3": lambda config: cast(AwsCredentials, config.credentials).to_s3fs_credentials(), + "adl": lambda config: cast(AzureCredentials, config.credentials).to_adlfs_credentials(), + "az": lambda config: cast(AzureCredentials, config.credentials).to_adlfs_credentials(), + "gcs": lambda config: cast(GcpCredentials, config.credentials).to_gcs_credentials(), + "gs": lambda config: cast(GcpCredentials, config.credentials).to_gcs_credentials(), + "abfs": lambda config: cast(AzureCredentials, config.credentials).to_adlfs_credentials(), + "azure": lambda config: cast(AzureCredentials, config.credentials).to_adlfs_credentials(), +} + def fsspec_filesystem( - protocol: str, credentials: FileSystemCredentials = None + protocol: str, + credentials: FileSystemCredentials = None, + kwargs: Optional[DictStrAny] = None, + client_kwargs: Optional[DictStrAny] = None, ) -> Tuple[AbstractFileSystem, str]: """Instantiates an authenticated fsspec `FileSystem` for a given `protocol` and credentials. - Please supply credentials instance corresponding to the protocol. The `protocol` is just the code name of the filesystem ie: + Please supply credentials instance corresponding to the protocol. + The `protocol` is just the code name of the filesystem i.e.: * s3 * az, abfs * gcs, gs also see filesystem_from_config """ - return fsspec_from_config(FilesystemConfiguration(protocol, credentials)) + return fsspec_from_config( + FilesystemConfiguration(protocol, credentials, kwargs=kwargs, client_kwargs=client_kwargs) + ) + + +def prepare_fsspec_args(config: FilesystemConfiguration) -> DictStrAny: + """Prepare arguments for fsspec filesystem constructor. 
+ + Args: + config (FilesystemConfiguration): The filesystem configuration. + + Returns: + DictStrAny: The arguments for the fsspec filesystem constructor. + """ + proto = config.protocol + fs_kwargs: DictStrAny = {"use_listings_cache": False} + credentials = CREDENTIALS_DISPATCH.get(proto, lambda _: {})(config) + + if config.kwargs is not None: + fs_kwargs.update(config.kwargs) + if config.client_kwargs is not None: + fs_kwargs["client_kwargs"] = config.client_kwargs + + if "client_kwargs" in fs_kwargs and "client_kwargs" in credentials: + fs_kwargs["client_kwargs"].update(credentials.pop("client_kwargs")) + + fs_kwargs.update(credentials) + return fs_kwargs def fsspec_from_config(config: FilesystemConfiguration) -> Tuple[AbstractFileSystem, str]: @@ -77,29 +132,14 @@ def fsspec_from_config(config: FilesystemConfiguration) -> Tuple[AbstractFileSys All other filesystems are not authenticated Returns: (fsspec filesystem, normalized url) - """ - proto = config.protocol - fs_kwargs: DictStrAny = {} - if proto == "s3": - fs_kwargs.update(cast(AwsCredentials, config.credentials).to_s3fs_credentials()) - elif proto in ["az", "abfs", "adl", "azure"]: - fs_kwargs.update(cast(AzureCredentials, config.credentials).to_adlfs_credentials()) - elif proto in ["gcs", "gs"]: - assert isinstance(config.credentials, GcpCredentials) - # Default credentials are handled by gcsfs - if ( - isinstance(config.credentials, CredentialsWithDefault) - and config.credentials.has_default_credentials() - ): - fs_kwargs["token"] = None - else: - fs_kwargs["token"] = dict(config.credentials) - fs_kwargs["project"] = config.credentials.project_id + fs_kwargs = prepare_fsspec_args(config) try: - return url_to_fs(config.bucket_url, use_listings_cache=False, **fs_kwargs) # type: ignore[no-any-return] + return url_to_fs(config.bucket_url, **fs_kwargs) # type: ignore except ModuleNotFoundError as e: - raise MissingDependencyException("filesystem", [f"{version.DLT_PKG_NAME}[{proto}]"]) from e + raise MissingDependencyException( + "filesystem", [f"{version.DLT_PKG_NAME}[{config.protocol}]"] + ) from e class FileItemDict(DictStrAny): @@ -122,7 +162,7 @@ def __init__( @property def fsspec(self) -> AbstractFileSystem: - """The filesystem client based on the given credentials. + """The filesystem client is based on the given credentials. Returns: AbstractFileSystem: The fsspec client. @@ -132,36 +172,62 @@ def fsspec(self) -> AbstractFileSystem: else: return fsspec_filesystem(self["file_url"], self.credentials)[0] - def open(self, mode: str = "rb", **kwargs: Any) -> IO[Any]: # noqa: A003 + def open( # noqa: A003 + self, + mode: str = "rb", + compression: Literal["auto", "disable", "enable"] = "auto", + **kwargs: Any, + ) -> IO[Any]: """Open the file as a fsspec file. This method opens the file represented by this dictionary as a file-like object using the fsspec library. Args: + mode (Optional[str]): Open mode. + compression (Optional[str]): A flag to enable/disable compression. + Can have one of three values: "disable" - no compression applied, + "enable" - gzip compression applied, "auto" (default) - + compression applied only for files compressed with gzip. **kwargs (Any): The arguments to pass to the fsspec open function. Returns: IOBase: The fsspec file. 
""" + if compression == "auto": + compression_arg = "gzip" if self["encoding"] == "gzip" else None + elif compression == "enable": + compression_arg = "gzip" + elif compression == "disable": + compression_arg = None + else: + raise ValueError("""The argument `compression` must have one of the following values: + "auto", "enable", "disable".""") + opened_file: IO[Any] - # if the user has already extracted the content, we use it so there will be no need to + # if the user has already extracted the content, we use it so there is no need to # download the file again. if "file_content" in self: - bytes_io = BytesIO(self["file_content"]) - - if "t" in mode: - text_kwargs = { - k: kwargs.pop(k) for k in ["encoding", "errors", "newline"] if k in kwargs - } - return io.TextIOWrapper( - bytes_io, - **text_kwargs, - ) - else: + content = ( + gzip.decompress(self["file_content"]) + if compression_arg == "gzip" + else self["file_content"] + ) + bytes_io = BytesIO(content) + + if "t" not in mode: return bytes_io + text_kwargs = { + k: kwargs.pop(k) for k in ["encoding", "errors", "newline"] if k in kwargs + } + return io.TextIOWrapper( + bytes_io, + **text_kwargs, + ) else: - opened_file = self.fsspec.open(self["file_url"], mode=mode, **kwargs) + opened_file = self.fsspec.open( + self["file_url"], mode=mode, compression=compression_arg, **kwargs + ) return opened_file def read_bytes(self) -> bytes: @@ -170,20 +236,20 @@ def read_bytes(self) -> bytes: Returns: bytes: The file content. """ - content: bytes - # same as open, if the user has already extracted the content, we use it. - if "file_content" in self and self["file_content"] is not None: - content = self["file_content"] - else: - content = self.fsspec.read_bytes(self["file_url"]) - return content + return ( # type: ignore + self["file_content"] + if "file_content" in self and self["file_content"] is not None + else self.fsspec.read_bytes(self["file_url"]) + ) + + +def guess_mime_type(file_name: str) -> Sequence[str]: + type_ = list(mimetypes.guess_type(posixpath.basename(file_name), strict=False)) + if not type_[0]: + type_[0] = "application/" + (posixpath.splitext(file_name)[1][1:] or "octet-stream") -def guess_mime_type(file_name: str) -> str: - mime_type = mimetypes.guess_type(posixpath.basename(file_name), strict=False)[0] - if not mime_type: - mime_type = "application/" + (posixpath.splitext(file_name)[1][1:] or "octet-stream") - return mime_type + return type_ def glob_files( @@ -202,7 +268,7 @@ def glob_files( import os bucket_url_parsed = urlparse(bucket_url) - # if this is file path without scheme + # if this is a file path without a scheme if not bucket_url_parsed.scheme or (os.path.isabs(bucket_url) and "\\" in bucket_url): # this is a file so create a proper file url bucket_url = pathlib.Path(bucket_url).absolute().as_uri() @@ -215,22 +281,26 @@ def glob_files( glob_result = fs_client.glob(filter_url, detail=True) if isinstance(glob_result, list): raise NotImplementedError( - "Cannot request details when using fsspec.glob. For ADSL (Azure) please use version" + "Cannot request details when using fsspec.glob. 
For adlfs (Azure) please use version" " 2023.9.0 or later" ) for file, md in glob_result.items(): if md["type"] != "file": continue + # make that absolute path on a file:// if bucket_url_parsed.scheme == "file" and not file.startswith("/"): - file = "/" + file + file = f"/{file}" file_name = posixpath.relpath(file, bucket_path) - file_url = bucket_url_parsed.scheme + "://" + file + file_url = f"{bucket_url_parsed.scheme}://{file}" + + mime_type, encoding = guess_mime_type(file_name) yield FileItem( file_name=file_name, file_url=file_url, - mime_type=guess_mime_type(file_name), + mime_type=mime_type, + encoding=encoding, modification_date=MTIME_DISPATCH[bucket_url_parsed.scheme](md), size_in_bytes=int(md["size"]), ) diff --git a/dlt/common/time.py b/dlt/common/time.py index ed390c28bf..4f4dd05ef0 100644 --- a/dlt/common/time.py +++ b/dlt/common/time.py @@ -138,6 +138,43 @@ def ensure_pendulum_time(value: Union[str, datetime.time]) -> pendulum.Time: raise TypeError(f"Cannot coerce {value} to a pendulum.Time object.") +def to_py_datetime(value: datetime.datetime) -> datetime.datetime: + """Convert a pendulum.DateTime to a py datetime object. + + Args: + value: The value to convert. Can be a pendulum.DateTime or datetime. + + Returns: + A py datetime object + """ + if isinstance(value, pendulum.DateTime): + return datetime.datetime( + value.year, + value.month, + value.day, + value.hour, + value.minute, + value.second, + value.microsecond, + value.tzinfo, + ) + return value + + +def to_py_date(value: datetime.date) -> datetime.date: + """Convert a pendulum.Date to a py date object. + + Args: + value: The value to convert. Can be a pendulum.Date or date. + + Returns: + A py date object + """ + if isinstance(value, pendulum.Date): + return datetime.date(value.year, value.month, value.day) + return value + + def _datetime_from_ts_or_iso( value: Union[int, float, str] ) -> Union[pendulum.DateTime, pendulum.Date, pendulum.Time]: diff --git a/dlt/destinations/__init__.py b/dlt/destinations/__init__.py index 980c4ce7f2..c0a0b419c1 100644 --- a/dlt/destinations/__init__.py +++ b/dlt/destinations/__init__.py @@ -10,6 +10,8 @@ from dlt.destinations.impl.qdrant.factory import qdrant from dlt.destinations.impl.motherduck.factory import motherduck from dlt.destinations.impl.weaviate.factory import weaviate +from dlt.destinations.impl.synapse.factory import synapse +from dlt.destinations.impl.databricks.factory import databricks __all__ = [ @@ -25,4 +27,6 @@ "qdrant", "motherduck", "weaviate", + "synapse", + "databricks", ] diff --git a/dlt/destinations/adapters.py b/dlt/destinations/adapters.py index b8f12599dc..22c98d4f5a 100644 --- a/dlt/destinations/adapters.py +++ b/dlt/destinations/adapters.py @@ -2,5 +2,6 @@ from dlt.destinations.impl.weaviate import weaviate_adapter from dlt.destinations.impl.qdrant import qdrant_adapter +from dlt.destinations.impl.synapse import synapse_adapter -__all__ = ["weaviate_adapter", "qdrant_adapter"] +__all__ = ["weaviate_adapter", "qdrant_adapter", "synapse_adapter"] diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 4837f0dbdf..91525d771c 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -232,7 +232,7 @@ def drop_tables(self, *tables: str) -> None: statements = [ f"DROP TABLE IF EXISTS {self.make_qualified_ddl_table_name(table)};" for table in tables ] - self.execute_fragments(statements) + self.execute_many(statements) @contextmanager @raise_database_error @@ -351,9 +351,7 
@@ def _from_db_type( return self.type_mapper.from_db_type(hive_t, precision, scale) def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: - return ( - f"{self.sql_client.escape_ddl_identifier(c['name'])} {self.type_mapper.to_db_type(c, table_format)}" - ) + return f"{self.sql_client.escape_ddl_identifier(c['name'])} {self.type_mapper.to_db_type(c, table_format)}" def _get_table_update_sql( self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool @@ -378,15 +376,19 @@ def _get_table_update_sql( # use qualified table names qualified_table_name = self.sql_client.make_qualified_ddl_table_name(table_name) if is_iceberg and not generate_alter: - sql.append(f"""CREATE TABLE {qualified_table_name} + sql.append( + f"""CREATE TABLE {qualified_table_name} ({columns}) LOCATION '{location}' - TBLPROPERTIES ('table_type'='ICEBERG', 'format'='parquet');""") + TBLPROPERTIES ('table_type'='ICEBERG', 'format'='parquet');""" + ) elif not generate_alter: - sql.append(f"""CREATE EXTERNAL TABLE {qualified_table_name} + sql.append( + f"""CREATE EXTERNAL TABLE {qualified_table_name} ({columns}) STORED AS PARQUET - LOCATION '{location}';""") + LOCATION '{location}';""" + ) # alter table to add new columns at the end else: sql.append(f"""ALTER TABLE {qualified_table_name} ADD COLUMNS ({columns});""") diff --git a/dlt/destinations/impl/bigquery/README.md b/dlt/destinations/impl/bigquery/README.md index 47c54a690a..d949323a5b 100644 --- a/dlt/destinations/impl/bigquery/README.md +++ b/dlt/destinations/impl/bigquery/README.md @@ -1,6 +1,6 @@ # Loader account setup -1. Create new services account, add private key to it and download the `services.json` file +1. Create a new service account, add a private key to it and download the `services.json` file 2. Make sure that this newly created account has access to BigQuery API -3. You must add following roles to the account above: `BigQuery Data Editor`, `BigQuey Job User` and `BigQuery Read Session User` (storage API) +3. You must add the following roles to the account above: `BigQuery Data Editor`, `BigQuery Job User` and `BigQuery Read Session User` (storage API) 4.
IAM to add roles is here https://console.cloud.google.com/iam-admin/iam?project=chat-analytics-rasa-ci \ No newline at end of file diff --git a/dlt/destinations/impl/bigquery/__init__.py b/dlt/destinations/impl/bigquery/__init__.py index 1304bd72bb..6d1491817a 100644 --- a/dlt/destinations/impl/bigquery/__init__.py +++ b/dlt/destinations/impl/bigquery/__init__.py @@ -20,5 +20,6 @@ def capabilities() -> DestinationCapabilitiesContext: caps.max_text_data_type_length = 10 * 1024 * 1024 caps.is_max_text_data_type_length_in_bytes = True caps.supports_ddl_transactions = False + caps.supports_clone_table = True return caps diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index fa4f5f0419..1058b1d2c9 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -1,9 +1,10 @@ import os from pathlib import Path -from typing import ClassVar, Dict, Optional, Sequence, Tuple, List, cast, Type, Any +from typing import ClassVar, Optional, Sequence, Tuple, List, cast, Any + import google.cloud.bigquery as bigquery # noqa: I250 -from google.cloud import exceptions as gcp_exceptions from google.api_core import exceptions as api_core_exceptions +from google.cloud import exceptions as gcp_exceptions from dlt.common import json, logger from dlt.common.destination import DestinationCapabilitiesContext @@ -14,30 +15,27 @@ LoadJob, SupportsStagingDestination, ) -from dlt.common.data_types import TDataType -from dlt.common.storages.file_storage import FileStorage from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns -from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat from dlt.common.schema.exceptions import UnknownTableException - -from dlt.destinations.job_client_impl import SqlJobClientWithStaging +from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat +from dlt.common.schema.utils import table_schema_has_type +from dlt.common.storages.file_storage import FileStorage from dlt.destinations.exceptions import ( DestinationSchemaWillNotUpdate, DestinationTransientException, LoadJobNotExistsException, LoadJobTerminalException, ) - from dlt.destinations.impl.bigquery import capabilities from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration from dlt.destinations.impl.bigquery.sql_client import BigQuerySqlClient, BQ_TERMINAL_REASONS -from dlt.destinations.sql_jobs import SqlMergeJob, SqlStagingCopyJob, SqlJobParams +from dlt.destinations.job_client_impl import SqlJobClientWithStaging +from dlt.destinations.sql_jobs import SqlMergeJob, SqlJobParams from dlt.destinations.job_impl import NewReferenceJob from dlt.destinations.sql_client import SqlClientBase +from dlt.destinations.sql_jobs import SqlMergeJob, SqlStagingCopyJob, SqlJobParams from dlt.destinations.type_mapping import TypeMapper -from dlt.common.schema.utils import table_schema_has_type - class BigQueryTypeMapper(TypeMapper): sct_to_unbound_dbt = { @@ -49,7 +47,7 @@ class BigQueryTypeMapper(TypeMapper): "timestamp": "TIMESTAMP", "bigint": "INTEGER", "binary": "BYTES", - "wei": "BIGNUMERIC", # non parametrized should hold wei values + "wei": "BIGNUMERIC", # non-parametrized should hold wei values "time": "TIME", } @@ -73,12 +71,12 @@ class BigQueryTypeMapper(TypeMapper): "TIME": "time", } + # noinspection PyTypeChecker,PydanticTypeChecker def from_db_type( self, db_type: str, precision: Optional[int], scale: Optional[int] ) -> TColumnType: - if db_type == "BIGNUMERIC": 
- if precision is None: # biggest numeric possible - return dict(data_type="wei") + if db_type == "BIGNUMERIC" and precision is None: + return dict(data_type="wei") return super().from_db_type(db_type, precision, scale) @@ -96,29 +94,24 @@ def __init__( super().__init__(file_name) def state(self) -> TLoadJobState: - # check server if done - done = self.bq_load_job.done(retry=self.default_retry, timeout=self.http_timeout) - if done: - # rows processed - if self.bq_load_job.output_rows is not None and self.bq_load_job.error_result is None: - return "completed" - else: - reason = self.bq_load_job.error_result.get("reason") - if reason in BQ_TERMINAL_REASONS: - # the job permanently failed for the reason above - return "failed" - elif reason in ["internalError"]: - logger.warning( - f"Got reason {reason} for job {self.file_name}, job considered still" - f" running. ({self.bq_load_job.error_result})" - ) - # status of the job could not be obtained, job still running - return "running" - else: - # retry on all other reasons, including `backendError` which requires retry when the job is done - return "retry" - else: + if not self.bq_load_job.done(retry=self.default_retry, timeout=self.http_timeout): return "running" + if self.bq_load_job.output_rows is not None and self.bq_load_job.error_result is None: + return "completed" + reason = self.bq_load_job.error_result.get("reason") + if reason in BQ_TERMINAL_REASONS: + # the job permanently failed for the reason above + return "failed" + elif reason in ["internalError"]: + logger.warning( + f"Got reason {reason} for job {self.file_name}, job considered still" + f" running. ({self.bq_load_job.error_result})" + ) + # the status of the job couldn't be obtained, job still running + return "running" + else: + # retry on all other reasons, including `backendError` which requires retry when the job is done + return "retry" def bigquery_job_id(self) -> str: return BigQueryLoadJob.get_job_id_from_file_path(super().file_name()) @@ -149,33 +142,11 @@ def gen_key_table_clauses( key_clauses: Sequence[str], for_delete: bool, ) -> List[str]: - # generate several clauses: BigQuery does not support OR nor unions - sql: List[str] = [] - for clause in key_clauses: - sql.append( - f"FROM {root_table_name} AS d WHERE EXISTS (SELECT 1 FROM" - f" {staging_root_table_name} AS s WHERE {clause.format(d='d', s='s')})" - ) - return sql - - -class BigqueryStagingCopyJob(SqlStagingCopyJob): - @classmethod - def generate_sql( - cls, - table_chain: Sequence[TTableSchema], - sql_client: SqlClientBase[Any], - params: Optional[SqlJobParams] = None, - ) -> List[str]: - sql: List[str] = [] - for table in table_chain: - with sql_client.with_staging_dataset(staging=True): - staging_table_name = sql_client.make_qualified_table_name(table["name"]) - table_name = sql_client.make_qualified_table_name(table["name"]) - # drop destination table - sql.append(f"DROP TABLE IF EXISTS {table_name};") - # recreate destination table with data cloned from staging table - sql.append(f"CREATE TABLE {table_name} CLONE {staging_table_name};") + sql: List[str] = [ + f"FROM {root_table_name} AS d WHERE EXISTS (SELECT 1 FROM {staging_root_table_name} AS" + f" s WHERE {clause.format(d='d', s='s')})" + for clause in key_clauses + ] return sql @@ -198,17 +169,11 @@ def __init__(self, schema: Schema, config: BigQueryClientConfiguration) -> None: def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: return [BigQueryMergeJob.from_table_chain(table_chain, self.sql_client)] - 
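The rewritten `state()` above becomes a flat chain of early returns over the server-side status of the load job. The same decision table, sketched as a self-contained function (names and the terminal-reason subset are taken from the hunk; this is not the dlt API itself):

```py
from typing import Optional, Sequence

def classify_job_state(
    done: bool,
    output_rows: Optional[int],
    error_result: Optional[dict],
    terminal_reasons: Sequence[str] = ("invalid", "notFound", "duplicate", "stopped"),
) -> str:
    if not done:
        return "running"
    if output_rows is not None and error_result is None:
        return "completed"
    reason = (error_result or {}).get("reason")
    if reason in terminal_reasons:
        return "failed"       # the job permanently failed
    if reason == "internalError":
        return "running"      # status could not be obtained, keep polling
    return "retry"            # e.g. backendError: retry once the job is done

assert classify_job_state(True, 10, None) == "completed"
assert classify_job_state(True, None, {"reason": "backendError"}) == "retry"
```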
def _create_replace_followup_jobs( - self, table_chain: Sequence[TTableSchema] - ) -> List[NewLoadJob]: - if self.config.replace_strategy == "staging-optimized": - return [BigqueryStagingCopyJob.from_table_chain(table_chain, self.sql_client)] - return super()._create_replace_followup_jobs(table_chain) - def restore_file_load(self, file_path: str) -> LoadJob: """Returns a completed SqlLoadJob or restored BigQueryLoadJob - See base class for details on SqlLoadJob. BigQueryLoadJob is restored with job id derived from `file_path` + See base class for details on SqlLoadJob. + BigQueryLoadJob is restored with a job id derived from `file_path` Args: file_path (str): a path to a job file @@ -228,11 +193,13 @@ def restore_file_load(self, file_path: str) -> LoadJob: except api_core_exceptions.GoogleAPICallError as gace: reason = BigQuerySqlClient._get_reason_from_errors(gace) if reason == "notFound": - raise LoadJobNotExistsException(file_path) + raise LoadJobNotExistsException(file_path) from gace elif reason in BQ_TERMINAL_REASONS: - raise LoadJobTerminalException(file_path, f"The server reason was: {reason}") + raise LoadJobTerminalException( + file_path, f"The server reason was: {reason}" + ) from gace else: - raise DestinationTransientException(gace) + raise DestinationTransientException(gace) from gace return job def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: @@ -250,15 +217,17 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> reason = BigQuerySqlClient._get_reason_from_errors(gace) if reason == "notFound": # google.api_core.exceptions.NotFound: 404 - table not found - raise UnknownTableException(table["name"]) + raise UnknownTableException(table["name"]) from gace elif reason == "duplicate": # google.api_core.exceptions.Conflict: 409 PUT - already exists return self.restore_file_load(file_path) elif reason in BQ_TERMINAL_REASONS: # google.api_core.exceptions.BadRequest - will not be processed ie bad job name - raise LoadJobTerminalException(file_path, f"The server reason was: {reason}") + raise LoadJobTerminalException( + file_path, f"The server reason was: {reason}" + ) from gace else: - raise DestinationTransientException(gace) + raise DestinationTransientException(gace) from gace return job def _get_table_update_sql( @@ -274,30 +243,36 @@ def _get_table_update_sql( cluster_list = [ self.capabilities.escape_identifier(c["name"]) for c in new_columns if c.get("cluster") ] - partition_list = [ - self.capabilities.escape_identifier(c["name"]) - for c in new_columns - if c.get("partition") - ] - - # partition by must be added first - if len(partition_list) > 0: + if partition_list := [c for c in new_columns if c.get("partition")]: if len(partition_list) > 1: + col_names = [self.capabilities.escape_identifier(c["name"]) for c in partition_list] raise DestinationSchemaWillNotUpdate( - canonical_name, partition_list, "Partition requested for more than one column" + canonical_name, col_names, "Partition requested for more than one column" + ) + elif (c := partition_list[0])["data_type"] == "date": + sql[0] = f"{sql[0]}\nPARTITION BY {self.capabilities.escape_identifier(c['name'])}" + elif (c := partition_list[0])["data_type"] == "timestamp": + sql[ + 0 + ] = f"{sql[0]}\nPARTITION BY DATE({self.capabilities.escape_identifier(c['name'])})" + # Automatic partitioning of an INT64 type requires us to be prescriptive - we treat the column as a UNIX timestamp. 
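The `raise ... from gace` additions above chain the original Google API error onto the dlt exception, so the gRPC reason stays reachable via `__cause__` and shows up in tracebacks. A tiny illustration of the behaviour; the exception classes are stand-ins, not the dlt ones:

```py
class LoadJobTerminalException(Exception):
    pass

def translate(reason: str) -> None:
    try:
        raise RuntimeError(f"server reason: {reason}")   # stand-in for GoogleAPICallError
    except RuntimeError as gace:
        raise LoadJobTerminalException("job failed") from gace

try:
    translate("notFound")
except LoadJobTerminalException as exc:
    assert isinstance(exc.__cause__, RuntimeError)   # the original error is preserved
```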
+ # This is due to the bounds requirement of GENERATE_ARRAY function for partitioning. + # The 10,000 partitions limit makes it infeasible to cover the entire `bigint` range. + # The array bounds, with daily partitions (86400 seconds in a day), are somewhat arbitrarily chosen. + # See: https://dlthub.com/devel/dlt-ecosystem/destinations/bigquery#supported-column-hints + elif (c := partition_list[0])["data_type"] == "bigint": + sql[0] = ( + f"{sql[0]}\nPARTITION BY" + f" RANGE_BUCKET({self.capabilities.escape_identifier(c['name'])}," + " GENERATE_ARRAY(-172800000, 691200000, 86400))" ) - else: - sql[0] = sql[0] + f"\nPARTITION BY DATE({partition_list[0]})" - if len(cluster_list) > 0: + if cluster_list: sql[0] = sql[0] + "\nCLUSTER BY " + ",".join(cluster_list) - return sql def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: name = self.capabilities.escape_identifier(c["name"]) - return ( - f"{name} {self.type_mapper.to_db_type(c, table_format)} {self._gen_not_null(c.get('nullable', True))}" - ) + return f"{name} {self.type_mapper.to_db_type(c, table_format)} {self._gen_not_null(c.get('nullable', True))}" def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]: schema_table: TTableSchemaColumns = {} @@ -329,14 +304,14 @@ def _create_load_job(self, table: TTableSchema, file_path: str) -> bigquery.Load # append to table for merge loads (append to stage) and regular appends table_name = table["name"] - # determine wether we load from local or uri + # determine whether we load from local or uri bucket_path = None ext: str = os.path.splitext(file_path)[1][1:] if NewReferenceJob.is_reference_job(file_path): bucket_path = NewReferenceJob.resolve_reference(file_path) ext = os.path.splitext(bucket_path)[1][1:] - # choose correct source format + # choose a correct source format source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON decimal_target_types: List[str] = None if ext == "parquet": @@ -347,7 +322,7 @@ def _create_load_job(self, table: TTableSchema, file_path: str) -> bigquery.Load "Bigquery cannot load into JSON data type from parquet. Use jsonl instead.", ) source_format = bigquery.SourceFormat.PARQUET - # parquet needs NUMERIC type autodetection + # parquet needs NUMERIC type auto-detection decimal_target_types = ["NUMERIC", "BIGNUMERIC"] job_id = BigQueryLoadJob.get_job_id_from_file_path(file_path) diff --git a/dlt/destinations/impl/bigquery/configuration.py b/dlt/destinations/impl/bigquery/configuration.py index bf41d38aff..0dbc8959c2 100644 --- a/dlt/destinations/impl/bigquery/configuration.py +++ b/dlt/destinations/impl/bigquery/configuration.py @@ -52,5 +52,17 @@ def __init__( file_upload_timeout: float = 30 * 60.0, retry_deadline: float = 60.0, destination_name: str = None, - environment: str = None, - ) -> None: ... + environment: str = None + ) -> None: + super().__init__( + credentials=credentials, + dataset_name=dataset_name, + default_schema_name=default_schema_name, + destination_name=destination_name, + environment=environment, + ) + self.retry_deadline = retry_deadline + self.file_upload_timeout = file_upload_timeout + self.http_timeout = http_timeout + self.location = location + ... 
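The partition handling above now emits a different `PARTITION BY` clause per data type: the column itself for `date`, `DATE(col)` for `timestamp`, and `RANGE_BUCKET(col, GENERATE_ARRAY(...))` for `bigint` columns treated as UNIX timestamps. A hedged sketch of that clause builder; the helper name and simplified escaping are made up, the SQL fragments are the ones from the hunk:

```py
def partition_clause(column_name: str, data_type: str) -> str:
    col = f"`{column_name}`"   # simplified identifier escaping for the sketch
    if data_type == "date":
        return f"PARTITION BY {col}"
    if data_type == "timestamp":
        return f"PARTITION BY DATE({col})"
    if data_type == "bigint":
        # integer range partitioning: daily buckets over a bounded UNIX-timestamp range,
        # because BigQuery caps a table at 10,000 partitions
        return f"PARTITION BY RANGE_BUCKET({col}, GENERATE_ARRAY(-172800000, 691200000, 86400))"
    raise ValueError(f"unsupported partition column type: {data_type}")

print(partition_clause("created_at", "timestamp"))
```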
diff --git a/dlt/destinations/impl/bigquery/factory.py b/dlt/destinations/impl/bigquery/factory.py index fc92c3c087..bee55fa164 100644 --- a/dlt/destinations/impl/bigquery/factory.py +++ b/dlt/destinations/impl/bigquery/factory.py @@ -9,6 +9,7 @@ from dlt.destinations.impl.bigquery.bigquery import BigQueryClient +# noinspection PyPep8Naming class bigquery(Destination[BigQueryClientConfiguration, "BigQueryClient"]): spec = BigQueryClientConfiguration diff --git a/dlt/destinations/impl/bigquery/sql_client.py b/dlt/destinations/impl/bigquery/sql_client.py index cf5d2ecbd4..ce38e3fe29 100644 --- a/dlt/destinations/impl/bigquery/sql_client.py +++ b/dlt/destinations/impl/bigquery/sql_client.py @@ -1,31 +1,30 @@ from contextlib import contextmanager -from typing import Any, AnyStr, ClassVar, Iterator, List, Optional, Sequence, Type +from typing import Any, AnyStr, ClassVar, Iterator, List, Optional, Sequence import google.cloud.bigquery as bigquery # noqa: I250 +from google.api_core import exceptions as api_core_exceptions +from google.cloud import exceptions as gcp_exceptions from google.cloud.bigquery import dbapi as bq_dbapi from google.cloud.bigquery.dbapi import Connection as DbApiConnection, Cursor as BQDbApiCursor -from google.cloud import exceptions as gcp_exceptions from google.cloud.bigquery.dbapi import exceptions as dbapi_exceptions -from google.api_core import exceptions as api_core_exceptions from dlt.common.configuration.specs import GcpServiceAccountCredentialsWithoutDefaults from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.typing import StrAny - -from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction, DataFrame from dlt.destinations.exceptions import ( DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation, ) +from dlt.destinations.impl.bigquery import capabilities from dlt.destinations.sql_client import ( DBApiCursorImpl, SqlClientBase, raise_database_error, raise_open_connection_error, ) +from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction, DataFrame -from dlt.destinations.impl.bigquery import capabilities # terminal reasons as returned in BQ gRPC error response # https://cloud.google.com/bigquery/docs/error-messages @@ -38,7 +37,7 @@ "stopped", "tableUnavailable", ] -# invalidQuery is an transient error -> must be fixed by programmer +# invalidQuery is a transient error -> must be fixed by programmer class BigQueryDBApiCursorImpl(DBApiCursorImpl): @@ -47,16 +46,15 @@ class BigQueryDBApiCursorImpl(DBApiCursorImpl): native_cursor: BQDbApiCursor # type: ignore def df(self, chunk_size: int = None, **kwargs: Any) -> DataFrame: + if chunk_size is not None: + return super().df(chunk_size=chunk_size) query_job: bigquery.QueryJob = self.native_cursor._query_job - if chunk_size is None: - try: - return query_job.to_dataframe(**kwargs) - except ValueError: - # no pyarrow/db-types, fallback to our implementation - return super().df() - else: - return super().df(chunk_size=chunk_size) + try: + return query_job.to_dataframe(**kwargs) + except ValueError: + # no pyarrow/db-types, fallback to our implementation + return super().df() class BigQuerySqlClient(SqlClientBase[bigquery.Client], DBTransaction): @@ -116,34 +114,32 @@ def close_connection(self) -> None: @raise_database_error def begin_transaction(self) -> Iterator[DBTransaction]: try: - # start the transaction if not yet started - if not self._session_query: - job = self._client.query( - "BEGIN TRANSACTION;", - 
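`BigQueryDBApiCursorImpl.df()` in the sql_client hunk above is inverted into guard clauses: chunked reads go straight to the generic implementation, otherwise the native `QueryJob.to_dataframe()` fast path is tried and a `ValueError` (raised when pyarrow/db-dtypes are missing) falls back to the row-based implementation. The same "native fast path with graceful fallback" shape, sketched generically with placeholder callables:

```py
from typing import Any, Callable, Optional

def dataframe_with_fallback(
    native_to_dataframe: Callable[[], Any],
    generic_df: Callable[..., Any],
    chunk_size: Optional[int] = None,
) -> Any:
    if chunk_size is not None:
        # chunked reads are only supported by the generic implementation
        return generic_df(chunk_size=chunk_size)
    try:
        return native_to_dataframe()
    except ValueError:
        # no pyarrow / db-dtypes installed: fall back to the row-based implementation
        return generic_df()

def broken_native() -> Any:
    raise ValueError("pyarrow missing")

assert dataframe_with_fallback(broken_native, lambda **kw: [("id", 1)]) == [("id", 1)]
```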
job_config=bigquery.QueryJobConfig( - create_session=True, - default_dataset=self.fully_qualified_dataset_name(escape=False), - ), - ) - self._session_query = bigquery.QueryJobConfig( - create_session=False, - default_dataset=self.fully_qualified_dataset_name(escape=False), - connection_properties=[ - bigquery.query.ConnectionProperty( - key="session_id", value=job.session_info.session_id - ) - ], - ) - try: - job.result() - except Exception: - # if session creation fails - self._session_query = None - raise - else: + if self._session_query: raise dbapi_exceptions.ProgrammingError( "Nested transactions not supported on BigQuery" ) + job = self._client.query( + "BEGIN TRANSACTION;", + job_config=bigquery.QueryJobConfig( + create_session=True, + default_dataset=self.fully_qualified_dataset_name(escape=False), + ), + ) + self._session_query = bigquery.QueryJobConfig( + create_session=False, + default_dataset=self.fully_qualified_dataset_name(escape=False), + connection_properties=[ + bigquery.query.ConnectionProperty( + key="session_id", value=job.session_info.session_id + ) + ], + ) + try: + job.result() + except Exception: + # if session creation fails + self._session_query = None + raise yield self self.commit_transaction() except Exception: @@ -152,7 +148,7 @@ def begin_transaction(self) -> Iterator[DBTransaction]: def commit_transaction(self) -> None: if not self._session_query: - # allow to commit without transaction + # allow committing without transaction return self.execute_sql("COMMIT TRANSACTION;CALL BQ.ABORT_SESSION();") self._session_query = None @@ -181,7 +177,6 @@ def has_dataset(self) -> bool: def create_dataset(self) -> None: self._client.create_dataset( self.fully_qualified_dataset_name(escape=False), - exists_ok=False, retry=self._default_retry, timeout=self.http_timeout, ) @@ -201,21 +196,18 @@ def execute_sql( with self.execute_query(sql, *args, **kwargs) as curr: if not curr.description: return None - else: - try: - f = curr.fetchall() - return f - except api_core_exceptions.InvalidArgument as ia_ex: - if "non-table entities cannot be read" in str(ia_ex): - return None - raise + try: + return curr.fetchall() + except api_core_exceptions.InvalidArgument as ia_ex: + if "non-table entities cannot be read" in str(ia_ex): + return None + raise @contextmanager @raise_database_error def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DBApiCursor]: conn: DbApiConnection = None - curr: DBApiCursor = None - db_args = args if args else kwargs if kwargs else None + db_args = args or (kwargs or None) try: conn = DbApiConnection(client=self._client) curr = conn.cursor() @@ -238,37 +230,37 @@ def fully_qualified_dataset_name(self, escape: bool = True) -> str: @classmethod def _make_database_exception(cls, ex: Exception) -> Exception: - if cls.is_dbapi_exception(ex): - # google cloud exception in first argument: https://github.com/googleapis/python-bigquery/blob/main/google/cloud/bigquery/dbapi/cursor.py#L205 - cloud_ex = ex.args[0] - reason = cls._get_reason_from_errors(cloud_ex) - if reason is None: - if isinstance(ex, (dbapi_exceptions.DataError, dbapi_exceptions.IntegrityError)): - return DatabaseTerminalException(ex) - elif isinstance(ex, dbapi_exceptions.ProgrammingError): - return DatabaseTransientException(ex) - if reason == "notFound": - return DatabaseUndefinedRelation(ex) - if reason == "invalidQuery" and "was not found" in str(ex) and "Dataset" in str(ex): - return DatabaseUndefinedRelation(ex) - if ( - reason == "invalidQuery" - and "Not found" in 
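The reordered `begin_transaction` above rejects nested transactions up front, then opens a BigQuery session by running `BEGIN TRANSACTION;` with `create_session=True` and pins later statements to that session through a `ConnectionProperty`. A condensed sketch of that lifecycle reusing the client calls shown in the hunk; a real `bigquery.Client`, a fully qualified `project.dataset` name, and credentials are assumed, and the rollback branch is an assumption rather than something shown above:

```py
import google.cloud.bigquery as bigquery

def run_in_session(client: bigquery.Client, dataset: str, statements: list) -> None:
    # open a session and a transaction in one query; `dataset` must be "project.dataset"
    job = client.query(
        "BEGIN TRANSACTION;",
        job_config=bigquery.QueryJobConfig(create_session=True, default_dataset=dataset),
    )
    job.result()
    session_config = bigquery.QueryJobConfig(
        create_session=False,
        default_dataset=dataset,
        connection_properties=[
            bigquery.query.ConnectionProperty(key="session_id", value=job.session_info.session_id)
        ],
    )
    try:
        for sql in statements:
            client.query(sql, job_config=session_config).result()
        # commit and terminate the session, mirroring commit_transaction() above
        client.query("COMMIT TRANSACTION;CALL BQ.ABORT_SESSION();", job_config=session_config).result()
    except Exception:
        # assumed rollback path for the sketch
        client.query("ROLLBACK TRANSACTION;CALL BQ.ABORT_SESSION();", job_config=session_config).result()
        raise
```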
str(ex) - and ("Dataset" in str(ex) or "Table" in str(ex)) - ): - return DatabaseUndefinedRelation(ex) - if reason == "accessDenied" and "Dataset" in str(ex) and "not exist" in str(ex): - return DatabaseUndefinedRelation(ex) - if reason == "invalidQuery" and ( - "Unrecognized name" in str(ex) or "cannot be null" in str(ex) - ): - # unknown column, inserting NULL into required field - return DatabaseTerminalException(ex) - if reason in BQ_TERMINAL_REASONS: + if not cls.is_dbapi_exception(ex): + return ex + # google cloud exception in first argument: https://github.com/googleapis/python-bigquery/blob/main/google/cloud/bigquery/dbapi/cursor.py#L205 + cloud_ex = ex.args[0] + reason = cls._get_reason_from_errors(cloud_ex) + if reason is None: + if isinstance(ex, (dbapi_exceptions.DataError, dbapi_exceptions.IntegrityError)): return DatabaseTerminalException(ex) - # anything else is transient - return DatabaseTransientException(ex) - return ex + elif isinstance(ex, dbapi_exceptions.ProgrammingError): + return DatabaseTransientException(ex) + if reason == "notFound": + return DatabaseUndefinedRelation(ex) + if reason == "invalidQuery" and "was not found" in str(ex) and "Dataset" in str(ex): + return DatabaseUndefinedRelation(ex) + if ( + reason == "invalidQuery" + and "Not found" in str(ex) + and ("Dataset" in str(ex) or "Table" in str(ex)) + ): + return DatabaseUndefinedRelation(ex) + if reason == "accessDenied" and "Dataset" in str(ex) and "not exist" in str(ex): + return DatabaseUndefinedRelation(ex) + if reason == "invalidQuery" and ( + "Unrecognized name" in str(ex) or "cannot be null" in str(ex) + ): + # unknown column, inserting NULL into required field + return DatabaseTerminalException(ex) + if reason in BQ_TERMINAL_REASONS: + return DatabaseTerminalException(ex) + # anything else is transient + return DatabaseTransientException(ex) @staticmethod def _get_reason_from_errors(gace: api_core_exceptions.GoogleAPICallError) -> Optional[str]: diff --git a/dlt/destinations/impl/databricks/__init__.py b/dlt/destinations/impl/databricks/__init__.py new file mode 100644 index 0000000000..f63d294818 --- /dev/null +++ b/dlt/destinations/impl/databricks/__init__.py @@ -0,0 +1,30 @@ +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.data_writers.escape import escape_databricks_identifier, escape_databricks_literal +from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE + +from dlt.destinations.impl.databricks.configuration import DatabricksClientConfiguration + + +def capabilities() -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "insert_values" + caps.supported_loader_file_formats = ["insert_values"] + caps.preferred_staging_file_format = "jsonl" + caps.supported_staging_file_formats = ["jsonl", "parquet"] + caps.escape_identifier = escape_databricks_identifier + caps.escape_literal = escape_databricks_literal + caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) + caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) + caps.max_identifier_length = 255 + caps.max_column_identifier_length = 255 + caps.max_query_length = 2 * 1024 * 1024 + caps.is_max_query_length_in_bytes = True + caps.max_text_data_type_length = 16 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = True + caps.supports_ddl_transactions = False + caps.supports_truncate_command = True + # caps.supports_transactions = False + caps.alter_add_multi_column = True + 
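The de-nested `_make_database_exception` above returns early for non-DBAPI errors and then walks an ordered list of reason/message checks from most to least specific. The same idea as a compact, self-contained classifier; the return labels and the reason set are simplified stand-ins for the dlt exception classes and `BQ_TERMINAL_REASONS`:

```py
TERMINAL_REASONS = {"invalid", "notFound", "duplicate", "stopped", "tableUnavailable"}

def classify(reason: str, message: str) -> str:
    """Return 'undefined', 'terminal' or 'transient' for a BigQuery error."""
    if reason == "notFound":
        return "undefined"
    if reason == "invalidQuery" and "was not found" in message and "Dataset" in message:
        return "undefined"
    if reason == "invalidQuery" and "Not found" in message and ("Dataset" in message or "Table" in message):
        return "undefined"
    if reason == "accessDenied" and "Dataset" in message and "not exist" in message:
        return "undefined"
    if reason == "invalidQuery" and ("Unrecognized name" in message or "cannot be null" in message):
        return "terminal"      # unknown column / NULL into a required field
    if reason in TERMINAL_REASONS:
        return "terminal"
    return "transient"         # anything else is worth retrying

assert classify("notFound", "") == "undefined"
assert classify("backendError", "") == "transient"
```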
caps.supports_multiple_statements = False + caps.supports_clone_table = True + return caps diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py new file mode 100644 index 0000000000..924047e30f --- /dev/null +++ b/dlt/destinations/impl/databricks/configuration.py @@ -0,0 +1,51 @@ +from typing import ClassVar, Final, Optional, Any, Dict, List + +from dlt.common.typing import TSecretStrValue +from dlt.common.configuration.exceptions import ConfigurationValueError +from dlt.common.configuration.specs.base_configuration import CredentialsConfiguration, configspec +from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration + + +@configspec +class DatabricksCredentials(CredentialsConfiguration): + catalog: str = None + server_hostname: str = None + http_path: str = None + access_token: Optional[TSecretStrValue] = None + http_headers: Optional[Dict[str, str]] = None + session_configuration: Optional[Dict[str, Any]] = None + """Dict of session parameters that will be passed to `databricks.sql.connect`""" + connection_parameters: Optional[Dict[str, Any]] = None + """Additional keyword arguments that are passed to `databricks.sql.connect`""" + socket_timeout: Optional[int] = 180 + + __config_gen_annotations__: ClassVar[List[str]] = [ + "server_hostname", + "http_path", + "catalog", + "access_token", + ] + + def to_connector_params(self) -> Dict[str, Any]: + return dict( + catalog=self.catalog, + server_hostname=self.server_hostname, + http_path=self.http_path, + access_token=self.access_token, + session_configuration=self.session_configuration or {}, + _socket_timeout=self.socket_timeout, + **(self.connection_parameters or {}), + ) + + +@configspec +class DatabricksClientConfiguration(DestinationClientDwhWithStagingConfiguration): + destination_type: Final[str] = "databricks" # type: ignore[misc] + credentials: DatabricksCredentials + + def __str__(self) -> str: + """Return displayable destination location""" + if self.staging_config: + return str(self.staging_config.credentials) + else: + return "[no staging set]" diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py new file mode 100644 index 0000000000..b5a404302f --- /dev/null +++ b/dlt/destinations/impl/databricks/databricks.py @@ -0,0 +1,317 @@ +from typing import ClassVar, Dict, Optional, Sequence, Tuple, List, Any, Iterable, Type, cast +from urllib.parse import urlparse, urlunparse + +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.destination.reference import ( + FollowupJob, + NewLoadJob, + TLoadJobState, + LoadJob, + CredentialsConfiguration, + SupportsStagingDestination, +) +from dlt.common.configuration.specs import ( + AwsCredentialsWithoutDefaults, + AzureCredentials, + AzureCredentialsWithoutDefaults, +) +from dlt.common.data_types import TDataType +from dlt.common.storages.file_storage import FileStorage +from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns +from dlt.common.schema.typing import TTableSchema, TColumnType, TSchemaTables, TTableFormat +from dlt.common.schema.utils import table_schema_has_type + + +from dlt.destinations.insert_job_client import InsertValuesJobClient +from dlt.destinations.job_impl import EmptyLoadJob +from dlt.destinations.exceptions import LoadJobTerminalException + +from dlt.destinations.impl.databricks import capabilities +from dlt.destinations.impl.databricks.configuration import 
DatabricksClientConfiguration +from dlt.destinations.impl.databricks.sql_client import DatabricksSqlClient +from dlt.destinations.sql_jobs import SqlMergeJob, SqlJobParams +from dlt.destinations.job_impl import NewReferenceJob +from dlt.destinations.sql_client import SqlClientBase +from dlt.destinations.type_mapping import TypeMapper +from dlt.common.storages import FilesystemConfiguration, fsspec_from_config +from dlt import config + + +class DatabricksTypeMapper(TypeMapper): + sct_to_unbound_dbt = { + "complex": "STRING", # Databricks supports complex types like ARRAY + "text": "STRING", + "double": "DOUBLE", + "bool": "BOOLEAN", + "date": "DATE", + "timestamp": "TIMESTAMP", # TIMESTAMP for local timezone + "bigint": "BIGINT", + "binary": "BINARY", + "decimal": "DECIMAL", # DECIMAL(p,s) format + "time": "STRING", + } + + dbt_to_sct = { + "STRING": "text", + "DOUBLE": "double", + "BOOLEAN": "bool", + "DATE": "date", + "TIMESTAMP": "timestamp", + "BIGINT": "bigint", + "INT": "bigint", + "SMALLINT": "bigint", + "TINYINT": "bigint", + "BINARY": "binary", + "DECIMAL": "decimal", + } + + sct_to_dbt = { + "decimal": "DECIMAL(%i,%i)", + "wei": "DECIMAL(%i,%i)", + } + + def to_db_integer_type( + self, precision: Optional[int], table_format: TTableFormat = None + ) -> str: + if precision is None: + return "BIGINT" + if precision <= 8: + return "TINYINT" + if precision <= 16: + return "SMALLINT" + if precision <= 32: + return "INT" + return "BIGINT" + + def from_db_type( + self, db_type: str, precision: Optional[int] = None, scale: Optional[int] = None + ) -> TColumnType: + # precision and scale arguments here are meaningless as they're not included separately in information schema + # We use full_data_type from databricks which is either in form "typename" or "typename(precision, scale)" + type_parts = db_type.split("(") + if len(type_parts) > 1: + db_type = type_parts[0] + scale_str = type_parts[1].strip(")") + precision, scale = [int(val) for val in scale_str.split(",")] + else: + scale = precision = None + db_type = db_type.upper() + if db_type == "DECIMAL": + if (precision, scale) == self.wei_precision(): + return dict(data_type="wei", precision=precision, scale=scale) + return super().from_db_type(db_type, precision, scale) + + +class DatabricksLoadJob(LoadJob, FollowupJob): + def __init__( + self, + table: TTableSchema, + file_path: str, + table_name: str, + load_id: str, + client: DatabricksSqlClient, + staging_config: FilesystemConfiguration, + ) -> None: + file_name = FileStorage.get_file_name_from_file_path(file_path) + super().__init__(file_name) + staging_credentials = staging_config.credentials + + qualified_table_name = client.make_qualified_table_name(table_name) + + # extract and prepare some vars + bucket_path = orig_bucket_path = ( + NewReferenceJob.resolve_reference(file_path) + if NewReferenceJob.is_reference_job(file_path) + else "" + ) + file_name = ( + FileStorage.get_file_name_from_file_path(bucket_path) if bucket_path else file_name + ) + from_clause = "" + credentials_clause = "" + format_options_clause = "" + + if bucket_path: + bucket_url = urlparse(bucket_path) + bucket_scheme = bucket_url.scheme + # referencing an staged files via a bucket URL requires explicit AWS credentials + if bucket_scheme == "s3" and isinstance( + staging_credentials, AwsCredentialsWithoutDefaults + ): + s3_creds = staging_credentials.to_session_credentials() + credentials_clause = f"""WITH(CREDENTIAL( + AWS_ACCESS_KEY='{s3_creds["aws_access_key_id"]}', + 
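Databricks' information schema does not report precision and scale separately, so `DatabricksTypeMapper.from_db_type` above parses them out of the `full_data_type` string ("typename" or "typename(precision,scale)"). A standalone sketch of that parsing step; the (38, 0) wei precision pair is an assumption for the example, and the lowercase fallback stands in for the real `dbt_to_sct` lookup:

```py
from typing import Optional, Tuple

def parse_full_data_type(full_type: str, wei_precision: Tuple[int, int] = (38, 0)) -> dict:
    db_type, _, params = full_type.partition("(")
    precision: Optional[int] = None
    scale: Optional[int] = None
    if params:
        precision, scale = (int(v) for v in params.rstrip(")").split(","))
    db_type = db_type.upper()
    if db_type == "DECIMAL" and (precision, scale) == wei_precision:
        return dict(data_type="wei", precision=precision, scale=scale)
    # a real mapper would look db_type up in dbt_to_sct here
    return dict(data_type=db_type.lower(), precision=precision, scale=scale)

assert parse_full_data_type("decimal(38,0)") == {"data_type": "wei", "precision": 38, "scale": 0}
assert parse_full_data_type("string") == {"data_type": "string", "precision": None, "scale": None}
```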
AWS_SECRET_KEY='{s3_creds["aws_secret_access_key"]}', + + AWS_SESSION_TOKEN='{s3_creds["aws_session_token"]}' + )) + """ + from_clause = f"FROM '{bucket_path}'" + elif bucket_scheme in ["az", "abfs"] and isinstance( + staging_credentials, AzureCredentialsWithoutDefaults + ): + # Explicit azure credentials are needed to load from bucket without a named stage + credentials_clause = f"""WITH(CREDENTIAL(AZURE_SAS_TOKEN='{staging_credentials.azure_storage_sas_token}'))""" + # Converts an az:/// to abfss://@.dfs.core.windows.net/ + # as required by snowflake + _path = bucket_url.path + bucket_path = urlunparse( + bucket_url._replace( + scheme="abfss", + netloc=f"{bucket_url.netloc}@{staging_credentials.azure_storage_account_name}.dfs.core.windows.net", + path=_path, + ) + ) + from_clause = f"FROM '{bucket_path}'" + else: + raise LoadJobTerminalException( + file_path, + f"Databricks cannot load data from staging bucket {bucket_path}. Only s3 and azure buckets are supported", + ) + else: + raise LoadJobTerminalException( + file_path, + "Cannot load from local file. Databricks does not support loading from local files. Configure staging with an s3 or azure storage bucket.", + ) + + # decide on source format, stage_file_path will either be a local file or a bucket path + if file_name.endswith(".parquet"): + source_format = "PARQUET" # Only parquet is supported + elif file_name.endswith(".jsonl"): + if not config.get("data_writer.disable_compression"): + raise LoadJobTerminalException( + file_path, + "Databricks loader does not support gzip compressed JSON files. Please disable compression in the data writer configuration: https://dlthub.com/docs/reference/performance#disabling-and-enabling-file-compression", + ) + if table_schema_has_type(table, "decimal"): + raise LoadJobTerminalException( + file_path, + "Databricks loader cannot load DECIMAL type columns from json files. Switch to parquet format to load decimals.", + ) + if table_schema_has_type(table, "binary"): + raise LoadJobTerminalException( + file_path, + "Databricks loader cannot load BINARY type columns from json files. Switch to parquet format to load byte values.", + ) + if table_schema_has_type(table, "complex"): + raise LoadJobTerminalException( + file_path, + "Databricks loader cannot load complex columns (lists and dicts) from json files. Switch to parquet format to load complex types.", + ) + if table_schema_has_type(table, "date"): + raise LoadJobTerminalException( + file_path, + "Databricks loader cannot load DATE type columns from json files. 
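The Azure branch above rewrites a dlt `az://container/path` bucket URL into the `abfss://container@account.dfs.core.windows.net/path` form that the Databricks `COPY INTO` statement expects. The rewrite is plain `urllib` work and can be exercised on its own; the storage account name below is a made-up placeholder:

```py
from urllib.parse import urlparse, urlunparse

def to_abfss(bucket_path: str, storage_account_name: str) -> str:
    url = urlparse(bucket_path)
    # the container stays in netloc, the account moves into the dfs.core.windows.net host
    return urlunparse(
        url._replace(
            scheme="abfss",
            netloc=f"{url.netloc}@{storage_account_name}.dfs.core.windows.net",
            path=url.path,
        )
    )

assert (
    to_abfss("az://staging-bucket/load_123/file.parquet", "myaccount")
    == "abfss://staging-bucket@myaccount.dfs.core.windows.net/load_123/file.parquet"
)
```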
Switch to parquet format to load dates.", + ) + + source_format = "JSON" + format_options_clause = "FORMAT_OPTIONS('inferTimestamp'='true')" + # Databricks fails when trying to load empty json files, so we have to check the file size + fs, _ = fsspec_from_config(staging_config) + file_size = fs.size(orig_bucket_path) + if file_size == 0: # Empty file, do nothing + return + + statement = f"""COPY INTO {qualified_table_name} + {from_clause} + {credentials_clause} + FILEFORMAT = {source_format} + {format_options_clause} + """ + client.execute_sql(statement) + + def state(self) -> TLoadJobState: + return "completed" + + def exception(self) -> str: + raise NotImplementedError() + + +class DatabricksMergeJob(SqlMergeJob): + @classmethod + def _to_temp_table(cls, select_sql: str, temp_table_name: str) -> str: + return f"CREATE TEMPORARY VIEW {temp_table_name} AS {select_sql};" + + @classmethod + def gen_delete_from_sql( + cls, table_name: str, column_name: str, temp_table_name: str, temp_table_column: str + ) -> str: + # Databricks does not support subqueries in DELETE FROM statements so we use a MERGE statement instead + return f"""MERGE INTO {table_name} + USING {temp_table_name} + ON {table_name}.{column_name} = {temp_table_name}.{temp_table_column} + WHEN MATCHED THEN DELETE; + """ + + +class DatabricksClient(InsertValuesJobClient, SupportsStagingDestination): + capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() + + def __init__(self, schema: Schema, config: DatabricksClientConfiguration) -> None: + sql_client = DatabricksSqlClient(config.normalize_dataset_name(schema), config.credentials) + super().__init__(schema, config, sql_client) + self.config: DatabricksClientConfiguration = config + self.sql_client: DatabricksSqlClient = sql_client + self.type_mapper = DatabricksTypeMapper(self.capabilities) + + def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: + job = super().start_file_load(table, file_path, load_id) + + if not job: + job = DatabricksLoadJob( + table, + file_path, + table["name"], + load_id, + self.sql_client, + staging_config=cast(FilesystemConfiguration, self.config.staging_config), + ) + return job + + def restore_file_load(self, file_path: str) -> LoadJob: + return EmptyLoadJob.from_file_path(file_path, "completed") + + def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + return [DatabricksMergeJob.from_table_chain(table_chain, self.sql_client)] + + def _make_add_column_sql( + self, new_columns: Sequence[TColumnSchema], table_format: TTableFormat = None + ) -> List[str]: + # Override because databricks requires multiple columns in a single ADD COLUMN clause + return ["ADD COLUMN\n" + ",\n".join(self._get_column_def_sql(c) for c in new_columns)] + + def _get_table_update_sql( + self, + table_name: str, + new_columns: Sequence[TColumnSchema], + generate_alter: bool, + separate_alters: bool = False, + ) -> List[str]: + sql = super()._get_table_update_sql(table_name, new_columns, generate_alter) + + cluster_list = [ + self.capabilities.escape_identifier(c["name"]) for c in new_columns if c.get("cluster") + ] + + if cluster_list: + sql[0] = sql[0] + "\nCLUSTER BY (" + ",".join(cluster_list) + ")" + + return sql + + def _from_db_type( + self, bq_t: str, precision: Optional[int], scale: Optional[int] + ) -> TColumnType: + return self.type_mapper.from_db_type(bq_t, precision, scale) + + def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: + name 
= self.capabilities.escape_identifier(c["name"]) + return ( + f"{name} {self.type_mapper.to_db_type(c)} {self._gen_not_null(c.get('nullable', True))}" + ) + + def _get_storage_table_query_columns(self) -> List[str]: + fields = super()._get_storage_table_query_columns() + fields[ + 1 + ] = "full_data_type" # Override because this is the only way to get data type with precision + return fields diff --git a/dlt/destinations/impl/databricks/factory.py b/dlt/destinations/impl/databricks/factory.py new file mode 100644 index 0000000000..7c6c95137d --- /dev/null +++ b/dlt/destinations/impl/databricks/factory.py @@ -0,0 +1,48 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +from dlt.destinations.impl.databricks.configuration import ( + DatabricksCredentials, + DatabricksClientConfiguration, +) +from dlt.destinations.impl.databricks import capabilities + +if t.TYPE_CHECKING: + from dlt.destinations.impl.databricks.databricks import DatabricksClient + + +class databricks(Destination[DatabricksClientConfiguration, "DatabricksClient"]): + spec = DatabricksClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["DatabricksClient"]: + from dlt.destinations.impl.databricks.databricks import DatabricksClient + + return DatabricksClient + + def __init__( + self, + credentials: t.Union[DatabricksCredentials, t.Dict[str, t.Any], str] = None, + destination_name: t.Optional[str] = None, + environment: t.Optional[str] = None, + **kwargs: t.Any, + ) -> None: + """Configure the Databricks destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + credentials: Credentials to connect to the databricks database. 
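`DatabricksMergeJob.gen_delete_from_sql` above works around Databricks' lack of subqueries in `DELETE FROM` by issuing a `MERGE ... WHEN MATCHED THEN DELETE` against the temp table instead. A small generator producing the same statement shape; table and column names are placeholders:

```py
def gen_delete_from_sql(
    table_name: str, column_name: str, temp_table_name: str, temp_table_column: str
) -> str:
    # Databricks cannot run `DELETE FROM t WHERE col IN (SELECT ...)`,
    # so matching rows are removed through a MERGE join instead
    return (
        f"MERGE INTO {table_name}\n"
        f"USING {temp_table_name}\n"
        f"ON {table_name}.{column_name} = {temp_table_name}.{temp_table_column}\n"
        f"WHEN MATCHED THEN DELETE;"
    )

print(gen_delete_from_sql("analytics.orders", "_dlt_id", "orders_deletes", "_dlt_id"))
```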
Can be an instance of `DatabricksCredentials` or + a connection string in the format `databricks://user:password@host:port/database` + **kwargs: Additional arguments passed to the destination config + """ + super().__init__( + credentials=credentials, + destination_name=destination_name, + environment=environment, + **kwargs, + ) diff --git a/dlt/destinations/impl/databricks/sql_client.py b/dlt/destinations/impl/databricks/sql_client.py new file mode 100644 index 0000000000..68ea863cc4 --- /dev/null +++ b/dlt/destinations/impl/databricks/sql_client.py @@ -0,0 +1,155 @@ +from contextlib import contextmanager, suppress +from typing import Any, AnyStr, ClassVar, Iterator, Optional, Sequence, List, Union, Dict + +from databricks import sql as databricks_lib +from databricks.sql.client import ( + Connection as DatabricksSqlConnection, + Cursor as DatabricksSqlCursor, +) +from databricks.sql.exc import Error as DatabricksSqlError + +from dlt.common import pendulum +from dlt.common import logger +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.destinations.exceptions import ( + DatabaseTerminalException, + DatabaseTransientException, + DatabaseUndefinedRelation, +) +from dlt.destinations.sql_client import ( + DBApiCursorImpl, + SqlClientBase, + raise_database_error, + raise_open_connection_error, +) +from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction +from dlt.destinations.impl.databricks.configuration import DatabricksCredentials +from dlt.destinations.impl.databricks import capabilities +from dlt.common.time import to_py_date, to_py_datetime + + +class DatabricksSqlClient(SqlClientBase[DatabricksSqlConnection], DBTransaction): + dbapi: ClassVar[DBApi] = databricks_lib + capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() + + def __init__(self, dataset_name: str, credentials: DatabricksCredentials) -> None: + super().__init__(credentials.catalog, dataset_name) + self._conn: DatabricksSqlConnection = None + self.credentials = credentials + + def open_connection(self) -> DatabricksSqlConnection: + conn_params = self.credentials.to_connector_params() + self._conn = databricks_lib.connect(**conn_params, schema=self.dataset_name) + return self._conn + + @raise_open_connection_error + def close_connection(self) -> None: + if self._conn: + self._conn.close() + self._conn = None + + @contextmanager + def begin_transaction(self) -> Iterator[DBTransaction]: + # Databricks does not support transactions + yield self + + @raise_database_error + def commit_transaction(self) -> None: + # Databricks does not support transactions + pass + + @raise_database_error + def rollback_transaction(self) -> None: + # Databricks does not support transactions + pass + + @property + def native_connection(self) -> "DatabricksSqlConnection": + return self._conn + + def drop_dataset(self) -> None: + self.execute_sql("DROP SCHEMA IF EXISTS %s CASCADE;" % self.fully_qualified_dataset_name()) + + def drop_tables(self, *tables: str) -> None: + # Tables are drop with `IF EXISTS`, but databricks raises when the schema doesn't exist. + # Multi statement exec is safe and the error can be ignored since all tables are in the same schema. 
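The `databricks` factory above accepts credentials as a `DatabricksCredentials` instance, a dict, or a connection string, and forwards extra keyword arguments into the client configuration. A hedged usage sketch: the pipeline name, dataset, host and token are placeholders, the filesystem staging bucket must be configured separately, and the exact `staging=` shorthand is assumed from dlt's usual pipeline API rather than shown in this diff:

```py
import dlt
from dlt.destinations.impl.databricks.factory import databricks

pipeline = dlt.pipeline(
    pipeline_name="events",
    destination=databricks(
        credentials={
            "catalog": "main",
            "server_hostname": "adb-1234567890123456.7.azuredatabricks.net",  # placeholder
            "http_path": "/sql/1.0/warehouses/abc123",                        # placeholder
            "access_token": "dapi-...",                                       # placeholder
        }
    ),
    # the Databricks load job only reads from an s3/azure staging bucket (see above)
    staging="filesystem",
    dataset_name="raw_events",
)
```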
+ with suppress(DatabaseUndefinedRelation): + super().drop_tables(*tables) + + def execute_sql( + self, sql: AnyStr, *args: Any, **kwargs: Any + ) -> Optional[Sequence[Sequence[Any]]]: + with self.execute_query(sql, *args, **kwargs) as curr: + if curr.description is None: + return None + else: + f = curr.fetchall() + return f + + @contextmanager + @raise_database_error + def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DBApiCursor]: + curr: DBApiCursor = None + # TODO: databricks connector 3.0.0 will use :named paramstyle only + # if args: + # keys = [f"arg{i}" for i in range(len(args))] + # # Replace position arguments (%s) with named arguments (:arg0, :arg1, ...) + # # query = query % tuple(f":{key}" for key in keys) + # db_args = {} + # for key, db_arg in zip(keys, args): + # # Databricks connector doesn't accept pendulum objects + # if isinstance(db_arg, pendulum.DateTime): + # db_arg = to_py_datetime(db_arg) + # elif isinstance(db_arg, pendulum.Date): + # db_arg = to_py_date(db_arg) + # db_args[key] = db_arg + # else: + # db_args = None + db_args: Optional[Union[Dict[str, Any], Sequence[Any]]] + if kwargs: + db_args = kwargs + elif args: + db_args = args + else: + db_args = None + with self._conn.cursor() as curr: + curr.execute(query, db_args) + yield DBApiCursorImpl(curr) # type: ignore[abstract] + + def fully_qualified_dataset_name(self, escape: bool = True) -> str: + if escape: + catalog = self.capabilities.escape_identifier(self.credentials.catalog) + dataset_name = self.capabilities.escape_identifier(self.dataset_name) + else: + catalog = self.credentials.catalog + dataset_name = self.dataset_name + return f"{catalog}.{dataset_name}" + + @staticmethod + def _make_database_exception(ex: Exception) -> Exception: + if isinstance(ex, databricks_lib.ServerOperationError): + if "TABLE_OR_VIEW_NOT_FOUND" in str(ex): + return DatabaseUndefinedRelation(ex) + elif "SCHEMA_NOT_FOUND" in str(ex): + return DatabaseUndefinedRelation(ex) + elif "PARSE_SYNTAX_ERROR" in str(ex): + return DatabaseTransientException(ex) + return DatabaseTerminalException(ex) + elif isinstance(ex, databricks_lib.OperationalError): + return DatabaseTerminalException(ex) + elif isinstance(ex, (databricks_lib.ProgrammingError, databricks_lib.IntegrityError)): + return DatabaseTerminalException(ex) + elif isinstance(ex, databricks_lib.DatabaseError): + return DatabaseTransientException(ex) + else: + return DatabaseTransientException(ex) + + @staticmethod + def _maybe_make_terminal_exception_from_data_error( + databricks_ex: databricks_lib.DatabaseError, + ) -> Optional[Exception]: + return None + + @staticmethod + def is_dbapi_exception(ex: Exception) -> bool: + return isinstance(ex, databricks_lib.DatabaseError) diff --git a/dlt/destinations/impl/mssql/configuration.py b/dlt/destinations/impl/mssql/configuration.py index f33aca4b82..f00998cfb2 100644 --- a/dlt/destinations/impl/mssql/configuration.py +++ b/dlt/destinations/impl/mssql/configuration.py @@ -1,4 +1,4 @@ -from typing import Final, ClassVar, Any, List, Optional, TYPE_CHECKING +from typing import Final, ClassVar, Any, List, Dict, Optional, TYPE_CHECKING from sqlalchemy.engine import URL from dlt.common.configuration import configspec @@ -10,9 +10,6 @@ from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration -SUPPORTED_DRIVERS = ["ODBC Driver 18 for SQL Server", "ODBC Driver 17 for SQL Server"] - - @configspec class MsSqlCredentials(ConnectionStringCredentials): drivername: Final[str] = "mssql" # 
type: ignore @@ -24,22 +21,27 @@ class MsSqlCredentials(ConnectionStringCredentials): __config_gen_annotations__: ClassVar[List[str]] = ["port", "connect_timeout"] + SUPPORTED_DRIVERS: ClassVar[List[str]] = [ + "ODBC Driver 18 for SQL Server", + "ODBC Driver 17 for SQL Server", + ] + def parse_native_representation(self, native_value: Any) -> None: # TODO: Support ODBC connection string or sqlalchemy URL super().parse_native_representation(native_value) if self.query is not None: self.query = {k.lower(): v for k, v in self.query.items()} # Make case-insensitive. - if "driver" in self.query and self.query.get("driver") not in SUPPORTED_DRIVERS: - raise SystemConfigurationException( - f"""The specified driver "{self.query.get('driver')}" is not supported.""" - f" Choose one of the supported drivers: {', '.join(SUPPORTED_DRIVERS)}." - ) self.driver = self.query.get("driver", self.driver) self.connect_timeout = int(self.query.get("connect_timeout", self.connect_timeout)) if not self.is_partial(): self.resolve() def on_resolved(self) -> None: + if self.driver not in self.SUPPORTED_DRIVERS: + raise SystemConfigurationException( + f"""The specified driver "{self.driver}" is not supported.""" + f" Choose one of the supported drivers: {', '.join(self.SUPPORTED_DRIVERS)}." + ) self.database = self.database.lower() def to_url(self) -> URL: @@ -55,20 +57,21 @@ def on_partial(self) -> None: def _get_driver(self) -> str: if self.driver: return self.driver + # Pick a default driver if available import pyodbc available_drivers = pyodbc.drivers() - for d in SUPPORTED_DRIVERS: + for d in self.SUPPORTED_DRIVERS: if d in available_drivers: return d docs_url = "https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16" raise SystemConfigurationException( f"No supported ODBC driver found for MS SQL Server. See {docs_url} for information on" - f" how to install the '{SUPPORTED_DRIVERS[0]}' on your platform." + f" how to install the '{self.SUPPORTED_DRIVERS[0]}' on your platform." 
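With `SUPPORTED_DRIVERS` now a `ClassVar` on `MsSqlCredentials`, validation happens in `on_resolved` and `_get_driver` picks the first supported ODBC driver that `pyodbc` reports as installed. The selection logic in isolation; the driver names are the ones from the hunk, and the available-drivers list is injected so the sketch runs without pyodbc:

```py
from typing import List, Optional

SUPPORTED_DRIVERS: List[str] = [
    "ODBC Driver 18 for SQL Server",
    "ODBC Driver 17 for SQL Server",
]

def pick_driver(available_drivers: List[str], configured: Optional[str] = None) -> str:
    if configured:
        if configured not in SUPPORTED_DRIVERS:
            raise ValueError(
                f'The specified driver "{configured}" is not supported. '
                f"Choose one of: {', '.join(SUPPORTED_DRIVERS)}."
            )
        return configured
    # otherwise fall back to the newest supported driver that is actually installed
    for driver in SUPPORTED_DRIVERS:
        if driver in available_drivers:
            return driver
    raise ValueError(f"No supported ODBC driver found; install '{SUPPORTED_DRIVERS[0]}'.")

assert pick_driver(["ODBC Driver 17 for SQL Server"]) == "ODBC Driver 17 for SQL Server"
```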
) - def to_odbc_dsn(self) -> str: + def _get_odbc_dsn_dict(self) -> Dict[str, Any]: params = { "DRIVER": self.driver, "SERVER": f"{self.host},{self.port}", @@ -78,6 +81,10 @@ def to_odbc_dsn(self) -> str: } if self.query is not None: params.update({k.upper(): v for k, v in self.query.items()}) + return params + + def to_odbc_dsn(self) -> str: + params = self._get_odbc_dsn_dict() return ";".join([f"{k}={v}" for k, v in params.items()]) diff --git a/dlt/destinations/impl/mssql/mssql.py b/dlt/destinations/impl/mssql/mssql.py index e97389f185..b6af345e36 100644 --- a/dlt/destinations/impl/mssql/mssql.py +++ b/dlt/destinations/impl/mssql/mssql.py @@ -20,6 +20,8 @@ HINT_TO_MSSQL_ATTR: Dict[TColumnHint, str] = {"unique": "UNIQUE"} +VARCHAR_MAX_N: int = 4000 +VARBINARY_MAX_N: int = 8000 class MsSqlTypeMapper(TypeMapper): diff --git a/dlt/destinations/impl/mssql/sql_client.py b/dlt/destinations/impl/mssql/sql_client.py index 427518feeb..cd1699adea 100644 --- a/dlt/destinations/impl/mssql/sql_client.py +++ b/dlt/destinations/impl/mssql/sql_client.py @@ -106,8 +106,8 @@ def drop_dataset(self) -> None: ) table_names = [row[0] for row in rows] self.drop_tables(*table_names) - - self.execute_sql("DROP SCHEMA IF EXISTS %s;" % self.fully_qualified_dataset_name()) + # Drop schema + self._drop_schema() def _drop_views(self, *tables: str) -> None: if not tables: @@ -115,7 +115,10 @@ def _drop_views(self, *tables: str) -> None: statements = [ f"DROP VIEW IF EXISTS {self.make_qualified_table_name(table)};" for table in tables ] - self.execute_fragments(statements) + self.execute_many(statements) + + def _drop_schema(self) -> None: + self.execute_sql("DROP SCHEMA IF EXISTS %s;" % self.fully_qualified_dataset_name()) def execute_sql( self, sql: AnyStr, *args: Any, **kwargs: Any diff --git a/dlt/destinations/impl/qdrant/qdrant_adapter.py b/dlt/destinations/impl/qdrant/qdrant_adapter.py index 243cbd6c5b..215d87a920 100644 --- a/dlt/destinations/impl/qdrant/qdrant_adapter.py +++ b/dlt/destinations/impl/qdrant/qdrant_adapter.py @@ -2,6 +2,7 @@ from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns from dlt.extract import DltResource, resource as make_resource +from dlt.destinations.utils import ensure_resource VECTORIZE_HINT = "x-qdrant-embed" @@ -31,15 +32,7 @@ def qdrant_adapter( >>> qdrant_adapter(data, embed="description") [DltResource with hints applied] """ - # wrap `data` in a resource if not an instance already - resource: DltResource - if not isinstance(data, DltResource): - resource_name: str = None - if not hasattr(data, "__name__"): - resource_name = "content" - resource = make_resource(data, name=resource_name) - else: - resource = data + resource = ensure_resource(data) column_hints: TTableSchemaColumns = {} diff --git a/dlt/destinations/impl/snowflake/__init__.py b/dlt/destinations/impl/snowflake/__init__.py index d6bebd3fdd..dde4d5a382 100644 --- a/dlt/destinations/impl/snowflake/__init__.py +++ b/dlt/destinations/impl/snowflake/__init__.py @@ -21,4 +21,5 @@ def capabilities() -> DestinationCapabilitiesContext: caps.is_max_text_data_type_length_in_bytes = True caps.supports_ddl_transactions = True caps.alter_add_multi_column = True + caps.supports_clone_table = True return caps diff --git a/dlt/destinations/impl/snowflake/snowflake.py b/dlt/destinations/impl/snowflake/snowflake.py index 67df78c138..fb51ab9d36 100644 --- a/dlt/destinations/impl/snowflake/snowflake.py +++ b/dlt/destinations/impl/snowflake/snowflake.py @@ -27,7 +27,7 @@ from dlt.destinations.impl.snowflake import 
capabilities from dlt.destinations.impl.snowflake.configuration import SnowflakeClientConfiguration from dlt.destinations.impl.snowflake.sql_client import SnowflakeSqlClient -from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlJobParams +from dlt.destinations.sql_jobs import SqlJobParams from dlt.destinations.impl.snowflake.sql_client import SnowflakeSqlClient from dlt.destinations.job_impl import NewReferenceJob from dlt.destinations.sql_client import SqlClientBase @@ -175,13 +175,15 @@ def __init__( f'PUT file://{file_path} @{stage_name}/"{load_id}" OVERWRITE = TRUE,' " AUTO_COMPRESS = FALSE" ) - client.execute_sql(f"""COPY INTO {qualified_table_name} + client.execute_sql( + f"""COPY INTO {qualified_table_name} {from_clause} {files_clause} {credentials_clause} FILE_FORMAT = {source_format} MATCH_BY_COLUMN_NAME='CASE_INSENSITIVE' - """) + """ + ) if stage_file_path and not keep_staged_files: client.execute_sql(f"REMOVE {stage_file_path}") @@ -192,25 +194,6 @@ def exception(self) -> str: raise NotImplementedError() -class SnowflakeStagingCopyJob(SqlStagingCopyJob): - @classmethod - def generate_sql( - cls, - table_chain: Sequence[TTableSchema], - sql_client: SqlClientBase[Any], - params: Optional[SqlJobParams] = None, - ) -> List[str]: - sql: List[str] = [] - for table in table_chain: - with sql_client.with_staging_dataset(staging=True): - staging_table_name = sql_client.make_qualified_table_name(table["name"]) - table_name = sql_client.make_qualified_table_name(table["name"]) - sql.append(f"DROP TABLE IF EXISTS {table_name};") - # recreate destination table with data cloned from staging table - sql.append(f"CREATE TABLE {table_name} CLONE {staging_table_name};") - return sql - - class SnowflakeClient(SqlJobClientWithStaging, SupportsStagingDestination): capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() @@ -250,13 +233,6 @@ def _make_add_column_sql( + ",\n".join(self._get_column_def_sql(c, table_format) for c in new_columns) ] - def _create_replace_followup_jobs( - self, table_chain: Sequence[TTableSchema] - ) -> List[NewLoadJob]: - if self.config.replace_strategy == "staging-optimized": - return [SnowflakeStagingCopyJob.from_table_chain(table_chain, self.sql_client)] - return super()._create_replace_followup_jobs(table_chain) - def _get_table_update_sql( self, table_name: str, diff --git a/dlt/destinations/impl/synapse/__init__.py b/dlt/destinations/impl/synapse/__init__.py new file mode 100644 index 0000000000..53dbabc090 --- /dev/null +++ b/dlt/destinations/impl/synapse/__init__.py @@ -0,0 +1,48 @@ +from dlt.common.data_writers.escape import escape_postgres_identifier, escape_mssql_literal +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE +from dlt.common.wei import EVM_DECIMAL_PRECISION + +from dlt.destinations.impl.synapse.synapse_adapter import synapse_adapter + + +def capabilities() -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + + caps.preferred_loader_file_format = "insert_values" + caps.supported_loader_file_formats = ["insert_values"] + caps.preferred_staging_file_format = "parquet" + caps.supported_staging_file_formats = ["parquet"] + + caps.insert_values_writer_type = "select_union" # https://stackoverflow.com/a/77014299 + + caps.escape_identifier = escape_postgres_identifier + caps.escape_literal = escape_mssql_literal + + # Synapse has a max precision of 38 + # 
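With `supports_clone_table = True` set for BigQuery, Snowflake and Databricks, the per-destination staging copy jobs deleted above are no longer needed; the "staging-optimized" replace presumably now comes from the shared SQL job, which can emit the same `DROP` + `CREATE TABLE ... CLONE` pair the deleted classes generated. A sketch of that statement generation, with an illustrative function name and placeholder dataset/table names:

```py
from typing import List, Sequence

def clone_replace_sql(table_names: Sequence[str], staging_dataset: str, dataset: str) -> List[str]:
    sql: List[str] = []
    for name in table_names:
        staging_table = f"{staging_dataset}.{name}"
        table = f"{dataset}.{name}"
        # drop the destination table, then zero-copy clone the freshly loaded staging table
        sql.append(f"DROP TABLE IF EXISTS {table};")
        sql.append(f"CREATE TABLE {table} CLONE {staging_table};")
    return sql

for stmt in clone_replace_sql(["orders", "orders__items"], "events_staging", "events"):
    print(stmt)
```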
https://learn.microsoft.com/en-us/sql/t-sql/statements/create-table-azure-sql-data-warehouse?view=aps-pdw-2016-au7#DataTypes + caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) + caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) + + # https://learn.microsoft.com/en-us/sql/t-sql/statements/create-table-azure-sql-data-warehouse?view=aps-pdw-2016-au7#LimitationsRestrictions + caps.max_identifier_length = 128 + caps.max_column_identifier_length = 128 + + # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-service-capacity-limits#queries + caps.max_query_length = 65536 * 4096 + caps.is_max_query_length_in_bytes = True + + # nvarchar(max) can store 2 GB + # https://learn.microsoft.com/en-us/sql/t-sql/data-types/nchar-and-nvarchar-transact-sql?view=sql-server-ver16#nvarchar---n--max-- + caps.max_text_data_type_length = 2 * 1024 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = True + + # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-develop-transactions + caps.supports_transactions = True + caps.supports_ddl_transactions = False + + # datetimeoffset can store 7 digits for fractional seconds + # https://learn.microsoft.com/en-us/sql/t-sql/data-types/datetimeoffset-transact-sql?view=sql-server-ver16 + caps.timestamp_precision = 7 + + return caps diff --git a/dlt/destinations/impl/synapse/configuration.py b/dlt/destinations/impl/synapse/configuration.py new file mode 100644 index 0000000000..bb1ba632dc --- /dev/null +++ b/dlt/destinations/impl/synapse/configuration.py @@ -0,0 +1,63 @@ +from typing import Final, Any, List, Dict, Optional, ClassVar + +from dlt.common import logger +from dlt.common.configuration import configspec +from dlt.common.schema.typing import TSchemaTables +from dlt.common.schema.utils import get_write_disposition + +from dlt.destinations.impl.mssql.configuration import ( + MsSqlCredentials, + MsSqlClientConfiguration, +) +from dlt.destinations.impl.mssql.configuration import MsSqlCredentials + +from dlt.destinations.impl.synapse.synapse_adapter import TTableIndexType + + +@configspec +class SynapseCredentials(MsSqlCredentials): + drivername: Final[str] = "synapse" # type: ignore + + # LongAsMax keyword got introduced in ODBC Driver 18 for SQL Server. + SUPPORTED_DRIVERS: ClassVar[List[str]] = ["ODBC Driver 18 for SQL Server"] + + def _get_odbc_dsn_dict(self) -> Dict[str, Any]: + params = super()._get_odbc_dsn_dict() + # Long types (text, ntext, image) are not supported on Synapse. + # Convert to max types using LongAsMax keyword. + # https://stackoverflow.com/a/57926224 + params["LONGASMAX"] = "yes" + return params + + +@configspec +class SynapseClientConfiguration(MsSqlClientConfiguration): + destination_type: Final[str] = "synapse" # type: ignore + credentials: SynapseCredentials + + # While Synapse uses CLUSTERED COLUMNSTORE INDEX tables by default, we use + # HEAP tables (no indexing) by default. HEAP is a more robust choice, because + # columnstore tables do not support varchar(max), nvarchar(max), and varbinary(max). + # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index + default_table_index_type: Optional[TTableIndexType] = "heap" + """ + Table index type that is used if no table index type is specified on the resource. + This only affects data tables, dlt system tables ignore this setting and + are always created as "heap" tables. 
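`SynapseCredentials` above restricts the driver list to ODBC Driver 18 and extends the ODBC DSN with `LONGASMAX=yes`, because Synapse does not support the legacy long types. Building on the `_get_odbc_dsn_dict`/`to_odbc_dsn` split introduced for MS SQL earlier in this diff, the override pattern looks roughly like this; the classes and connection values are simplified stand-ins:

```py
from typing import Any, Dict

class MsSqlDsn:
    def _get_odbc_dsn_dict(self) -> Dict[str, Any]:
        return {
            "DRIVER": "ODBC Driver 18 for SQL Server",
            "SERVER": "myworkspace.sql.azuresynapse.net,1433",  # placeholder
            "DATABASE": "dlt_data",
            "UID": "loader",
            "PWD": "***",
        }

    def to_odbc_dsn(self) -> str:
        params = self._get_odbc_dsn_dict()
        return ";".join(f"{k}={v}" for k, v in params.items())

class SynapseDsn(MsSqlDsn):
    def _get_odbc_dsn_dict(self) -> Dict[str, Any]:
        params = super()._get_odbc_dsn_dict()
        # text/ntext/image are unsupported on Synapse: map long types to the (max) types
        params["LONGASMAX"] = "yes"
        return params

assert "LONGASMAX=yes" in SynapseDsn().to_odbc_dsn()
```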
+ """ + + # Set to False by default because the PRIMARY KEY and UNIQUE constraints + # are tricky in Synapse: they are NOT ENFORCED and can lead to innacurate + # results if the user does not ensure all column values are unique. + # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-table-constraints + create_indexes: bool = False + """Whether `primary_key` and `unique` column hints are applied.""" + + staging_use_msi: bool = False + """Whether the managed identity of the Synapse workspace is used to authorize access to the staging Storage Account.""" + + __config_gen_annotations__: ClassVar[List[str]] = [ + "default_table_index_type", + "create_indexes", + "staging_use_msi", + ] diff --git a/dlt/destinations/impl/synapse/factory.py b/dlt/destinations/impl/synapse/factory.py new file mode 100644 index 0000000000..b7eddd6ef7 --- /dev/null +++ b/dlt/destinations/impl/synapse/factory.py @@ -0,0 +1,58 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +from dlt.destinations.impl.synapse import capabilities +from dlt.destinations.impl.synapse.configuration import ( + SynapseCredentials, + SynapseClientConfiguration, +) +from dlt.destinations.impl.synapse.synapse_adapter import TTableIndexType + +if t.TYPE_CHECKING: + from dlt.destinations.impl.synapse.synapse import SynapseClient + + +class synapse(Destination[SynapseClientConfiguration, "SynapseClient"]): + spec = SynapseClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["SynapseClient"]: + from dlt.destinations.impl.synapse.synapse import SynapseClient + + return SynapseClient + + def __init__( + self, + credentials: t.Union[SynapseCredentials, t.Dict[str, t.Any], str] = None, + default_table_index_type: t.Optional[TTableIndexType] = "heap", + create_indexes: bool = False, + staging_use_msi: bool = False, + destination_name: t.Optional[str] = None, + environment: t.Optional[str] = None, + **kwargs: t.Any, + ) -> None: + """Configure the Synapse destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + credentials: Credentials to connect to the Synapse dedicated pool. Can be an instance of `SynapseCredentials` or + a connection string in the format `synapse://user:password@host:port/database` + default_table_index_type: Maps directly to the default_table_index_type attribute of the SynapseClientConfiguration object. + create_indexes: Maps directly to the create_indexes attribute of the SynapseClientConfiguration object. + staging_use_msi: Maps directly to the staging_use_msi attribute of the SynapseClientConfiguration object. 
+ **kwargs: Additional arguments passed to the destination config + """ + super().__init__( + credentials=credentials, + default_table_index_type=default_table_index_type, + create_indexes=create_indexes, + staging_use_msi=staging_use_msi, + destination_name=destination_name, + environment=environment, + **kwargs, + ) diff --git a/dlt/destinations/impl/synapse/sql_client.py b/dlt/destinations/impl/synapse/sql_client.py new file mode 100644 index 0000000000..089c58e57c --- /dev/null +++ b/dlt/destinations/impl/synapse/sql_client.py @@ -0,0 +1,28 @@ +from typing import ClassVar +from contextlib import suppress + +from dlt.common.destination import DestinationCapabilitiesContext + +from dlt.destinations.impl.mssql.sql_client import PyOdbcMsSqlClient +from dlt.destinations.impl.mssql.configuration import MsSqlCredentials +from dlt.destinations.impl.synapse import capabilities +from dlt.destinations.impl.synapse.configuration import SynapseCredentials + +from dlt.destinations.exceptions import DatabaseUndefinedRelation + + +class SynapseSqlClient(PyOdbcMsSqlClient): + capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() + + def drop_tables(self, *tables: str) -> None: + if not tables: + return + # Synapse does not support DROP TABLE IF EXISTS. + # Workaround: use DROP TABLE and suppress non-existence errors. + statements = [f"DROP TABLE {self.make_qualified_table_name(table)};" for table in tables] + with suppress(DatabaseUndefinedRelation): + self.execute_fragments(statements) + + def _drop_schema(self) -> None: + # Synapse does not support DROP SCHEMA IF EXISTS. + self.execute_sql("DROP SCHEMA %s;" % self.fully_qualified_dataset_name()) diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py new file mode 100644 index 0000000000..33e6194602 --- /dev/null +++ b/dlt/destinations/impl/synapse/synapse.py @@ -0,0 +1,297 @@ +import os +from typing import ClassVar, Sequence, List, Dict, Any, Optional, cast +from copy import deepcopy +from textwrap import dedent +from urllib.parse import urlparse, urlunparse + +from dlt import current + +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.destination.reference import ( + SupportsStagingDestination, + NewLoadJob, + CredentialsConfiguration, +) + +from dlt.common.schema import TTableSchema, TColumnSchema, Schema, TColumnHint +from dlt.common.schema.utils import table_schema_has_type, get_inherited_table_hint +from dlt.common.schema.typing import TTableSchemaColumns + +from dlt.common.configuration.specs import AzureCredentialsWithoutDefaults + +from dlt.destinations.job_impl import NewReferenceJob +from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlJobParams +from dlt.destinations.sql_client import SqlClientBase +from dlt.destinations.insert_job_client import InsertValuesJobClient +from dlt.destinations.job_client_impl import SqlJobClientBase, LoadJob, CopyRemoteFileLoadJob +from dlt.destinations.exceptions import LoadJobTerminalException + +from dlt.destinations.impl.mssql.mssql import ( + MsSqlTypeMapper, + MsSqlClient, + VARCHAR_MAX_N, + VARBINARY_MAX_N, +) + +from dlt.destinations.impl.synapse import capabilities +from dlt.destinations.impl.synapse.sql_client import SynapseSqlClient +from dlt.destinations.impl.synapse.configuration import SynapseClientConfiguration +from dlt.destinations.impl.synapse.synapse_adapter import ( + TABLE_INDEX_TYPE_HINT, + TTableIndexType, +) + + +HINT_TO_SYNAPSE_ATTR: Dict[TColumnHint, str] = { + "primary_key": "PRIMARY 
KEY NONCLUSTERED NOT ENFORCED", + "unique": "UNIQUE NOT ENFORCED", +} +TABLE_INDEX_TYPE_TO_SYNAPSE_ATTR: Dict[TTableIndexType, str] = { + "heap": "HEAP", + "clustered_columnstore_index": "CLUSTERED COLUMNSTORE INDEX", +} + + +class SynapseClient(MsSqlClient, SupportsStagingDestination): + capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() + + def __init__(self, schema: Schema, config: SynapseClientConfiguration) -> None: + super().__init__(schema, config) + self.config: SynapseClientConfiguration = config + self.sql_client = SynapseSqlClient( + config.normalize_dataset_name(schema), config.credentials + ) + + self.active_hints = deepcopy(HINT_TO_SYNAPSE_ATTR) + if not self.config.create_indexes: + self.active_hints.pop("primary_key", None) + self.active_hints.pop("unique", None) + + def _get_table_update_sql( + self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool + ) -> List[str]: + table = self.get_load_table(table_name, staging=self.in_staging_mode) + if table is None: + table_index_type = self.config.default_table_index_type + else: + table_index_type = cast(TTableIndexType, table.get(TABLE_INDEX_TYPE_HINT)) + if self.in_staging_mode: + final_table = self.get_load_table(table_name, staging=False) + final_table_index_type = cast( + TTableIndexType, final_table.get(TABLE_INDEX_TYPE_HINT) + ) + else: + final_table_index_type = table_index_type + if final_table_index_type == "clustered_columnstore_index": + # Even if the staging table has index type "heap", we still adjust + # the column data types to prevent errors when writing into the + # final table that has index type "clustered_columnstore_index". + new_columns = self._get_columstore_valid_columns(new_columns) + + _sql_result = SqlJobClientBase._get_table_update_sql( + self, table_name, new_columns, generate_alter + ) + if not generate_alter: + table_index_type_attr = TABLE_INDEX_TYPE_TO_SYNAPSE_ATTR[table_index_type] + sql_result = [_sql_result[0] + f"\n WITH ( {table_index_type_attr} );"] + else: + sql_result = _sql_result + return sql_result + + def _get_columstore_valid_columns( + self, columns: Sequence[TColumnSchema] + ) -> Sequence[TColumnSchema]: + return [self._get_columstore_valid_column(c) for c in columns] + + def _get_columstore_valid_column(self, c: TColumnSchema) -> TColumnSchema: + """ + Returns TColumnSchema that maps to a Synapse data type that can participate in a columnstore index. + + varchar(max), nvarchar(max), and varbinary(max) are replaced with + varchar(n), nvarchar(n), and varbinary(n), respectively, where + n equals the user-specified precision, or the maximum allowed + value if the user did not specify a precision. 
+ """ + varchar_source_types = [ + sct + for sct, dbt in MsSqlTypeMapper.sct_to_unbound_dbt.items() + if dbt in ("varchar(max)", "nvarchar(max)") + ] + varbinary_source_types = [ + sct + for sct, dbt in MsSqlTypeMapper.sct_to_unbound_dbt.items() + if dbt == "varbinary(max)" + ] + if c["data_type"] in varchar_source_types and "precision" not in c: + return {**c, **{"precision": VARCHAR_MAX_N}} + elif c["data_type"] in varbinary_source_types and "precision" not in c: + return {**c, **{"precision": VARBINARY_MAX_N}} + return c + + def _create_replace_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[NewLoadJob]: + if self.config.replace_strategy == "staging-optimized": + return [SynapseStagingCopyJob.from_table_chain(table_chain, self.sql_client)] + return super()._create_replace_followup_jobs(table_chain) + + def get_load_table(self, table_name: str, staging: bool = False) -> TTableSchema: + table = super().get_load_table(table_name, staging) + if table is None: + return None + if staging and self.config.replace_strategy == "insert-from-staging": + # Staging tables should always be heap tables, because "when you are + # temporarily landing data in dedicated SQL pool, you may find that + # using a heap table makes the overall process faster." + # "staging-optimized" is not included, because in that strategy the + # staging table becomes the final table, so we should already create + # it with the desired index type. + table[TABLE_INDEX_TYPE_HINT] = "heap" # type: ignore[typeddict-unknown-key] + elif table_name in self.schema.dlt_table_names(): + # dlt tables should always be heap tables, because "for small lookup + # tables, less than 60 million rows, consider using HEAP or clustered + # index for faster query performance." + table[TABLE_INDEX_TYPE_HINT] = "heap" # type: ignore[typeddict-unknown-key] + # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables + elif table_name in self.schema.data_table_names(): + if TABLE_INDEX_TYPE_HINT not in table: + # If present in parent table, fetch hint from there. + table[TABLE_INDEX_TYPE_HINT] = get_inherited_table_hint( # type: ignore[typeddict-unknown-key] + self.schema.tables, table_name, TABLE_INDEX_TYPE_HINT, allow_none=True + ) + if table[TABLE_INDEX_TYPE_HINT] is None: # type: ignore[typeddict-item] + # Hint still not defined, fall back to default. 
+ table[TABLE_INDEX_TYPE_HINT] = self.config.default_table_index_type # type: ignore[typeddict-unknown-key] + return table + + def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: + job = super().start_file_load(table, file_path, load_id) + if not job: + assert NewReferenceJob.is_reference_job( + file_path + ), "Synapse must use staging to load files" + job = SynapseCopyFileLoadJob( + table, + file_path, + self.sql_client, + cast(AzureCredentialsWithoutDefaults, self.config.staging_config.credentials), + self.config.staging_use_msi, + ) + return job + + +class SynapseStagingCopyJob(SqlStagingCopyJob): + @classmethod + def generate_sql( + cls, + table_chain: Sequence[TTableSchema], + sql_client: SqlClientBase[Any], + params: Optional[SqlJobParams] = None, + ) -> List[str]: + sql: List[str] = [] + for table in table_chain: + with sql_client.with_staging_dataset(staging=True): + staging_table_name = sql_client.make_qualified_table_name(table["name"]) + table_name = sql_client.make_qualified_table_name(table["name"]) + # drop destination table + sql.append(f"DROP TABLE {table_name};") + # moving staging table to destination schema + sql.append( + f"ALTER SCHEMA {sql_client.fully_qualified_dataset_name()} TRANSFER" + f" {staging_table_name};" + ) + # recreate staging table + job_client = current.pipeline().destination_client() # type: ignore[operator] + with job_client.with_staging_dataset(): + # get table columns from schema + columns = [c for c in job_client.schema.get_table_columns(table["name"]).values()] + # generate CREATE TABLE statement + create_table_stmt = job_client._get_table_update_sql( + table["name"], columns, generate_alter=False + ) + sql.extend(create_table_stmt) + + return sql + + +class SynapseCopyFileLoadJob(CopyRemoteFileLoadJob): + def __init__( + self, + table: TTableSchema, + file_path: str, + sql_client: SqlClientBase[Any], + staging_credentials: Optional[AzureCredentialsWithoutDefaults] = None, + staging_use_msi: bool = False, + ) -> None: + self.staging_use_msi = staging_use_msi + super().__init__(table, file_path, sql_client, staging_credentials) + + def execute(self, table: TTableSchema, bucket_path: str) -> None: + # get format + ext = os.path.splitext(bucket_path)[1][1:] + if ext == "parquet": + if table_schema_has_type(table, "time"): + # Synapse interprets Parquet TIME columns as bigint, resulting in + # an incompatibility error. + raise LoadJobTerminalException( + self.file_name(), + "Synapse cannot load TIME columns from Parquet files. Switch to direct INSERT" + " file format or convert `datetime.time` objects in your data to `str` or" + " `datetime.datetime`", + ) + file_type = "PARQUET" + + # dlt-generated DDL statements will still create the table, but + # enabling AUTO_CREATE_TABLE prevents a MalformedInputException. 
+ auto_create_table = "ON" + else: + raise ValueError(f"Unsupported file type {ext} for Synapse.") + + staging_credentials = self._staging_credentials + assert staging_credentials is not None + assert isinstance(staging_credentials, AzureCredentialsWithoutDefaults) + azure_storage_account_name = staging_credentials.azure_storage_account_name + https_path = self._get_https_path(bucket_path, azure_storage_account_name) + table_name = table["name"] + + if self.staging_use_msi: + credential = "IDENTITY = 'Managed Identity'" + else: + sas_token = staging_credentials.azure_storage_sas_token + credential = f"IDENTITY = 'Shared Access Signature', SECRET = '{sas_token}'" + + # Copy data from staging file into Synapse table. + with self._sql_client.begin_transaction(): + dataset_name = self._sql_client.dataset_name + sql = dedent(f""" + COPY INTO [{dataset_name}].[{table_name}] + FROM '{https_path}' + WITH ( + FILE_TYPE = '{file_type}', + CREDENTIAL = ({credential}), + AUTO_CREATE_TABLE = '{auto_create_table}' + ) + """) + self._sql_client.execute_sql(sql) + + def exception(self) -> str: + # this part of code should be never reached + raise NotImplementedError() + + def _get_https_path(self, bucket_path: str, storage_account_name: str) -> str: + """ + Converts a path in the form of az:/// to + https://.blob.core.windows.net// + as required by Synapse. + """ + bucket_url = urlparse(bucket_path) + # "blob" endpoint has better performance than "dfs" endoint + # https://learn.microsoft.com/en-us/sql/t-sql/statements/copy-into-transact-sql?view=azure-sqldw-latest#external-locations + endpoint = "blob" + _path = "/" + bucket_url.netloc + bucket_url.path + https_url = bucket_url._replace( + scheme="https", + netloc=f"{storage_account_name}.{endpoint}.core.windows.net", + path=_path, + ) + return urlunparse(https_url) diff --git a/dlt/destinations/impl/synapse/synapse_adapter.py b/dlt/destinations/impl/synapse/synapse_adapter.py new file mode 100644 index 0000000000..24932736f9 --- /dev/null +++ b/dlt/destinations/impl/synapse/synapse_adapter.py @@ -0,0 +1,52 @@ +from typing import Any, Literal, Set, get_args, Final, Dict + +from dlt.extract import DltResource, resource as make_resource +from dlt.extract.typing import TTableHintTemplate +from dlt.extract.hints import TResourceHints +from dlt.destinations.utils import ensure_resource + +TTableIndexType = Literal["heap", "clustered_columnstore_index"] +""" +Table [index type](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index) used when creating the Synapse table. +This regards indexes specified at the table level, not the column level. +""" +TABLE_INDEX_TYPES: Set[TTableIndexType] = set(get_args(TTableIndexType)) + +TABLE_INDEX_TYPE_HINT: Literal["x-table-index-type"] = "x-table-index-type" + + +def synapse_adapter(data: Any, table_index_type: TTableIndexType = None) -> DltResource: + """Prepares data for the Synapse destination by specifying which table index + type should be used. + + Args: + data (Any): The data to be transformed. It can be raw data or an instance + of DltResource. If raw data, the function wraps it into a DltResource + object. + table_index_type (TTableIndexType, optional): The table index type used when creating + the Synapse table. + + Returns: + DltResource: A resource with applied Synapse-specific hints. + + Raises: + ValueError: If input for `table_index_type` is invalid. 
+ + Examples: + >>> data = [{"name": "Anush", "description": "Integrations Hacker"}] + >>> synapse_adapter(data, table_index_type="clustered_columnstore_index") + [DltResource with hints applied] + """ + resource = ensure_resource(data) + + additional_table_hints: Dict[str, TTableHintTemplate[Any]] = {} + if table_index_type is not None: + if table_index_type not in TABLE_INDEX_TYPES: + allowed_types = ", ".join(TABLE_INDEX_TYPES) + raise ValueError( + f"Table index type {table_index_type} is invalid. Allowed table index" + f" types are: {allowed_types}." + ) + additional_table_hints[TABLE_INDEX_TYPE_HINT] = table_index_type + resource.apply_hints(additional_table_hints=additional_table_hints) + return resource diff --git a/dlt/destinations/impl/weaviate/weaviate_adapter.py b/dlt/destinations/impl/weaviate/weaviate_adapter.py index 2d5161d9e9..a290ac65b4 100644 --- a/dlt/destinations/impl/weaviate/weaviate_adapter.py +++ b/dlt/destinations/impl/weaviate/weaviate_adapter.py @@ -2,6 +2,7 @@ from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns from dlt.extract import DltResource, resource as make_resource +from dlt.destinations.utils import ensure_resource TTokenizationTMethod = Literal["word", "lowercase", "whitespace", "field"] TOKENIZATION_METHODS: Set[TTokenizationTMethod] = set(get_args(TTokenizationTMethod)) @@ -53,15 +54,7 @@ def weaviate_adapter( >>> weaviate_adapter(data, vectorize="description", tokenization={"description": "word"}) [DltResource with hints applied] """ - # wrap `data` in a resource if not an instance already - resource: DltResource - if not isinstance(data, DltResource): - resource_name: str = None - if not hasattr(data, "__name__"): - resource_name = "content" - resource = make_resource(data, name=resource_name) - else: - resource = data + resource = ensure_resource(data) column_hints: TTableSchemaColumns = {} if vectorize: diff --git a/dlt/destinations/insert_job_client.py b/dlt/destinations/insert_job_client.py index 678ba43bcc..776176078e 100644 --- a/dlt/destinations/insert_job_client.py +++ b/dlt/destinations/insert_job_client.py @@ -36,9 +36,10 @@ def _insert(self, qualified_table_name: str, file_path: str) -> Iterator[List[st # the procedure below will split the inserts into max_query_length // 2 packs with FileStorage.open_zipsafe_ro(file_path, "r", encoding="utf-8") as f: header = f.readline() - values_mark = f.readline() - # properly formatted file has a values marker at the beginning - assert values_mark == "VALUES\n" + if self._sql_client.capabilities.insert_values_writer_type == "default": + # properly formatted file has a values marker at the beginning + values_mark = f.readline() + assert values_mark == "VALUES\n" max_rows = self._sql_client.capabilities.max_rows_per_insert @@ -67,7 +68,9 @@ def _insert(self, qualified_table_name: str, file_path: str) -> Iterator[List[st # Chunk by max_rows - 1 for simplicity because one more row may be added for chunk in chunks(values_rows, max_rows - 1): processed += len(chunk) - insert_sql.extend([header.format(qualified_table_name), values_mark]) + insert_sql.append(header.format(qualified_table_name)) + if self._sql_client.capabilities.insert_values_writer_type == "default": + insert_sql.append(values_mark) if processed == len_rows: # On the last chunk we need to add the extra row read insert_sql.append("".join(chunk) + until_nl) @@ -76,7 +79,12 @@ def _insert(self, qualified_table_name: str, file_path: str) -> Iterator[List[st insert_sql.append("".join(chunk).strip()[:-1] + ";\n") else: # 
otherwise write all content in a single INSERT INTO - insert_sql.extend([header.format(qualified_table_name), values_mark, content]) + if self._sql_client.capabilities.insert_values_writer_type == "default": + insert_sql.extend( + [header.format(qualified_table_name), values_mark, content] + ) + elif self._sql_client.capabilities.insert_values_writer_type == "select_union": + insert_sql.extend([header.format(qualified_table_name), content]) if until_nl: insert_sql.append(until_nl) diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index ac68cfea8a..e7dc4bcbe2 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -50,7 +50,6 @@ FollowupJob, CredentialsConfiguration, ) -from dlt.common.utils import concat_strings_with_limit from dlt.destinations.exceptions import ( DatabaseUndefinedRelation, DestinationSchemaTampered, @@ -76,15 +75,19 @@ def __init__(self, file_path: str, sql_client: SqlClientBase[Any]) -> None: with FileStorage.open_zipsafe_ro(file_path, "r", encoding="utf-8") as f: sql = f.read() + # Some clients (e.g. databricks) do not support multiple statements in one execute call + if not sql_client.capabilities.supports_multiple_statements: + sql_client.execute_many(self._split_fragments(sql)) # if we detect ddl transactions, only execute transaction if supported by client - if ( + elif ( not self._string_containts_ddl_queries(sql) or sql_client.capabilities.supports_ddl_transactions ): # with sql_client.begin_transaction(): sql_client.execute_sql(sql) else: - sql_client.execute_sql(sql) + # sql_client.execute_sql(sql) + sql_client.execute_many(self._split_fragments(sql)) def state(self) -> TLoadJobState: # this job is always done @@ -100,6 +103,9 @@ def _string_containts_ddl_queries(self, sql: str) -> bool: return True return False + def _split_fragments(self, sql: str) -> List[str]: + return [s + (";" if not s.endswith(";") else "") for s in sql.split(";") if s.strip()] + @staticmethod def is_sql_job(file_path: str) -> bool: return os.path.splitext(file_path)[1][1:] == "sql" @@ -295,6 +301,15 @@ def __exit__( ) -> None: self.sql_client.close_connection() + def _get_storage_table_query_columns(self) -> List[str]: + """Column names used when querying table from information schema. + Override for databases that use different namings. + """ + fields = ["column_name", "data_type", "is_nullable"] + if self.capabilities.schema_supports_numeric_precision: + fields += ["numeric_precision", "numeric_scale"] + return fields + def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]: def _null_to_bool(v: str) -> bool: if v == "NO": @@ -303,9 +318,7 @@ def _null_to_bool(v: str) -> bool: return True raise ValueError(v) - fields = ["column_name", "data_type", "is_nullable"] - if self.capabilities.schema_supports_numeric_precision: - fields += ["numeric_precision", "numeric_scale"] + fields = self._get_storage_table_query_columns() db_params = self.sql_client.make_qualified_table_name(table_name, escape=False).split( ".", 3 ) @@ -383,10 +396,7 @@ def _execute_schema_update_sql(self, only_tables: Iterable[str]) -> TSchemaTable sql_scripts, schema_update = self._build_schema_update_sql(only_tables) # stay within max query size when doing DDL. 
some db backends use bytes not characters so decrease limit by half # assuming that most of the characters in DDL encode into single bytes - for sql_fragment in concat_strings_with_limit( - sql_scripts, "\n", self.capabilities.max_query_length // 2 - ): - self.sql_client.execute_sql(sql_fragment) + self.sql_client.execute_many(sql_scripts) self._update_schema_in_storage(self.schema) return schema_update diff --git a/dlt/destinations/sql_client.py b/dlt/destinations/sql_client.py index 1e5f7031a5..695f1a0972 100644 --- a/dlt/destinations/sql_client.py +++ b/dlt/destinations/sql_client.py @@ -19,8 +19,13 @@ from dlt.common.typing import TFun from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.utils import concat_strings_with_limit -from dlt.destinations.exceptions import DestinationConnectionError, LoadClientNotConnected +from dlt.destinations.exceptions import ( + DestinationConnectionError, + LoadClientNotConnected, + DatabaseTerminalException, +) from dlt.destinations.typing import DBApi, TNativeConn, DBApiCursor, DataFrame, DBTransaction @@ -86,7 +91,7 @@ def drop_dataset(self) -> None: def truncate_tables(self, *tables: str) -> None: statements = [self._truncate_table_sql(self.make_qualified_table_name(t)) for t in tables] - self.execute_fragments(statements) + self.execute_many(statements) def drop_tables(self, *tables: str) -> None: if not tables: @@ -94,7 +99,7 @@ def drop_tables(self, *tables: str) -> None: statements = [ f"DROP TABLE IF EXISTS {self.make_qualified_table_name(table)};" for table in tables ] - self.execute_fragments(statements) + self.execute_many(statements) @abstractmethod def execute_sql( @@ -114,6 +119,25 @@ def execute_fragments( """Executes several SQL fragments as efficiently as possible to prevent data copying. Default implementation just joins the strings and executes them together.""" return self.execute_sql("".join(fragments), *args, **kwargs) # type: ignore + def execute_many( + self, statements: Sequence[str], *args: Any, **kwargs: Any + ) -> Optional[Sequence[Sequence[Any]]]: + """Executes multiple SQL statements as efficiently as possible. When client supports multiple statements in a single query + they are executed together in as few database calls as possible. 
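When the client does not support multiple statements, each statement is executed separately and any non-None results are collected and returned.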
+ """ + ret = [] + if self.capabilities.supports_multiple_statements: + for sql_fragment in concat_strings_with_limit( + list(statements), "\n", self.capabilities.max_query_length // 2 + ): + ret.append(self.execute_sql(sql_fragment, *args, **kwargs)) + else: + for statement in statements: + result = self.execute_sql(statement, *args, **kwargs) + if result is not None: + ret.append(result) + return ret + @abstractmethod def fully_qualified_dataset_name(self, escape: bool = True) -> str: pass diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index d97a098669..d0911d0bea 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -74,11 +74,28 @@ class SqlStagingCopyJob(SqlBaseJob): failed_text: str = "Tried to generate a staging copy sql job for the following tables:" @classmethod - def generate_sql( + def _generate_clone_sql( cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], - params: Optional[SqlJobParams] = None, + ) -> List[str]: + """Drop and clone the table for supported destinations""" + sql: List[str] = [] + for table in table_chain: + with sql_client.with_staging_dataset(staging=True): + staging_table_name = sql_client.make_qualified_table_name(table["name"]) + table_name = sql_client.make_qualified_table_name(table["name"]) + sql.append(f"DROP TABLE IF EXISTS {table_name};") + # recreate destination table with data cloned from staging table + sql.append(f"CREATE TABLE {table_name} CLONE {staging_table_name};") + return sql + + @classmethod + def _generate_insert_sql( + cls, + table_chain: Sequence[TTableSchema], + sql_client: SqlClientBase[Any], + params: SqlJobParams = None, ) -> List[str]: sql: List[str] = [] for table in table_chain: @@ -98,6 +115,17 @@ def generate_sql( ) return sql + @classmethod + def generate_sql( + cls, + table_chain: Sequence[TTableSchema], + sql_client: SqlClientBase[Any], + params: SqlJobParams = None, + ) -> List[str]: + if params["replace"] and sql_client.capabilities.supports_clone_table: + return cls._generate_clone_sql(table_chain, sql_client) + return cls._generate_insert_sql(table_chain, sql_client, params) + class SqlMergeJob(SqlBaseJob): """Generates a list of sql statements that merge the data from staging dataset into destination dataset.""" @@ -186,6 +214,21 @@ def gen_insert_temp_table_sql( """ return [cls._to_temp_table(select_statement, temp_table_name)], temp_table_name + @classmethod + def gen_delete_from_sql( + cls, + table_name: str, + unique_column: str, + delete_temp_table_name: str, + temp_table_column: str, + ) -> str: + """Generate DELETE FROM statement deleting the records found in the deletes temp table.""" + return f"""DELETE FROM {table_name} + WHERE {unique_column} IN ( + SELECT * FROM {delete_temp_table_name} + ); + """ + @classmethod def _new_temp_table_name(cls, name_prefix: str) -> str: return f"{name_prefix}_{uniq_id()}" @@ -261,12 +304,9 @@ def gen_merge_sql( unique_column, key_table_clauses ) sql.extend(create_delete_temp_table_sql) - # delete top table - sql.append( - f"DELETE FROM {root_table_name} WHERE {unique_column} IN (SELECT * FROM" - f" {delete_temp_table_name});" - ) - # delete other tables + + # delete from child tables first. 
This is important for databricks which does not support temporary tables, + # but uses temporary views instead for table in table_chain[1:]: table_name = sql_client.make_qualified_table_name(table["name"]) root_key_columns = get_columns_names_with_prop(table, "root_key") @@ -281,15 +321,25 @@ def gen_merge_sql( ) root_key_column = sql_client.capabilities.escape_identifier(root_key_columns[0]) sql.append( - f"DELETE FROM {table_name} WHERE {root_key_column} IN (SELECT * FROM" - f" {delete_temp_table_name});" + cls.gen_delete_from_sql( + table_name, root_key_column, delete_temp_table_name, unique_column + ) + ) + + # delete from top table now that child tables have been processed + sql.append( + cls.gen_delete_from_sql( + root_table_name, unique_column, delete_temp_table_name, unique_column ) + ) + + # create temp table used to deduplicate, only when we have primary keys if primary_keys: - create_insert_temp_table_sql, insert_temp_table_name = ( - cls.gen_insert_temp_table_sql( - staging_root_table_name, primary_keys, unique_column - ) + ( + create_insert_temp_table_sql, + insert_temp_table_name, + ) = cls.gen_insert_temp_table_sql( + staging_root_table_name, primary_keys, unique_column ) sql.extend(create_insert_temp_table_sql) diff --git a/dlt/destinations/utils.py b/dlt/destinations/utils.py new file mode 100644 index 0000000000..d4b945a840 --- /dev/null +++ b/dlt/destinations/utils.py @@ -0,0 +1,16 @@ +from typing import Any + +from dlt.extract import DltResource, resource as make_resource + + +def ensure_resource(data: Any) -> DltResource: + """Wraps `data` in a DltResource if it's not a DltResource already.""" + resource: DltResource + if not isinstance(data, DltResource): + resource_name: str = None + if not hasattr(data, "__name__"): + resource_name = "content" + resource = make_resource(data, name=resource_name) + else: + resource = data + return resource diff --git a/dlt/extract/__init__.py b/dlt/extract/__init__.py index 9dcffdacb9..78e246cd46 100644 --- a/dlt/extract/__init__.py +++ b/dlt/extract/__init__.py @@ -1,4 +1,5 @@ -from dlt.extract.resource import DltResource, with_table_name +from dlt.extract.resource import DltResource, with_table_name, with_hints +from dlt.extract.hints import make_hints from dlt.extract.source import DltSource from dlt.extract.decorators import source, resource, transformer, defer from dlt.extract.incremental import Incremental @@ -8,6 +9,8 @@ "DltResource", "DltSource", "with_table_name", + "with_hints", + "make_hints", "source", "resource", "transformer", diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index cf7426e683..d86fd04ef4 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -37,6 +37,7 @@ TSchemaContract, TTableFormat, ) +from dlt.extract.hints import make_hints from dlt.extract.utils import ( ensure_table_schema_columns_hint, simulate_func_call, @@ -48,6 +49,7 @@ from dlt.common.typing import AnyFun, ParamSpec, Concatenate, TDataItem, TDataItems from dlt.common.utils import get_callable_name, get_module_name, is_inner_callable from dlt.extract.exceptions import ( + CurrentSourceNotAvailable, DynamicNameNotStandaloneResource, InvalidTransformerDataTypeGeneratorFunctionRequired, ResourceFunctionExpected, @@ -56,7 +58,7 @@ SourceIsAClassTypeError, ExplicitSourceNameInvalid, SourceNotAFunction, - SourceSchemaNotAvailable, + CurrentSourceSchemaNotAvailable, ) from dlt.extract.incremental import IncrementalResourceWrapper @@ -67,7 +69,7 @@ @configspec class
SourceSchemaInjectableContext(ContainerInjectableContext): - """A context containing the source schema, present when decorated function is executed""" + """A context containing the source schema, present when dlt.source/resource decorated function is executed""" schema: Schema @@ -78,6 +80,19 @@ class SourceSchemaInjectableContext(ContainerInjectableContext): def __init__(self, schema: Schema = None) -> None: ... +@configspec +class SourceInjectableContext(ContainerInjectableContext): + """A context containing the source schema, present when dlt.resource decorated function is executed""" + + source: DltSource + + can_create_default: ClassVar[bool] = False + + if TYPE_CHECKING: + + def __init__(self, source: DltSource = None) -> None: ... + + TSourceFunParams = ParamSpec("TSourceFunParams") TResourceFunParams = ParamSpec("TResourceFunParams") TDltSourceImpl = TypeVar("TDltSourceImpl", bound=DltSource, default=DltSource) @@ -395,7 +410,7 @@ def resource( def make_resource( _name: str, _section: str, _data: Any, incremental: IncrementalResourceWrapper = None ) -> DltResource: - table_template = DltResource.new_table_template( + table_template = make_hints( table_name, write_disposition=write_disposition, columns=columns, @@ -694,7 +709,15 @@ def get_source_schema() -> Schema: try: return Container()[SourceSchemaInjectableContext].schema except ContextDefaultCannotBeCreated: - raise SourceSchemaNotAvailable() + raise CurrentSourceSchemaNotAvailable() + + +def get_source() -> DltSource: + """When executed from the function decorated with @dlt.resource, returns currently extracted source""" + try: + return Container()[SourceInjectableContext].source + except ContextDefaultCannotBeCreated: + raise CurrentSourceNotAvailable() TBoundItems = TypeVar("TBoundItems", bound=TDataItems) diff --git a/dlt/extract/exceptions.py b/dlt/extract/exceptions.py index 8e7d0dddf8..de785865c5 100644 --- a/dlt/extract/exceptions.py +++ b/dlt/extract/exceptions.py @@ -88,7 +88,7 @@ def __init__(self, pipe_name: str, gen: Any) -> None: " extract method." ) msg += ( - "dlt will evaluate functions that were passed as data argument. If you passed a" + " dlt will evaluate functions that were passed as data argument. If you passed a" " function the returned data type is not iterable. 
" ) type_name = str(type(gen)) @@ -377,7 +377,7 @@ def __init__(self, source_name: str, _typ: Type[Any]) -> None: ) -class SourceSchemaNotAvailable(DltSourceException): +class CurrentSourceSchemaNotAvailable(DltSourceException): def __init__(self) -> None: super().__init__( "Current source schema is available only when called from a function decorated with" @@ -385,6 +385,14 @@ def __init__(self) -> None: ) +class CurrentSourceNotAvailable(DltSourceException): + def __init__(self) -> None: + super().__init__( + "Current source is available only when called from a function decorated with" + " dlt.resource or dlt.transformer during the extract step" + ) + + class ExplicitSourceNameInvalid(DltSourceException): def __init__(self, source_name: str, schema_name: str) -> None: self.source_name = source_name diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 9ff3cf872c..c1ff5da80b 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -31,7 +31,7 @@ from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.utils import get_callable_name, get_full_class_name -from dlt.extract.decorators import SourceSchemaInjectableContext +from dlt.extract.decorators import SourceInjectableContext, SourceSchemaInjectableContext from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints from dlt.extract.pipe import PipeIterator from dlt.extract.source import DltSource @@ -322,7 +322,9 @@ def extract( ) -> str: # generate load package to be able to commit all the sources together later load_id = self.extract_storage.create_load_package(source.discover_schema()) - with Container().injectable_context(SourceSchemaInjectableContext(source.schema)): + with Container().injectable_context( + SourceSchemaInjectableContext(source.schema) + ), Container().injectable_context(SourceInjectableContext(source)): # inject the config section with the current source name with inject_section( ConfigSectionContext( diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index bc32893677..f6c3fde5d4 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -18,7 +18,7 @@ TTableSchemaColumns, TPartialTableSchema, ) - +from dlt.extract.hints import HintsMeta from dlt.extract.resource import DltResource from dlt.extract.typing import TableNameMeta from dlt.extract.storage import ExtractStorage, ExtractorItemStorage @@ -85,6 +85,12 @@ def item_format(items: TDataItems) -> Optional[TLoaderFileFormat]: def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None: """Write `items` to `resource` optionally computing table schemas and revalidating/filtering data""" + if isinstance(meta, HintsMeta): + # update the resource with new hints, remove all caches so schema is recomputed + # and contracts re-applied + resource.merge_hints(meta.hints) + self._reset_contracts_cache() + if table_name := self._get_static_table_name(resource, meta): # write item belonging to table with static name self._write_to_static_table(resource, table_name, items) @@ -152,7 +158,7 @@ def _compute_and_update_table( self, resource: DltResource, table_name: str, items: TDataItems ) -> TDataItems: """ - Computes new table and does contract checks, if false is returned, the table may not be created and not items should be written + Computes new table and does contract checks, if false is returned, the table may not be created and no items should be written """ computed_table = self._compute_table(resource, items) # overwrite table name (if coming from meta) @@ 
-190,6 +196,12 @@ def _compute_and_update_table( filtered_columns[name] = mode return items + def _reset_contracts_cache(self) -> None: + """Removes all cached contracts, filtered columns and tables""" + self._table_contracts.clear() + self._filtered_tables.clear() + self._filtered_columns.clear() + class JsonLExtractor(Extractor): file_format = "puae-jsonl" diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index 437dbbc6bd..ec4bd56021 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -1,11 +1,10 @@ from copy import copy, deepcopy -from typing import List, TypedDict, cast, Any +from typing import List, TypedDict, cast, Any, Optional, Dict from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION, merge_columns, new_column, new_table from dlt.common.schema.typing import ( TColumnNames, TColumnProp, - TColumnSchema, TPartialTableSchema, TTableSchema, TTableSchemaColumns, @@ -23,7 +22,6 @@ from dlt.extract.exceptions import ( DataItemRequiredForDynamicTableHints, InconsistentTableTemplate, - TableNameMissing, ) from dlt.extract.utils import ensure_table_schema_columns, ensure_table_schema_columns_hint from dlt.extract.validation import create_item_validator @@ -40,10 +38,66 @@ class TResourceHints(TypedDict, total=False): merge_key: TTableHintTemplate[TColumnNames] incremental: Incremental[Any] schema_contract: TTableHintTemplate[TSchemaContract] + table_format: TTableHintTemplate[TTableFormat] validator: ValidateItem original_columns: TTableHintTemplate[TAnySchemaColumns] +class HintsMeta: + __slots__ = "hints" + + hints: TResourceHints + + def __init__(self, hints: TResourceHints) -> None: + self.hints = hints + + +def make_hints( + table_name: TTableHintTemplate[str] = None, + parent_table_name: TTableHintTemplate[str] = None, + write_disposition: TTableHintTemplate[TWriteDisposition] = None, + columns: TTableHintTemplate[TAnySchemaColumns] = None, + primary_key: TTableHintTemplate[TColumnNames] = None, + merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, + table_format: TTableHintTemplate[TTableFormat] = None, +) -> TResourceHints: + """A convenience function to create resource hints. Accepts both static and dynamic hints based on data. + + This method accepts the same table hints arguments as `dlt.resource` decorator. 
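    Illustrative usage (editor's sketch, not part of the patch; the table and column names are made up):

        hints = make_hints(table_name="orders", write_disposition="merge", primary_key="order_id")
        # the resulting TResourceHints can be attached to a single item with `with_hints(item, hints)`
        # or set on a resource with `resource.set_hints(hints)`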
+ """ + validator, schema_contract = create_item_validator(columns, schema_contract) + clean_columns = columns + if columns is not None: + clean_columns = ensure_table_schema_columns_hint(columns) + if not callable(clean_columns): + clean_columns = clean_columns.values() # type: ignore + # create a table schema template where hints can be functions taking TDataItem + new_template: TResourceHints = new_table( + table_name, # type: ignore + parent_table_name, # type: ignore + write_disposition=write_disposition, # type: ignore + columns=clean_columns, # type: ignore + schema_contract=schema_contract, # type: ignore + table_format=table_format, # type: ignore + ) + if not table_name: + new_template.pop("name") + # remember original columns + if columns is not None: + new_template["original_columns"] = columns + # always remove resource + new_template.pop("resource", None) # type: ignore + if primary_key is not None: + new_template["primary_key"] = primary_key + if merge_key is not None: + new_template["merge_key"] = merge_key + if validator: + new_template["validator"] = validator + DltResourceHints.validate_dynamic_hints(new_template) + return new_template + + class DltResourceHints: def __init__(self, table_schema_template: TResourceHints = None): self.__qualname__ = self.__name__ = self.name @@ -105,7 +159,11 @@ def compute_table_schema(self, item: TDataItem = None) -> TTableSchema: if self._table_name_hint_fun and item is None: raise DataItemRequiredForDynamicTableHints(self.name) # resolve - resolved_template: TResourceHints = {k: self._resolve_hint(item, v) for k, v in table_template.items() if k not in ["incremental", "validator", "original_columns"]} # type: ignore + resolved_template: TResourceHints = { + k: self._resolve_hint(item, v) + for k, v in table_template.items() + if k not in ["incremental", "validator", "original_columns"] + } # type: ignore table_schema = self._merge_keys(resolved_template) table_schema["resource"] = self.name validate_dict_ignoring_xkeys( @@ -125,6 +183,8 @@ def apply_hints( merge_key: TTableHintTemplate[TColumnNames] = None, incremental: Incremental[Any] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, + additional_table_hints: Optional[Dict[str, TTableHintTemplate[Any]]] = None, + table_format: TTableHintTemplate[TTableFormat] = None, ) -> None: """Creates or modifies existing table schema by setting provided hints. Accepts both static and dynamic hints based on data. 
@@ -141,7 +201,7 @@ def apply_hints( t = None if not self._hints: # if there's no template yet, create and set new one - t = self.new_table_template( + t = make_hints( table_name, parent_table_name, write_disposition, @@ -149,6 +209,7 @@ def apply_hints( primary_key, merge_key, schema_contract, + table_format, ) else: # set single hints @@ -201,13 +262,26 @@ def apply_hints( ) if schema_contract is not None: t["schema_contract"] = schema_contract + if table_format is not None: + if table_format: + t["table_format"] = table_format + else: + t.pop("table_format", None) - # set properties that cannot be passed to new_table_template + # set properties that cannot be passed to make_hints if incremental is not None: if incremental is Incremental.EMPTY: t["incremental"] = None else: t["incremental"] = incremental + if additional_table_hints is not None: + # loop through provided hints and add, overwrite, or remove them + for k, v in additional_table_hints.items(): + if v is not None: + t[k] = v # type: ignore[literal-required] + else: + t.pop(k, None) # type: ignore[misc] + self.set_hints(t) def set_hints(self, hints_template: TResourceHints) -> None: @@ -224,6 +298,19 @@ def set_hints(self, hints_template: TResourceHints) -> None: ) self._hints = hints_template + def merge_hints(self, hints_template: TResourceHints) -> None: + self.apply_hints( + table_name=hints_template.get("name"), + parent_table_name=hints_template.get("parent"), + write_disposition=hints_template.get("write_disposition"), + columns=hints_template.get("original_columns"), + primary_key=hints_template.get("primary_key"), + merge_key=hints_template.get("merge_key"), + incremental=hints_template.get("incremental"), + schema_contract=hints_template.get("schema_contract"), + table_format=hints_template.get("table_format"), + ) + @staticmethod def _clone_hints(hints_template: TResourceHints) -> TResourceHints: t_ = copy(hints_template) @@ -264,48 +351,6 @@ def _merge_keys(t_: TResourceHints) -> TPartialTableSchema: return partial - @staticmethod - def new_table_template( - table_name: TTableHintTemplate[str], - parent_table_name: TTableHintTemplate[str] = None, - write_disposition: TTableHintTemplate[TWriteDisposition] = None, - columns: TTableHintTemplate[TAnySchemaColumns] = None, - primary_key: TTableHintTemplate[TColumnNames] = None, - merge_key: TTableHintTemplate[TColumnNames] = None, - schema_contract: TTableHintTemplate[TSchemaContract] = None, - table_format: TTableHintTemplate[TTableFormat] = None, - ) -> TResourceHints: - validator, schema_contract = create_item_validator(columns, schema_contract) - clean_columns = columns - if columns is not None: - clean_columns = ensure_table_schema_columns_hint(columns) - if not callable(clean_columns): - clean_columns = clean_columns.values() # type: ignore - # create a table schema template where hints can be functions taking TDataItem - new_template: TResourceHints = new_table( - table_name, # type: ignore - parent_table_name, # type: ignore - write_disposition=write_disposition, # type: ignore - columns=clean_columns, # type: ignore - schema_contract=schema_contract, # type: ignore - table_format=table_format, # type: ignore - ) - if not table_name: - new_template.pop("name") - # remember original columns - if columns is not None: - new_template["original_columns"] = columns - # always remove resource - new_template.pop("resource", None) # type: ignore - if primary_key: - new_template["primary_key"] = primary_key - if merge_key: - new_template["merge_key"] = merge_key - if 
validator: - new_template["validator"] = validator - DltResourceHints.validate_dynamic_hints(new_template) - return new_template - @staticmethod def validate_dynamic_hints(template: TResourceHints) -> None: table_name = template.get("name") diff --git a/dlt/extract/pipe.py b/dlt/extract/pipe.py index 6f02f882bc..3062ed083d 100644 --- a/dlt/extract/pipe.py +++ b/dlt/extract/pipe.py @@ -55,6 +55,7 @@ simulate_func_call, wrap_compat_transformer, wrap_resource_gen, + wrap_async_iterator, ) if TYPE_CHECKING: @@ -108,7 +109,7 @@ class SourcePipeItem(NamedTuple): Callable[[TDataItems], Iterator[ResolvablePipeItem]], ] -TPipeNextItemMode = Union[Literal["fifo"], Literal["round_robin"]] +TPipeNextItemMode = Literal["fifo", "round_robin"] class ForkPipe: @@ -321,6 +322,10 @@ def evaluate_gen(self) -> None: # verify if transformer can be called self._ensure_transform_step(self._gen_idx, gen) + # wrap async generator + if isinstance(self.gen, AsyncIterator): + self.replace_gen(wrap_async_iterator(self.gen)) + # evaluate transforms for step_no, step in enumerate(self._steps): # print(f"pipe {self.name} step no {step_no} step({step})") @@ -366,9 +371,10 @@ def _wrap_gen(self, *args: Any, **kwargs: Any) -> Any: def _verify_head_step(self, step: TPipeStep) -> None: # first element must be Iterable, Iterator or Callable in resource pipe - if not isinstance(step, (Iterable, Iterator)) and not callable(step): + if not isinstance(step, (Iterable, Iterator, AsyncIterator)) and not callable(step): raise CreatePipeException( - self.name, "A head of a resource pipe must be Iterable, Iterator or a Callable" + self.name, + "A head of a resource pipe must be Iterable, Iterator, AsyncIterator or a Callable", ) def _wrap_transform_step_meta(self, step_no: int, step: TPipeStep) -> TPipeStep: @@ -498,20 +504,20 @@ def __init__( max_parallel_items: int, workers: int, futures_poll_interval: float, + sources: List[SourcePipeItem], next_item_mode: TPipeNextItemMode, ) -> None: self.max_parallel_items = max_parallel_items self.workers = workers self.futures_poll_interval = futures_poll_interval - - self._round_robin_index: int = -1 - self._initial_sources_count: int = 0 self._async_pool: asyncio.AbstractEventLoop = None self._async_pool_thread: Thread = None self._thread_pool: ThreadPoolExecutor = None - self._sources: List[SourcePipeItem] = [] + self._sources = sources self._futures: List[FuturePipeItem] = [] - self._next_item_mode = next_item_mode + self._next_item_mode: TPipeNextItemMode = next_item_mode + self._initial_sources_count = len(sources) + self._current_source_index: int = 0 @classmethod @with_config(spec=PipeIteratorConfiguration) @@ -533,12 +539,10 @@ def from_pipe( pipe.evaluate_gen() if not isinstance(pipe.gen, Iterator): raise PipeGenInvalid(pipe.name, pipe.gen) + # create extractor - extract = cls(max_parallel_items, workers, futures_poll_interval, next_item_mode) - # add as first source - extract._sources.append(SourcePipeItem(pipe.gen, 0, pipe, None)) - cls._initial_sources_count = 1 - return extract + sources = [SourcePipeItem(pipe.gen, 0, pipe, None)] + return cls(max_parallel_items, workers, futures_poll_interval, sources, next_item_mode) @classmethod @with_config(spec=PipeIteratorConfiguration) @@ -554,7 +558,8 @@ def from_pipes( next_item_mode: TPipeNextItemMode = "fifo", ) -> "PipeIterator": # print(f"max_parallel_items: {max_parallel_items} workers: {workers}") - extract = cls(max_parallel_items, workers, futures_poll_interval, next_item_mode) + sources: List[SourcePipeItem] = [] + # clone all 
pipes before iterating (recursively) as we will fork them (this add steps) and evaluate gens pipes, _ = PipeIterator.clone_pipes(pipes) @@ -574,18 +579,16 @@ def _fork_pipeline(pipe: Pipe) -> None: if not isinstance(pipe.gen, Iterator): raise PipeGenInvalid(pipe.name, pipe.gen) # add every head as source only once - if not any(i.pipe == pipe for i in extract._sources): - extract._sources.append(SourcePipeItem(pipe.gen, 0, pipe, None)) + if not any(i.pipe == pipe for i in sources): + sources.append(SourcePipeItem(pipe.gen, 0, pipe, None)) # reverse pipes for current mode, as we start processing from the back - if next_item_mode == "fifo": - pipes.reverse() + pipes.reverse() for pipe in pipes: _fork_pipeline(pipe) - extract._initial_sources_count = len(extract._sources) - - return extract + # create extractor + return cls(max_parallel_items, workers, futures_poll_interval, sources, next_item_mode) def __next__(self) -> PipeItem: pipe_item: Union[ResolvablePipeItem, SourcePipeItem] = None @@ -619,6 +622,16 @@ def __next__(self) -> PipeItem: pipe_item = None continue + # handle async iterator items as new source + if isinstance(item, AsyncIterator): + self._sources.append( + SourcePipeItem( + wrap_async_iterator(item), pipe_item.step, pipe_item.pipe, pipe_item.meta + ), + ) + pipe_item = None + continue + if isinstance(item, Awaitable) or callable(item): # do we have a free slot or one of the slots is done? if len(self._futures) < self.max_parallel_items or self._next_future() >= 0: @@ -689,20 +702,25 @@ def close(self) -> None: def stop_background_loop(loop: asyncio.AbstractEventLoop) -> None: loop.stop() - # stop all futures - for f, _, _, _ in self._futures: - if not f.done(): - f.cancel() - self._futures.clear() - # close all generators for gen, _, _, _ in self._sources: if inspect.isgenerator(gen): gen.close() self._sources.clear() - # print("stopping loop") + # stop all futures + for f, _, _, _ in self._futures: + if not f.done(): + f.cancel() + + # let tasks cancel if self._async_pool: + # wait for all async generators to be closed + future = asyncio.run_coroutine_threadsafe( + self._async_pool.shutdown_asyncgens(), self._ensure_async_pool() + ) + while not future.done(): + sleep(self.futures_poll_interval) self._async_pool.call_soon_threadsafe(stop_background_loop, self._async_pool) # print("joining thread") self._async_pool_thread.join() @@ -712,6 +730,8 @@ def stop_background_loop(loop: asyncio.AbstractEventLoop) -> None: self._thread_pool.shutdown(wait=True) self._thread_pool = None + self._futures.clear() + def _ensure_async_pool(self) -> asyncio.AbstractEventLoop: # lazily create async pool is separate thread if self._async_pool: @@ -773,6 +793,8 @@ def _resolve_futures(self) -> ResolvablePipeItem: if future.exception(): ex = future.exception() + if isinstance(ex, StopAsyncIteration): + return None if isinstance( ex, (PipelineException, ExtractorException, DltSourceException, PipeException) ): @@ -780,84 +802,61 @@ def _resolve_futures(self) -> ResolvablePipeItem: raise ResourceExtractionError(pipe.name, future, str(ex), "future") from ex item = future.result() - if isinstance(item, DataItemWithMeta): + + # we also interpret future items that are None to not be value to be consumed + if item is None: + return None + elif isinstance(item, DataItemWithMeta): return ResolvablePipeItem(item.data, step, pipe, item.meta) else: return ResolvablePipeItem(item, step, pipe, meta) def _get_source_item(self) -> ResolvablePipeItem: - if self._next_item_mode == "fifo": - return 
self._get_source_item_current() - elif self._next_item_mode == "round_robin": - return self._get_source_item_round_robin() - - def _get_source_item_current(self) -> ResolvablePipeItem: - # no more sources to iterate - if len(self._sources) == 0: - return None - try: - # get items from last added iterator, this makes the overall Pipe as close to FIFO as possible - gen, step, pipe, meta = self._sources[-1] - # print(f"got {pipe.name}") - # register current pipe name during the execution of gen - set_current_pipe_name(pipe.name) - item = None - while item is None: - item = next(gen) - # full pipe item may be returned, this is used by ForkPipe step - # to redirect execution of an item to another pipe - if isinstance(item, ResolvablePipeItem): - return item - else: - # keep the item assigned step and pipe when creating resolvable item - if isinstance(item, DataItemWithMeta): - return ResolvablePipeItem(item.data, step, pipe, item.meta) - else: - return ResolvablePipeItem(item, step, pipe, meta) - except StopIteration: - # remove empty iterator and try another source - self._sources.pop() - return self._get_source_item() - except (PipelineException, ExtractorException, DltSourceException, PipeException): - raise - except Exception as ex: - raise ResourceExtractionError(pipe.name, gen, str(ex), "generator") from ex - - def _get_source_item_round_robin(self) -> ResolvablePipeItem: sources_count = len(self._sources) # no more sources to iterate if sources_count == 0: return None - # if there are currently more sources than added initially, we need to process the new ones first - if sources_count > self._initial_sources_count: - return self._get_source_item_current() try: - # print(f"got {pipe.name}") - # register current pipe name during the execution of gen - item = None - while item is None: - self._round_robin_index = (self._round_robin_index + 1) % sources_count - gen, step, pipe, meta = self._sources[self._round_robin_index] - set_current_pipe_name(pipe.name) - item = next(gen) - # full pipe item may be returned, this is used by ForkPipe step - # to redirect execution of an item to another pipe - if isinstance(item, ResolvablePipeItem): - return item + first_evaluated_index: int = None + # always reset to end of list for fifo mode, also take into account that new sources can be added + # if too many new sources is added we switch to fifo not to exhaust them + if ( + self._next_item_mode == "fifo" + or (sources_count - self._initial_sources_count) >= self.max_parallel_items + ): + self._current_source_index = sources_count - 1 else: - # keep the item assigned step and pipe when creating resolvable item - if isinstance(item, DataItemWithMeta): - return ResolvablePipeItem(item.data, step, pipe, item.meta) - else: - return ResolvablePipeItem(item, step, pipe, meta) + self._current_source_index = (self._current_source_index - 1) % sources_count + while True: + # if we have checked all sources once and all returned None, then we can sleep a bit + if self._current_source_index == first_evaluated_index: + sleep(self.futures_poll_interval) + # get next item from the current source + gen, step, pipe, meta = self._sources[self._current_source_index] + set_current_pipe_name(pipe.name) + if (item := next(gen)) is not None: + # full pipe item may be returned, this is used by ForkPipe step + # to redirect execution of an item to another pipe + if isinstance(item, ResolvablePipeItem): + return item + else: + # keep the item assigned step and pipe when creating resolvable item + if isinstance(item, 
DataItemWithMeta): + return ResolvablePipeItem(item.data, step, pipe, item.meta) + else: + return ResolvablePipeItem(item, step, pipe, meta) + # remember the first evaluated index + if first_evaluated_index is None: + first_evaluated_index = self._current_source_index + # always go round robin if None was returned + self._current_source_index = (self._current_source_index - 1) % sources_count except StopIteration: # remove empty iterator and try another source - self._sources.pop(self._round_robin_index) - # we need to decrease the index to keep the round robin order - self._round_robin_index -= 1 - # since in this case we have popped an initial source, we need to decrease the initial sources count - self._initial_sources_count -= 1 - return self._get_source_item_round_robin() + self._sources.pop(self._current_source_index) + # decrease initial source count if we popped an initial source + if self._current_source_index < self._initial_sources_count: + self._initial_sources_count -= 1 + return self._get_source_item() except (PipelineException, ExtractorException, DltSourceException, PipeException): raise except Exception as ex: diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index 93c23e05a8..3d03486436 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -24,6 +24,7 @@ pipeline_state, ) from dlt.common.utils import flatten_list_or_items, get_callable_name, uniq_id +from dlt.extract.utils import wrap_async_iterator from dlt.extract.typing import ( DataItemWithMeta, @@ -36,7 +37,7 @@ ValidateItem, ) from dlt.extract.pipe import Pipe, ManagedPipeIterator, TPipeStep -from dlt.extract.hints import DltResourceHints, TResourceHints +from dlt.extract.hints import DltResourceHints, HintsMeta, TResourceHints from dlt.extract.incremental import Incremental, IncrementalResourceWrapper from dlt.extract.exceptions import ( InvalidTransformerDataTypeGeneratorFunctionRequired, @@ -45,7 +46,6 @@ InvalidResourceDataType, InvalidResourceDataTypeIsNone, InvalidTransformerGeneratorFunction, - InvalidResourceDataTypeAsync, InvalidResourceDataTypeBasic, InvalidResourceDataTypeMultiplePipes, ParametrizedResourceUnbound, @@ -60,6 +60,15 @@ def with_table_name(item: TDataItems, table_name: str) -> DataItemWithMeta: return DataItemWithMeta(TableNameMeta(table_name), item) +def with_hints(item: TDataItems, hints: TResourceHints) -> DataItemWithMeta: + """Marks `item` to update the resource with specified `hints`. + + Create `TResourceHints` with `make_hints`. + Setting `table_name` will dispatch the `item` to a specified table, like `with_table_name` + """ + return DataItemWithMeta(HintsMeta(hints), item) + + class DltResource(Iterable[TDataItem], DltResourceHints): """Implements dlt resource. 
Contains a data pipe that wraps a generating item and table schema that can be adjusted""" @@ -123,8 +132,6 @@ def from_data( data = wrap_additional_type(data) # several iterable types are not allowed and must be excluded right away - if isinstance(data, (AsyncIterator, AsyncIterable)): - raise InvalidResourceDataTypeAsync(name, data, type(data)) if isinstance(data, (str, dict)): raise InvalidResourceDataTypeBasic(name, data, type(data)) @@ -135,7 +142,7 @@ def from_data( parent_pipe = DltResource._get_parent_pipe(name, data_from) # create resource from iterator, iterable or generator function - if isinstance(data, (Iterable, Iterator)) or callable(data): + if isinstance(data, (Iterable, Iterator, AsyncIterable)) or callable(data): pipe = Pipe.from_data(name, data, parent=parent_pipe) return cls( pipe, @@ -306,16 +313,26 @@ def add_limit(self, max_items: int) -> "DltResource": # noqa: A003 def _gen_wrap(gen: TPipeStep) -> TPipeStep: """Wrap a generator to take the first `max_items` records""" - nonlocal max_items count = 0 + is_async_gen = False if inspect.isfunction(gen): gen = gen() + + # wrap async gen already here + if isinstance(gen, AsyncIterator): + gen = wrap_async_iterator(gen) + is_async_gen = True + try: for i in gen: # type: ignore # TODO: help me fix this later yield i - count += 1 - if count == max_items: - return + if i is not None: + count += 1 + # async gen yields awaitable so we must count one awaitable more + # so the previous one is evaluated and yielded. + # new awaitable will be cancelled + if count == max_items + int(is_async_gen): + return finally: if inspect.isgenerator(gen): gen.close() diff --git a/dlt/extract/source.py b/dlt/extract/source.py index b1f59f7bda..bc33394d4d 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -25,7 +25,7 @@ from dlt.extract.typing import TDecompositionStrategy from dlt.extract.pipe import Pipe, ManagedPipeIterator -from dlt.extract.hints import DltResourceHints +from dlt.extract.hints import DltResourceHints, make_hints from dlt.extract.resource import DltResource from dlt.extract.exceptions import ( DataItemRequiredForDynamicTableHints, @@ -64,7 +64,7 @@ def extracted(self) -> Dict[str, DltResource]: resource = self[pipe.name] except KeyError: # resource for pipe not found: return mock resource - mock_template = DltResourceHints.new_table_template( + mock_template = make_hints( pipe.name, write_disposition=resource.write_disposition ) resource = DltResource(pipe, mock_template, False, section=resource.section) diff --git a/dlt/extract/utils.py b/dlt/extract/utils.py index 1db18ff47e..0e86994eb4 100644 --- a/dlt/extract/utils.py +++ b/dlt/extract/utils.py @@ -1,6 +1,19 @@ import inspect import makefun -from typing import Optional, Tuple, Union, List, Any, Sequence, cast +import asyncio +from typing import ( + Optional, + Tuple, + Union, + List, + Any, + Sequence, + cast, + AsyncIterator, + AsyncGenerator, + Awaitable, + Generator, +) from collections.abc import Mapping as C_Mapping from dlt.common.exceptions import MissingDependencyException @@ -119,6 +132,45 @@ def check_compat_transformer(name: str, f: AnyFun, sig: inspect.Signature) -> in return meta_arg +def wrap_async_iterator( + gen: AsyncIterator[TDataItems], +) -> Generator[Awaitable[TDataItems], None, None]: + """Wraps an async generator into a list of awaitables""" + exhausted = False + busy = False + + # creates an awaitable that will return the next item from the async generator + async def run() -> TDataItems: + nonlocal exhausted + try: + # if marked 
exhausted by the main thread and we are wrapping a generator + # we can close it here + if exhausted: + raise StopAsyncIteration() + item = await gen.__anext__() + return item + # on stop iteration mark as exhausted + # also called when futures are cancelled + except StopAsyncIteration: + exhausted = True + raise + finally: + nonlocal busy + busy = False + + # this generator yields None while the async generator is not exhausted + try: + while not exhausted: + while busy: + yield None + busy = True + yield run() + # this gets called from the main thread when the wrapping generater is closed + except GeneratorExit: + # mark as exhausted + exhausted = True + + def wrap_compat_transformer( name: str, f: AnyFun, sig: inspect.Signature, *args: Any, **kwargs: Any ) -> AnyFun: @@ -142,8 +194,12 @@ def wrap_resource_gen( name: str, f: AnyFun, sig: inspect.Signature, *args: Any, **kwargs: Any ) -> AnyFun: """Wraps a generator or generator function so it is evaluated on extraction""" - if inspect.isgeneratorfunction(inspect.unwrap(f)) or inspect.isgenerator(f): - # always wrap generators and generator functions. evaluate only at runtime! + + if ( + inspect.isgeneratorfunction(inspect.unwrap(f)) + or inspect.isgenerator(f) + or inspect.isasyncgenfunction(f) + ): def _partial() -> Any: # print(f"_PARTIAL: {args} {kwargs}") diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py index c72118cfc9..1391a26562 100644 --- a/dlt/helpers/airflow_helper.py +++ b/dlt/helpers/airflow_helper.py @@ -25,7 +25,8 @@ import dlt from dlt.common import logger -from dlt.common.schema.typing import TWriteDisposition +from dlt.common.data_writers import TLoaderFileFormat +from dlt.common.schema.typing import TWriteDisposition, TSchemaContract from dlt.common.utils import uniq_id from dlt.common.configuration.container import Container from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext @@ -137,6 +138,8 @@ def add_run( decompose: Literal["none", "serialize"] = "none", table_name: str = None, write_disposition: TWriteDisposition = None, + loader_file_format: TLoaderFileFormat = None, + schema_contract: TSchemaContract = None, **kwargs: Any, ) -> List[PythonOperator]: """Creates a task or a group of tasks to run `data` with `pipeline` @@ -151,10 +154,13 @@ def add_run( Args: pipeline (Pipeline): An instance of pipeline used to run the source data (Any): Any data supported by `run` method of the pipeline - decompose (Literal["none", "serialize"], optional): A source decomposition strategy into Airflow tasks. Defaults to "none". + decompose (Literal["none", "serialize"], optional): A source decomposition strategy into Airflow tasks. Defaults to "none". table_name: (str): The name of the table to which the data should be loaded within the `dataset` write_disposition (TWriteDisposition, optional): Same as in `run` command. Defaults to None. - + loader_file_format (Literal["jsonl", "insert_values", "parquet"], optional): The file format the loader will use to create the load package. + Not all file_formats are compatible with all destinations. Defaults to the preferred file format of the selected destination. + schema_contract (TSchemaContract, optional): On override for the schema contract settings, + this will replace the schema contract settings for all tables in the schema. Defaults to None. 
Returns: Any: Airflow tasks created in order of creation """ @@ -232,17 +238,27 @@ def log_after_attempt(retry_state: RetryCallState) -> None: % attempt.retry_state.attempt_number ) load_info = task_pipeline.run( - data, table_name=table_name, write_disposition=write_disposition + data, + table_name=table_name, + write_disposition=write_disposition, + loader_file_format=loader_file_format, + schema_contract=schema_contract, ) logger.info(str(load_info)) # save load and trace if self.save_load_info: logger.info("Saving the load info in the destination") - task_pipeline.run([load_info], table_name="_load_info") + task_pipeline.run( + [load_info], + table_name="_load_info", + loader_file_format=loader_file_format, + ) if self.save_trace_info: logger.info("Saving the trace in the destination") task_pipeline.run( - [task_pipeline.last_trace], table_name="_trace" + [task_pipeline.last_trace], + table_name="_trace", + loader_file_format=loader_file_format, ) # raise on failed jobs if requested if self.fail_task_if_any_job_failed: diff --git a/dlt/helpers/dbt/profiles.yml b/dlt/helpers/dbt/profiles.yml index 2414222cbd..8f2ad22585 100644 --- a/dlt/helpers/dbt/profiles.yml +++ b/dlt/helpers/dbt/profiles.yml @@ -141,4 +141,33 @@ athena: schema: "{{ var('destination_dataset_name', var('source_dataset_name')) }}" database: "{{ env_var('DLT__AWS_DATA_CATALOG') }}" # aws_profile_name: "{{ env_var('DLT__CREDENTIALS__PROFILE_NAME', '') }}" - work_group: "{{ env_var('DLT__ATHENA_WORK_GROUP', '') }}" \ No newline at end of file + work_group: "{{ env_var('DLT__ATHENA_WORK_GROUP', '') }}" + + +# commented out because dbt for Synapse isn't currently properly supported. +# Leave config here for potential future use. +# synapse: +# target: analytics +# outputs: +# analytics: +# type: synapse +# driver: "{{ env_var('DLT__CREDENTIALS__DRIVER') }}" +# server: "{{ env_var('DLT__CREDENTIALS__HOST') }}" +# port: "{{ env_var('DLT__CREDENTIALS__PORT') | as_number }}" +# database: "{{ env_var('DLT__CREDENTIALS__DATABASE') }}" +# schema: "{{ var('destination_dataset_name', var('source_dataset_name')) }}" +# user: "{{ env_var('DLT__CREDENTIALS__USERNAME') }}" +# password: "{{ env_var('DLT__CREDENTIALS__PASSWORD') }}" + + +databricks: + target: analytics + outputs: + analytics: + type: databricks + catalog: "{{ env_var('DLT__CREDENTIALS__CATALOG') }}" + schema: "{{ var('destination_dataset_name', var('source_dataset_name')) }}" + host: "{{ env_var('DLT__CREDENTIALS__SERVER_HOSTNAME') }}" + http_path: "{{ env_var('DLT__CREDENTIALS__HTTP_PATH') }}" + token: "{{ env_var('DLT__CREDENTIALS__ACCESS_TOKEN') }}" + threads: 4 diff --git a/dlt/pipeline/current.py b/dlt/pipeline/current.py index f915a30932..7fdc0f095c 100644 --- a/dlt/pipeline/current.py +++ b/dlt/pipeline/current.py @@ -1,11 +1,14 @@ """Easy access to active pipelines, state, sources and schemas""" -from dlt.common.pipeline import source_state as _state, resource_state +from dlt.common.pipeline import source_state as _state, resource_state, get_current_pipe_name from dlt.pipeline import pipeline as _pipeline -from dlt.extract.decorators import get_source_schema +from dlt.extract.decorators import get_source_schema, get_source pipeline = _pipeline """Alias for dlt.pipeline""" state = source_state = _state """Alias for dlt.state""" source_schema = get_source_schema +source = get_source +pipe_name = get_current_pipe_name +resource_name = get_current_pipe_name diff --git a/dlt/pipeline/mark.py b/dlt/pipeline/mark.py index 3b9b3ccfc7..0aba0e19ae 100644 --- 
a/dlt/pipeline/mark.py +++ b/dlt/pipeline/mark.py @@ -1,2 +1,2 @@ -"""Module with market functions that make data to be specially processed""" -from dlt.extract import with_table_name +"""Module with mark functions that make data to be specially processed""" +from dlt.extract import with_table_name, with_hints, make_hints diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 73c8f076d1..3fa8da6aee 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -1163,9 +1163,9 @@ def _set_context(self, is_active: bool) -> None: # set destination context on activation if self.destination: # inject capabilities context - self._container[DestinationCapabilitiesContext] = ( - self._get_destination_capabilities() - ) + self._container[ + DestinationCapabilitiesContext + ] = self._get_destination_capabilities() else: # remove destination context on deactivation if DestinationCapabilitiesContext in self._container: diff --git a/docs/examples/chess_production/chess.py b/docs/examples/chess_production/chess.py index 5b767f0eb6..f7c5849e57 100644 --- a/docs/examples/chess_production/chess.py +++ b/docs/examples/chess_production/chess.py @@ -74,12 +74,12 @@ def load_data_with_retry(pipeline, data): load_info = pipeline.run(data) logger.info(str(load_info)) - # raise on failed jobs - load_info.raise_on_failed_jobs() - # send notification - send_slack_message( - pipeline.runtime_config.slack_incoming_hook, "Data was successfully loaded!" - ) + # raise on failed jobs + load_info.raise_on_failed_jobs() + # send notification + send_slack_message( + pipeline.runtime_config.slack_incoming_hook, "Data was successfully loaded!" + ) except Exception: # we get here after all the failed retries # send notification diff --git a/docs/examples/connector_x_arrow/load_arrow.py b/docs/examples/connector_x_arrow/load_arrow.py index b3c654cef9..307e657514 100644 --- a/docs/examples/connector_x_arrow/load_arrow.py +++ b/docs/examples/connector_x_arrow/load_arrow.py @@ -19,13 +19,13 @@ def read_sql_x( def genome_resource(): # create genome resource with merge on `upid` primary key genome = dlt.resource( - name="genome", + name="acanthochromis_polyacanthus", write_disposition="merge", - primary_key="upid", + primary_key="analysis_id", standalone=True, )(read_sql_x)( - "mysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", # type: ignore[arg-type] - "SELECT * FROM genome ORDER BY created LIMIT 1000", + "mysql://anonymous@ensembldb.ensembl.org:3306/acanthochromis_polyacanthus_core_100_1", # type: ignore[arg-type] + "SELECT * FROM analysis LIMIT 20", ) # add incremental on created at genome.apply_hints(incremental=dlt.sources.incremental("created")) diff --git a/docs/examples/google_sheets/.dlt/config.toml b/docs/examples/google_sheets/.dlt/config.toml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/google_sheets/.dlt/example.secrets.toml b/docs/examples/google_sheets/.dlt/example.secrets.toml new file mode 100644 index 0000000000..42feceddfc --- /dev/null +++ b/docs/examples/google_sheets/.dlt/example.secrets.toml @@ -0,0 +1,16 @@ +# you can just paste services.json as credentials +[sources.google_sheets] +credentials=''' +{ +"type": "set me up!", +"project_id": "set me up!", +"private_key_id": "set me up!", +"private_key": "set me up!", +"client_email": "set me up!", +"client_id": "set me up!", +"auth_uri": "https://accounts.google.com/o/oauth2/auth", +"token_uri": "https://oauth2.googleapis.com/token", +"auth_provider_x509_cert_url": 
"https://www.googleapis.com/oauth2/v1/certs", +"client_x509_cert_url": "set me up!" +} +''' \ No newline at end of file diff --git a/docs/examples/google_sheets/__init__.py b/docs/examples/google_sheets/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/google_sheets/google_sheets.py b/docs/examples/google_sheets/google_sheets.py new file mode 100644 index 0000000000..1ba330e4ca --- /dev/null +++ b/docs/examples/google_sheets/google_sheets.py @@ -0,0 +1,73 @@ +from typing import Any, Iterator, Sequence, Union, cast + +from googleapiclient.discovery import build + +import dlt +from dlt.common.configuration.specs import ( + GcpOAuthCredentials, + GcpServiceAccountCredentials, +) +from dlt.common.typing import DictStrAny, StrAny + + +def _initialize_sheets( + credentials: Union[GcpOAuthCredentials, GcpServiceAccountCredentials] +) -> Any: + # Build the service object. + service = build("sheets", "v4", credentials=credentials.to_native_credentials()) + return service + + +@dlt.source +def google_spreadsheet( + spreadsheet_id: str, + sheet_names: Sequence[str], + credentials: Union[ + GcpServiceAccountCredentials, GcpOAuthCredentials, str, StrAny + ] = dlt.secrets.value, +) -> Any: + sheets = _initialize_sheets(cast(GcpServiceAccountCredentials, credentials)) + + def get_sheet(sheet_name: str) -> Iterator[DictStrAny]: + # get list of list of typed values + result = ( + sheets.spreadsheets() + .values() + .get( + spreadsheetId=spreadsheet_id, + range=sheet_name, + # unformatted returns typed values + valueRenderOption="UNFORMATTED_VALUE", + # will return formatted dates + dateTimeRenderOption="FORMATTED_STRING", + ) + .execute() + ) + + # pprint.pprint(result) + values = result.get("values") + + # yield dicts assuming row 0 contains headers and following rows values and all rows have identical length + for v in values[1:]: + yield {h: v for h, v in zip(values[0], v)} + + # create resources from supplied sheet names + return [ + dlt.resource(get_sheet(name), name=name, write_disposition="replace") + for name in sheet_names + ] + + +if __name__ == "__main__": + pipeline = dlt.pipeline(destination="duckdb") + # see example.secrets.toml to where to put credentials + sheet_id = "1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580" + range_names = ["hidden_columns_merged_cells", "Blank Columns"] + # "2022-05", "model_metadata" + info = pipeline.run( + google_spreadsheet( + spreadsheet_id=sheet_id, + sheet_names=range_names, + ) + ) + print(info) diff --git a/docs/examples/pdf_to_weaviate/__init__.py b/docs/examples/pdf_to_weaviate/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py b/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py new file mode 100644 index 0000000000..e7f57853ed --- /dev/null +++ b/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py @@ -0,0 +1,57 @@ +import os + +import dlt +from dlt.destinations.impl.weaviate import weaviate_adapter +from PyPDF2 import PdfReader + + +@dlt.resource(selected=False) +def list_files(folder_path: str): + folder_path = os.path.abspath(folder_path) + for filename in os.listdir(folder_path): + file_path = os.path.join(folder_path, filename) + yield { + "file_name": filename, + "file_path": file_path, + "mtime": os.path.getmtime(file_path), + } + + +@dlt.transformer(primary_key="page_id", write_disposition="merge") +def pdf_to_text(file_item, separate_pages: bool = False): + if not separate_pages: + raise NotImplementedError() + # extract data from PDF page 
by page + reader = PdfReader(file_item["file_path"]) + for page_no in range(len(reader.pages)): + # add page content to file item + page_item = dict(file_item) + page_item["text"] = reader.pages[page_no].extract_text() + page_item["page_id"] = file_item["file_name"] + "_" + str(page_no) + yield page_item + + +pipeline = dlt.pipeline(pipeline_name="pdf_to_text", destination="weaviate") + +# this constructs a simple pipeline that: (1) reads files from "invoices" folder (2) filters only those ending with ".pdf" +# (3) sends them to pdf_to_text transformer with pipe (|) operator +pdf_pipeline = list_files("assets/invoices").add_filter( + lambda item: item["file_name"].endswith(".pdf") +) | pdf_to_text(separate_pages=True) + +# set the name of the destination table to receive pages +# NOTE: Weaviate, dlt's tables are mapped to classes +pdf_pipeline.table_name = "InvoiceText" + +# use weaviate_adapter to tell destination to vectorize "text" column +load_info = pipeline.run(weaviate_adapter(pdf_pipeline, vectorize="text")) +row_counts = pipeline.last_trace.last_normalize_info +print(row_counts) +print("------") +print(load_info) + +import weaviate + +client = weaviate.Client("http://localhost:8080") +# get text of all the invoices in InvoiceText class we just created above +print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do()) diff --git a/docs/website/docs/.dlt/.gitignore b/docs/website/docs/.dlt/.gitignore new file mode 100644 index 0000000000..da95bc542a --- /dev/null +++ b/docs/website/docs/.dlt/.gitignore @@ -0,0 +1 @@ +/secrets.toml diff --git a/docs/website/docs/conftest.py b/docs/website/docs/conftest.py index 49391c0ebe..87ccffe53b 100644 --- a/docs/website/docs/conftest.py +++ b/docs/website/docs/conftest.py @@ -52,6 +52,6 @@ def _initial_providers(): def pytest_configure(config): # push sentry to ci - os.environ[ - "RUNTIME__SENTRY_DSN" - ] = "https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752" + os.environ["RUNTIME__SENTRY_DSN"] = ( + "https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752" + ) diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md index c275c23941..a2f57538ba 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/athena.md +++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md @@ -92,7 +92,7 @@ athena_work_group="my_workgroup" ## Data loading -Data loading happens by storing parquet files in an s3 bucket and defining a schema on athena. If you query data via sql queries on athena, the returned data is read by +Data loading happens by storing parquet files in an s3 bucket and defining a schema on athena. If you query data via SQL queries on athena, the returned data is read by scanning your bucket and reading all relevant parquet files in there. `dlt` internal tables are saved as Iceberg tables. diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md index 9b34450c12..f44aab20b7 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md +++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md @@ -118,15 +118,25 @@ When staging is enabled: > ❗ **Bigquery cannot load JSON columns from `parquet` files**. `dlt` will fail such jobs permanently. Switch to `jsonl` to load and parse JSON properly. 
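For instance, a minimal sketch of switching the load format at run time (the pipeline, dataset and table names below are placeholders, and the staging bucket is assumed to be configured as described in the staging section):

```python
import dlt

# a record with a nested (JSON) field that would fail via parquet staging
data = [{"id": 1, "payload": {"status": "ok", "tags": ["a", "b"]}}]

pipeline = dlt.pipeline(
    pipeline_name="my_bigquery_pipeline",  # placeholder name
    destination="bigquery",
    staging="filesystem",  # assumes a gcs bucket configured as described below
    dataset_name="my_dataset",
)
# "jsonl" sidesteps the parquet JSON-column limitation described above
load_info = pipeline.run(data, table_name="events", loader_file_format="jsonl")
print(load_info)
```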
## Supported column hints + BigQuery supports the following [column hints](https://dlthub.com/docs/general-usage/schema#tables-and-columns): -* `partition` - creates a partition with a day granularity on decorated column (`PARTITION BY DATE`). May be used with `datetime`, `date` data types and `bigint` and `double` if they contain valid UNIX timestamps. Only one column per table is supported and only when a new table is created. -* `cluster` - creates a cluster column(s). Many column per table are supported and only when a new table is created. + +* `partition` - creates a partition with a day granularity on decorated column (`PARTITION BY DATE`). + May be used with `datetime`, `date` and `bigint` data types. + Only one column per table is supported and only when a new table is created. + For more information on BigQuery partitioning, read the [official docs](https://cloud.google.com/bigquery/docs/partitioned-tables). + + > ❗ `bigint` maps to BigQuery's **INT64** data type. + > Automatic partitioning requires converting an INT64 column to a UNIX timestamp, which `GENERATE_ARRAY` doesn't natively support. + > With a 10,000 partition limit, we can’t cover the full INT64 range. + > Instead, we set 86,400 second boundaries to enable daily partitioning. + > This captures typical values, but extremely large/small outliers go to an `__UNPARTITIONED__` catch-all partition. + +* `cluster` - creates a cluster column(s). Many columns per table are supported and only when a new table is created. ## Staging Support BigQuery supports gcs as a file staging destination. dlt will upload files in the parquet format to gcs and ask BigQuery to copy their data directly into the db. Please refer to the [Google Storage filesystem documentation](./filesystem.md#google-storage) to learn how to set up your gcs bucket with the bucket_url and credentials. If you use the same service account for gcs and your redshift deployment, you do not need to provide additional authentication for BigQuery to be able to read from your bucket. -```toml -``` Alternatively to parquet files, you can also specify jsonl as the staging file format. For this set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md new file mode 100644 index 0000000000..120ebfb6cd --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md @@ -0,0 +1,111 @@ +--- + +title: Databricks +description: Databricks `dlt` destination +keywords: [Databricks, destination, data warehouse] + +--- + +# Databricks +*Big thanks to Evan Phillips and [swishbi.com](https://swishbi.com/) for contributing code, time and test environment* + +## Install dlt with Databricks +**To install the DLT library with Databricks dependencies:** +``` +pip install dlt[databricks] +``` + +## Setup Guide + +**1. Initialize a project with a pipeline that loads to Databricks by running** +``` +dlt init chess databricks +``` + +**2. Install the necessary dependencies for Databricks by running** +``` +pip install -r requirements.txt +``` +This will install dlt with **databricks** extra which contains Databricks Python dbapi client. + +**4. Enter your credentials into `.dlt/secrets.toml`.** + +This should have your connection parameters and your personal access token. 
+
+It should now look like:
+
+```toml
+[destination.databricks.credentials]
+server_hostname = "MY_DATABRICKS.azuredatabricks.net"
+http_path = "/sql/1.0/warehouses/12345"
+access_token = "MY_ACCESS_TOKEN"
+catalog = "my_catalog"
+```
+
+## Write disposition
+All write dispositions are supported.
+
+## Data loading
+Data is loaded using `INSERT VALUES` statements by default.
+
+Efficient loading from a staging filesystem is also supported by configuring an Amazon S3 or Azure Blob Storage bucket as a staging destination. When staging is enabled, `dlt` will upload data in `parquet` files to the bucket and then use `COPY INTO` statements to ingest the data into Databricks.
+For more information on staging, see the [staging support](#staging-support) section below.
+
+## Supported file formats
+* [insert-values](../file-formats/insert-format.md) is used by default
+* [jsonl](../file-formats/jsonl.md) is supported when staging is enabled (see limitations below)
+* [parquet](../file-formats/parquet.md) is supported when staging is enabled
+
+The `jsonl` format has some limitations when used with Databricks:
+
+1. Compression must be disabled to load jsonl files into Databricks. Set `data_writer.disable_compression` to `true` in the dlt config when using this format.
+2. The following data types are not supported when using the `jsonl` format with `databricks`: `decimal`, `complex`, `date`, `binary`. Use `parquet` if your data contains these types.
+3. The `bigint` data type with precision is not supported with the `jsonl` format.
+
+## Staging support
+
+Databricks supports both Amazon S3 and Azure Blob Storage as staging locations. `dlt` will upload files in `parquet` format to the staging location and will instruct Databricks to load data from there.
+
+### Databricks and Amazon S3
+
+Please refer to the [S3 documentation](./filesystem.md#aws-s3) for details on connecting your S3 bucket with the bucket_url and credentials.
+
+Example to set up Databricks with S3 as a staging destination:
+
+```python
+import dlt
+
+# Create a dlt pipeline that will load
+# chess player data to the Databricks destination
+# via staging on S3
+pipeline = dlt.pipeline(
+    pipeline_name='chess_pipeline',
+    destination='databricks',
+    staging=dlt.destinations.filesystem('s3://your-bucket-name'), # add this to activate the staging location
+    dataset_name='player_data',
+)
+```
+
+### Databricks and Azure Blob Storage
+
+Refer to the [Azure Blob Storage filesystem documentation](./filesystem.md#azure-blob-storage) for details on connecting your Azure Blob Storage container with the bucket_url and credentials.
+
+Example to set up Databricks with Azure Blob Storage as a staging destination:
+
+```python
+# Create a dlt pipeline that will load
+# chess player data to the Databricks destination
+# via staging on Azure Blob Storage
+pipeline = dlt.pipeline(
+    pipeline_name='chess_pipeline',
+    destination='databricks',
+    staging=dlt.destinations.filesystem('az://your-container-name'), # add this to activate the staging location
+    dataset_name='player_data'
+)
+```
+
+### dbt support
+This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-databricks](https://github.com/databricks/dbt-databricks).
+
+### Syncing of `dlt` state
+This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination).
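Relatedly, the `jsonl` staging limitations above require compression to be switched off; a minimal `config.toml` sketch (assuming the usual `normalize.data_writer` section for the `data_writer.disable_compression` setting mentioned in the limitations list) could look like:

```toml
# assumption: this section holds the data_writer settings referenced above;
# it disables gzip compression so Databricks can ingest the jsonl files
[normalize.data_writer]
disable_compression=true
```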
diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index f8dd4c98c5..7b32132361 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -1,7 +1,9 @@ # Filesystem & buckets -Filesystem destination stores data in remote file systems and bucket storages like **S3**, **google storage** or **azure blob storage**. Underneath it uses [fsspec](https://github.com/fsspec/filesystem_spec) to abstract file operations. Its primary role is to be used as a staging for other destinations but you can also quickly build a data lake with it. +Filesystem destination stores data in remote file systems and bucket storages like **S3**, **google storage** or **azure blob storage**. +Underneath, it uses [fsspec](https://github.com/fsspec/filesystem_spec) to abstract file operations. +Its primary role is to be used as a staging for other destinations, but you can also quickly build a data lake with it. -> 💡 Please read the notes on the layout of the data files. Currently we are getting feedback on it. Please join our slack (icon at the top of the page) and help us to find the optimal layout. +> 💡 Please read the notes on the layout of the data files. Currently, we are getting feedback on it. Please join our Slack (icon at the top of the page) and help us find the optimal layout. ## Install dlt with filesystem **To install the DLT library with filesystem dependencies:** @@ -24,15 +26,15 @@ so pip does not fail on backtracking. ## Setup Guide -### 1. Initialize the dlt project +### 1. Initialise the dlt project -Let's start by initializing a new dlt project as follows: +Let's start by initialising a new dlt project as follows: ```bash dlt init chess filesystem ``` > 💡 This command will initialise your pipeline with chess as the source and the AWS S3 filesystem as the destination. -### 2. Setup bucket storage and credentials +### 2. Set up bucket storage and credentials #### AWS S3 The command above creates sample `secrets.toml` and requirements file for AWS S3 bucket. You can install those dependencies by running: @@ -50,13 +52,15 @@ aws_access_key_id = "please set me up!" # copy the access key here aws_secret_access_key = "please set me up!" # copy the secret access key here ``` -If you have your credentials stored in `~/.aws/credentials` just remove the **[destination.filesystem.credentials]** section above and `dlt` will fall back to your **default** profile in local credentials. If you want to switch the profile, pass the profile name as follows (here: `dlt-ci-user`): +If you have your credentials stored in `~/.aws/credentials` just remove the **[destination.filesystem.credentials]** section above +and `dlt` will fall back to your **default** profile in local credentials. +If you want to switch the profile, pass the profile name as follows (here: `dlt-ci-user`): ```toml [destination.filesystem.credentials] profile_name="dlt-ci-user" ``` -You can also pass an aws region: +You can also pass an AWS region: ```toml [destination.filesystem.credentials] region_name="eu-central-1" @@ -64,15 +68,15 @@ region_name="eu-central-1" You need to create a S3 bucket and a user who can access that bucket. `dlt` is not creating buckets automatically. -1. You can create the S3 bucket in AWS console by clicking on "Create Bucket" in S3 and assigning appropriate name and permissions to the bucket. -2. Once the bucket is created you'll have the bucket URL. 
For example, If the bucket name is `dlt-ci-test-bucket`, then the bucket URL will be: +1. You can create the S3 bucket in the AWS console by clicking on "Create Bucket" in S3 and assigning the appropriate name and permissions to the bucket. +2. Once the bucket is created, you'll have the bucket URL. For example, If the bucket name is `dlt-ci-test-bucket`, then the bucket URL will be: ``` s3://dlt-ci-test-bucket ``` 3. To grant permissions to the user being used to access the S3 bucket, go to the IAM > Users, and click on “Add Permissions”. -4. Below you can find a sample policy that gives a minimum permissions required by `dlt` to a bucket we created above. The policy contains permissions to list files in a bucket, get, put and delete objects. **Remember to place your bucket name in Resource section of the policy!** +4. Below you can find a sample policy that gives a minimum permission required by `dlt` to a bucket we created above. The policy contains permissions to list files in a bucket, get, put and delete objects. **Remember to place your bucket name in Resource section of the policy!** ```json { @@ -113,10 +117,22 @@ aws_secret_access_key = "please set me up!" # copy the secret access key here endpoint_url = "https://.r2.cloudflarestorage.com" # copy your endpoint URL here ``` +##### Adding Additional Configuration + +To pass any additional arguments to `fsspec`, you may supply `kwargs` and `client_kwargs` in the config as a **stringified dictionary**: + +```toml +[destination.filesystem] +kwargs = '{"use_ssl": true}' +client_kwargs = '{"verify": "public.crt"}' +``` + #### Google Storage Run `pip install dlt[gs]` which will install `gcfs` package. -To edit the `dlt` credentials file with your secret info, open `.dlt/secrets.toml`. You'll see AWS credentials by default. Use google cloud credentials that you may know from [BigQuery destination](bigquery.md) +To edit the `dlt` credentials file with your secret info, open `.dlt/secrets.toml`. +You'll see AWS credentials by default. +Use Google cloud credentials that you may know from [BigQuery destination](bigquery.md) ```toml [destination.filesystem] bucket_url = "gs://[your_bucket_name]" # replace with your bucket name, @@ -127,9 +143,9 @@ private_key = "private_key" # please set me up! client_email = "client_email" # please set me up! ``` -> 💡 Note that you can share the same credentials with BigQuery, just replace the **[destination.filesystem.credentials]** section with less specific one: **[destination.credentials]** which applies to both destinations +> 💡 Note that you can share the same credentials with BigQuery, replace the **[destination.filesystem.credentials]** section with less specific one: **[destination.credentials]** which applies to both destinations -if you have default google cloud credentials in your environment (ie. on cloud function) just remove the credentials sections above and `dlt` will fallback to the available default. +if you have default google cloud credentials in your environment (i.e. on cloud function) remove the credentials sections above and `dlt` will fall back to the available default. Use **Cloud Storage** admin to create a new bucket. Then assign the **Storage Object Admin** role to your service account. @@ -150,22 +166,23 @@ azure_storage_account_key = "account_key" # please set me up! azure_storage_sas_token = "sas_token" # please set me up! ``` -If you have the correct Azure credentials set up on your machine (e.g. 
via azure cli) you can omit both `azure_storage_account_key` and `azure_storage_sas_token` and `dlt` will fallback to the available default. +If you have the correct Azure credentials set up on your machine (e.g. via azure cli), +you can omit both `azure_storage_account_key` and `azure_storage_sas_token` and `dlt` will fall back to the available default. Note that `azure_storage_account_name` is still required as it can't be inferred from the environment. #### Local file system -If for any reason you want to have those files in local folder, setup the `bucket_url` as follows (you are free to use `config.toml` for that as there are no secrets required) +If for any reason you want to have those files in local folder, set up the `bucket_url` as follows (you are free to use `config.toml` for that as there are no secrets required) ```toml [destination.filesystem] bucket_url = "file:///absolute/path" # three / for absolute path -# bucket_url = "file://relative/path" # two / for relative path +# bucket_url = "file://relative/path" # two / for a relative path ``` ## Write disposition `filesystem` destination handles the write dispositions as follows: - `append` - files belonging to such tables are added to dataset folder -- `replace` - all files that belong to such tables are deleted from dataset folder and then current set of files is added. +- `replace` - all files that belong to such tables are deleted from dataset folder, and then the current set of files is added. - `merge` - falls back to `append` ## Data loading @@ -177,10 +194,10 @@ All the files are stored in a single folder with the name of the dataset that yo The name of each file contains essential metadata on the content: -- **schema_name** and **table_name** identify the [schema](../../general-usage/schema.md) and table that define the file structure (column names, data types etc.) +- **schema_name** and **table_name** identify the [schema](../../general-usage/schema.md) and table that define the file structure (column names, data types, etc.) - **load_id** is the [id of the load package](../../general-usage/destination-tables.md#load-packages-and-load-ids) form which the file comes from. - **file_id** is there are many files with data for a single table, they are copied with different file id. -- **ext** a format of the file ie. `jsonl` or `parquet` +- **ext** a format of the file i.e. `jsonl` or `parquet` Current default layout: **{table_name}/{load_id}.{file_id}.{ext}`** @@ -195,9 +212,9 @@ layout="{table_name}/{load_id}.{file_id}.{ext}" # current preconfigured naming s ``` A few things to know when specifying your filename layout: -- If you want a different basepath that is common to all filenames, you can suffix your `bucket_url` rather than prefix your `layout` setting. +- If you want a different base path that is common to all filenames, you can suffix your `bucket_url` rather than prefix your `layout` setting. - If you do not provide the `{ext}` placeholder, it will automatically be added to your layout at the end with a dot as separator. -- It is best practice to have a separator between each placeholder. Separators can be any character allowed as a filename character but dots, dashes and forward slashes are most common. +- It is the best practice to have a separator between each placeholder. Separators can be any character allowed as a filename character, but dots, dashes and forward slashes are most common. 
- When you are using the `replace` disposition, `dlt`` will have to be able to figure out the correct files to delete before loading the new data. For this to work, you have to - include the `{table_name}` placeholder in your layout @@ -206,7 +223,7 @@ to work, you have to Please note: - `dlt` will not dump the current schema content to the bucket -- `dlt` will mark complete loads by creating an empty file that corresponds to `_dlt_loads` table. For example if `chess._dlt_loads.1685299832` file is present in dataset folders, you can be sure that all files for the load package `1685299832` are completely loaded +- `dlt` will mark complete loads by creating an empty file that corresponds to `_dlt_loads` table. For example, if `chess._dlt_loads.1685299832` file is present in dataset folders, you can be sure that all files for the load package `1685299832` are completely loaded ## Supported file formats You can choose the following file formats: diff --git a/docs/website/docs/dlt-ecosystem/destinations/mssql.md b/docs/website/docs/dlt-ecosystem/destinations/mssql.md index ff66d67d55..e98f8bf256 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/mssql.md +++ b/docs/website/docs/dlt-ecosystem/destinations/mssql.md @@ -16,16 +16,14 @@ pip install dlt[mssql] ### Prerequisites -Microsoft ODBC driver for SQL Server must be installed to use this destination. -This can't be included with `dlt`s python dependencies so you must installed it separately on your system. +_Microsoft ODBC Driver for SQL Server_ must be installed to use this destination. +This can't be included with `dlt`'s python dependencies, so you must install it separately on your system. You can find the official installation instructions [here](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16). -See instructions here to [install Microsoft ODBC Driver 18 for SQL Server on Windows, Mac and Linux](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16) +Supported driver versions: +* `ODBC Driver 18 for SQL Server` +* `ODBC Driver 17 for SQL Server` -Following ODBC drivers are supported: -* ODBC Driver 18 for SQL Server -* ODBC Driver 17 for SQL Server - -[You can configure driver name explicitly](#additional-destination-options) as well. +You can [configure driver name](#additional-destination-options) explicitly as well. ### Create a pipeline @@ -42,7 +40,7 @@ or run: ``` pip install dlt[mssql] ``` -This will install dlt with **mssql** extra which contains all the dependencies required by the sql server client. +This will install dlt with **mssql** extra which contains all the dependencies required by the SQL server client. **3. Enter your credentials into `.dlt/secrets.toml`.** diff --git a/docs/website/docs/dlt-ecosystem/destinations/synapse.md b/docs/website/docs/dlt-ecosystem/destinations/synapse.md new file mode 100644 index 0000000000..8c1a7b29bc --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/destinations/synapse.md @@ -0,0 +1,204 @@ +--- +title: Azure Synapse +description: Azure Synapse `dlt` destination +keywords: [synapse, destination, data warehouse] +--- + +# Synapse + +## Install dlt with Synapse +**To install the DLT library with Synapse dependencies:** +``` +pip install dlt[synapse] +``` + +## Setup guide + +### Prerequisites + +* **Microsoft ODBC Driver for SQL Server** + + _Microsoft ODBC Driver for SQL Server_ must be installed to use this destination. 
+ This can't be included with `dlt`'s python dependencies, so you must install it separately on your system. You can find the official installation instructions [here](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16). + + Supported driver versions: + * `ODBC Driver 18 for SQL Server` + + > 💡 Older driver versions don't properly work, because they don't support the `LongAsMax` keyword that got [introduced](https://learn.microsoft.com/en-us/sql/connect/odbc/windows/features-of-the-microsoft-odbc-driver-for-sql-server-on-windows?view=sql-server-ver15#microsoft-odbc-driver-180-for-sql-server-on-windows) in `ODBC Driver 18 for SQL Server`. Synapse does not support the legacy ["long data types"](https://learn.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql), and requires "max data types" instead. `dlt` uses the `LongAsMax` keyword to automatically do the conversion. +* **Azure Synapse Workspace and dedicated SQL pool** + + You need an Azure Synapse workspace with a dedicated SQL pool to load data into. If you don't have one yet, you can use this [quickstart](https://learn.microsoft.com/en-us/azure/synapse-analytics/quickstart-create-sql-pool-studio). + +### Steps + +**1. Initialize a project with a pipeline that loads to Synapse by running** +``` +dlt init chess synapse +``` + +**2. Install the necessary dependencies for Synapse by running** +``` +pip install -r requirements.txt +``` +This will install `dlt` with the **synapse** extra that contains all dependencies required for the Synapse destination. + +**3. Create a loader user** + +Execute the following SQL statements to set up the [loader](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql/data-loading-best-practices#create-a-loading-user) user. Change the password and replace `yourpool` with the name of your dedicated SQL pool: +```sql +-- on master database, using a SQL admin account + +CREATE LOGIN loader WITH PASSWORD = 'your_loader_password'; +``` + +```sql +-- on yourpool database + +CREATE USER loader FOR LOGIN loader; + +-- DDL permissions +GRANT CREATE SCHEMA ON DATABASE :: yourpool TO loader; +GRANT CREATE TABLE ON DATABASE :: yourpool TO loader; +GRANT CREATE VIEW ON DATABASE :: yourpool TO loader; + +-- DML permissions +GRANT ADMINISTER DATABASE BULK OPERATIONS TO loader; -- only required when loading from staging Storage Account +``` + +Optionally, you can create a `WORKLOAD GROUP` and add the `loader` user as a member to manage [workload isolation](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-workload-isolation). See the [instructions](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql/data-loading-best-practices#create-a-loading-user) on setting up a loader user for an example of how to do this. + +**3. Enter your credentials into `.dlt/secrets.toml`.** + +Example, replace with your database connection info: +```toml +[destination.synapse.credentials] +database = "yourpool" +username = "loader" +password = "your_loader_password" +host = "your_synapse_workspace_name.sql.azuresynapse.net" +``` + +Equivalently, you can also pass a connection string as follows: + +```toml +# keep it at the top of your toml file! 
before any section starts +destination.synapse.credentials = "synapse://loader:your_loader_password@your_synapse_workspace_name.azuresynapse.net/yourpool" +``` + +To pass credentials directly you can use the `credentials` argument of `dlt.destinations.synapse(...)`: +```python +pipeline = dlt.pipeline( + pipeline_name='chess', + destination=dlt.destinations.synapse( + credentials='synapse://loader:your_loader_password@your_synapse_workspace_name.azuresynapse.net/yourpool' + ), + dataset_name='chess_data' +) +``` + +## Write disposition +All write dispositions are supported + +If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and replaced by the staging tables with an `ALTER SCHEMA ... TRANSFER` command. Please note that this operation is **not** atomic—it involves multiple DDL commands and Synapse does not support DDL transactions. + +## Data loading +Data is loaded via `INSERT` statements by default. + +> 💡 Multi-row `INSERT INTO ... VALUES` statements are **not** possible in Synapse, because it doesn't support the [Table Value Constructor](https://learn.microsoft.com/en-us/sql/t-sql/queries/table-value-constructor-transact-sql). `dlt` uses `INSERT INTO ... SELECT ... UNION` statements as described [here](https://stackoverflow.com/a/73579830) to work around this limitation. + +## Supported file formats +* [insert-values](../file-formats/insert-format.md) is used by default +* [parquet](../file-formats/parquet.md) is used when [staging](#staging-support) is enabled + +## Data type limitations +* **Synapse cannot load `TIME` columns from `parquet` files**. `dlt` will fail such jobs permanently. Use the `insert_values` file format instead, or convert `datetime.time` objects to `str` or `datetime.datetime`, to load `TIME` columns. +* **Synapse does not have a complex/JSON/struct data type**. The `dlt` `complex` data type is mapped to the `nvarchar` type in Synapse. + +## Table index type +The [table index type](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index) of the created tables can be configured at the resource level with the `synapse_adapter`: + +```python +info = pipeline.run( + synapse_adapter( + data=your_resource, + table_index_type="clustered_columnstore_index", + ) +) +``` + +Possible values: +* `heap`: create [HEAP](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables) tables that do not have an index **(default)** +* `clustered_columnstore_index`: create [CLUSTERED COLUMNSTORE INDEX](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#clustered-columnstore-indexes) tables + + +> ❗ Important: +>* **Set `default_table_index_type` to `"clustered_columnstore_index"` if you want to change the default** (see [additional destination options](#additional-destination-options)). +>* **CLUSTERED COLUMNSTORE INDEX tables do not support the `varchar(max)`, `nvarchar(max)`, and `varbinary(max)` data types.** If you don't specify the `precision` for columns that map to any of these types, `dlt` will use the maximum lengths `varchar(4000)`, `nvarchar(4000)`, and `varbinary(8000)`. +>* **While Synapse creates CLUSTERED COLUMNSTORE INDEXES by default, `dlt` creates HEAP tables by default.** HEAP is a more robust choice, because it supports all data types and doesn't require conversions. 
+>* **When using the `insert-from-staging` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are always created as HEAP tables**—any configuration of the table index types is ignored. The HEAP strategy makes sense
+  for staging tables for reasons explained [here](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables).
+>* **When using the `staging-optimized` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are already created with the configured table index type**, because the staging table becomes the final table.
+>* **`dlt` system tables are always created as HEAP tables, regardless of any configuration.** This is in line with Microsoft's recommendation that "for small lookup tables, less than 60 million rows, consider using HEAP or clustered index for faster query performance."
+>* Child tables, if any, inherit the table index type of their parent table.
+
+## Supported column hints
+
+Synapse supports the following [column hints](https://dlthub.com/docs/general-usage/schema#tables-and-columns):
+
+* `primary_key` - creates a `PRIMARY KEY NONCLUSTERED NOT ENFORCED` constraint on the column
+* `unique` - creates a `UNIQUE NOT ENFORCED` constraint on the column
+
+> ❗ These hints are **disabled by default**. This is because the `PRIMARY KEY` and `UNIQUE` [constraints](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-table-constraints) are tricky in Synapse: they are **not enforced** and can lead to inaccurate results if the user does not ensure all column values are unique. For the column hints to take effect, the `create_indexes` configuration needs to be set to `True`, see [additional destination options](#additional-destination-options).
+
+## Staging support
+Synapse supports Azure Blob Storage (both standard and [ADLS Gen2](https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction)) as a file staging destination. `dlt` first uploads Parquet files to the blob container, and then instructs Synapse to read the Parquet file and load its data into a Synapse table using the [COPY INTO](https://learn.microsoft.com/en-us/sql/t-sql/statements/copy-into-transact-sql) statement.
+
+Please refer to the [Azure Blob Storage filesystem documentation](./filesystem.md#azure-blob-storage) to learn how to configure credentials for the staging destination. By default, `dlt` will use these credentials for both the write into the blob container, and the read from it to load into Synapse. Managed Identity authentication can be enabled through the `staging_use_msi` option (see [additional destination options](#additional-destination-options)).
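For instance, a minimal sketch of switching staging authentication to the workspace's Managed Identity (the `staging_use_msi` flag is the same one listed under the additional destination options below):

```toml
# assumes the Managed Identity has the Storage Blob Data Reader role on the container
[destination.synapse]
staging_use_msi = "true"
```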
+ +To run Synapse with staging on Azure Blob Storage: + +```python +# Create a dlt pipeline that will load +# chess player data to the snowflake destination +# via staging on Azure Blob Storage +pipeline = dlt.pipeline( + pipeline_name='chess_pipeline', + destination='synapse', + staging='filesystem', # add this to activate the staging location + dataset_name='player_data' +) +``` + +## Additional destination options +The following settings can optionally be configured: +```toml +[destination.synapse] +default_table_index_type = "heap" +create_indexes = "false" +staging_use_msi = "false" + +[destination.synapse.credentials] +port = "1433" +connect_timeout = 15 +``` + +`port` and `connect_timeout` can also be included in the connection string: + +```toml +# keep it at the top of your toml file! before any section starts +destination.synapse.credentials = "synapse://loader:your_loader_password@your_synapse_workspace_name.azuresynapse.net:1433/yourpool?connect_timeout=15" +``` + +Descriptions: +- `default_table_index_type` sets the [table index type](#table-index-type) that is used if no table index type is specified on the resource. +- `create_indexes` determines if `primary_key` and `unique` [column hints](#supported-column-hints) are applied. +- `staging_use_msi` determines if the Managed Identity of the Synapse workspace is used to authorize access to the [staging](#staging-support) Storage Account. Ensure the Managed Identity has the [Storage Blob Data Reader](https://learn.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#storage-blob-data-reader) role (or a higher-priviliged role) assigned on the blob container if you set this option to `"true"`. +- `port` used for the ODBC connection. +- `connect_timeout` sets the timeout for the `pyodbc` connection attempt, in seconds. + +### dbt support +Integration with [dbt](../transformations/dbt/dbt.md) is currently not supported. + +### Syncing of `dlt` state +This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). + diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md index 01c16e15b4..b2b6b27fc3 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md @@ -1,16 +1,16 @@ --- -title: Transforming data with dbt +title: Transform the data with dbt description: Transforming the data loaded by a dlt pipeline with dbt keywords: [transform, dbt, runner] --- -# Transforming data using dbt +# Transform the data with dbt [dbt](https://github.com/dbt-labs/dbt-core) is a framework that allows simple structuring of your transformations into DAGs. The benefits of using dbt include: - End-to-end cross-db compatibility for dlt→dbt pipelines. -- Easy to use by sql analysts, low learning curve. +- Easy to use by SQL analysts, low learning curve. - Highly flexible and configurable in usage, supports templating, can run backfills etc. - Supports testing and accelerates troubleshooting. 
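As a sketch of how such a dbt package is typically invoked from dlt, the dbt runner helper can execute it against the pipeline's destination; the package path below is a placeholder:

```python
import dlt

# pipeline that already loaded the raw data; dbt transforms it in the same destination
pipeline = dlt.pipeline(
    pipeline_name="chess_pipeline", destination="duckdb", dataset_name="chess_data"
)

# provision a virtual env with dbt and run all models in the package
venv = dlt.dbt.get_venv(pipeline)
dbt = dlt.dbt.package(pipeline, "path/to/dbt_package", venv=venv)
models = dbt.run_all()
for m in models:
    print(f"{m.model_name}: {m.status} ({m.message})")
```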
diff --git a/docs/website/docs/dlt-ecosystem/transformations/pandas.md b/docs/website/docs/dlt-ecosystem/transformations/pandas.md index 3ca7777e02..6ab98090ba 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/pandas.md +++ b/docs/website/docs/dlt-ecosystem/transformations/pandas.md @@ -1,10 +1,10 @@ --- -title: Transforming data with Pandas -description: Transforming the data loaded by a dlt pipeline with Pandas +title: Transform the data with Pandas +description: Transform the data loaded by a dlt pipeline with Pandas keywords: [transform, pandas] --- -# Transforming the data using Pandas +# Transform the data with Pandas You can fetch results of any SQL query as a dataframe. If the destination is supporting that natively (i.e. BigQuery and DuckDB), `dlt` uses the native method. Thanks to that, reading diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md index 37b73e1e9e..cc1576229b 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/sql.md +++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md @@ -1,10 +1,10 @@ --- -title: Transforming data with SQL -description: Transforming the data loaded by a dlt pipeline with SQL client +title: Transform the data with SQL +description: Transforming the data loaded by a dlt pipeline with the dlt SQL client keywords: [transform, sql] --- -# Transforming data using the `dlt` SQL client +# Transform the data using the `dlt` SQL client A simple alternative to dbt is to query the data using the `dlt` SQL client and then performing the transformations using Python. The `execute_sql` method allows you to execute any SQL statement, diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md b/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md index a6b27b6d0a..570f4c2b80 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md @@ -82,8 +82,7 @@ To get started with your data pipeline, follow these steps: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. -For more information, read the -[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source) +For more information, read the guide on [how to add a verified source.](../../walkthroughs/add-a-verified-source). ### Add credentials @@ -137,7 +136,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage For example, the `pipeline_name` for the above pipeline example is `airtable`, you may also use any custom name instead. -For more information, read the [Walkthrough: Run a pipeline](../../walkthroughs/run-a-pipeline). +For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/asana.md b/docs/website/docs/dlt-ecosystem/verified-sources/asana.md index cf4d4598b9..e4d838935a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/asana.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/asana.md @@ -71,8 +71,7 @@ To get started with your data pipeline, follow these steps: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. 
-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).
 
 ### Add credentials
 
@@ -110,7 +109,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
 For example, the `pipeline_name` for the above pipeline example is `asana`, you may also use any
 custom name instead.
 
-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).
 
 ## Sources and resources
 
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/chess.md b/docs/website/docs/dlt-ecosystem/verified-sources/chess.md
index e3da6fae78..bf20056881 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/chess.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/chess.md
@@ -51,8 +51,7 @@ To get started with your data pipeline, follow these steps:
 1. After running this command, a new directory will be created with the necessary files and
    configuration settings to get started.
 
-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source.md)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source.md).
 
 ### Add credentials
 
@@ -87,7 +86,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
 For example, the `pipeline_name` for the above pipeline example is `chess_pipeline`, you may also
 use any custom name instead.
 
-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).
 
 ## Sources and resources
 
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md b/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md
index 2c2cf4304c..c01845600a 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md
@@ -116,8 +116,7 @@ To get started with your data pipeline, follow these steps:
 1. After running this command, a new directory will be created with the necessary files and
    configuration settings to get started.
 
-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).
 
 ### Add credential
 
@@ -174,7 +173,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
 For example, the `pipeline_name` for the above pipeline example is `facebook_ads`, you may also
 use any custom name instead.
 
-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).
 
 ## Sources and resources
 
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/github.md b/docs/website/docs/dlt-ecosystem/verified-sources/github.md
index 58f1a26499..23194bedf6 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/github.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/github.md
@@ -82,8 +82,7 @@ To get started with your data pipeline, follow these steps:
 1.
After running this command, a new directory will be created with the necessary files and configuration settings to get started. -For more information, read the -[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source) +For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source). ### Add credentials @@ -126,7 +125,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage For example, the `pipeline_name` for the above pipeline example is `github_reactions`, you may also use any custom name instead. -For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline) +For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md index 3976cc022c..396a671b05 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md @@ -143,8 +143,7 @@ To get started with your data pipeline, follow these steps: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. -For more information, read the -[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source) +For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source). ### Add credentials @@ -230,7 +229,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage For example, the `pipeline_name` for the above pipeline example is `dlt_google_analytics_pipeline`, you may also use any custom name instead. -For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline) +For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md index e68edbdd73..ea0acc6824 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md @@ -231,8 +231,7 @@ To get started with your data pipeline, follow these steps: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. -For more information, read the -[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source) +For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source). ### Add credentials @@ -319,7 +318,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage For example, the `pipeline_name` for the above pipeline example is `google_sheets_pipeline`, you may also use any custom name instead. -For more information, read the [Walkthrough: Run a pipeline](../../walkthroughs/run-a-pipeline). +For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). 
 ## Data types
 
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md b/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md
index 48886e54c5..a3a6c6d702 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md
@@ -89,8 +89,7 @@ To get started with your data pipeline, follow these steps:
 1. After running this command, a new directory will be created with the necessary files and
    configuration settings to get started.
 
-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).
 
 ### Add credentials
 
@@ -131,7 +130,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
 For example, the `pipeline_name` for the above pipeline example is `hubspot_pipeline`, you may also
 use any custom name instead.
 
-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).
 
 ## Sources and resources
 
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/jira.md b/docs/website/docs/dlt-ecosystem/verified-sources/jira.md
index 1129bfb4dd..ae826d1754 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/jira.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/jira.md
@@ -66,8 +66,7 @@ To get started with your data pipeline, follow these steps:
 1. After running this command, a new directory will be created with the necessary files and
    configuration settings to get started.
 
-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).
 
 ### Add credentials
 
@@ -118,7 +117,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
 For example, the `pipeline_name` for the above pipeline example is `jira_pipeline`, you may also
 use any custom name instead.
 
-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).
 
 ## Sources and resources
 
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md b/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md
new file mode 100644
index 0000000000..1d09907d0a
--- /dev/null
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md
@@ -0,0 +1,193 @@
+---
+title: Kafka
+description: dlt verified source for Confluent Kafka
+keywords: [kafka api, kafka verified source, kafka]
+---
+
+# Kafka
+
+:::info Need help deploying these sources, or figuring out how to run them in your data stack?
+
+[Join our Slack community](https://join.slack.com/t/dlthub-community/shared_invite/zt-1n5193dbq-rCBmJ6p~ckpSFK4hCF2dYA)
+or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian.
+:::
+
+[Kafka](https://www.confluent.io/) is an open-source distributed event streaming platform, organized
+in the form of a log with message publishers and subscribers.
+The Kafka `dlt` verified source loads data from Kafka topics using the Confluent Kafka API into the destination of your choice;
+see the [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/kafka_pipeline.py).
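+
+As a quick orientation before the setup steps, here is a minimal sketch of an end-to-end load with this source. It assumes the project layout created by `dlt init kafka duckdb` (described below), and the topic names are placeholders:
+
+```python
+import dlt
+
+from kafka import kafka_consumer  # resource shipped with this verified source
+
+# placeholder topic names - replace with topics that exist in your cluster
+topics = ["topic1", "topic2"]
+
+pipeline = dlt.pipeline(
+    pipeline_name="kafka",
+    destination="duckdb",
+    dataset_name="kafka_data",
+)
+
+# credentials are read from .dlt/secrets.toml (see "Add credentials" below)
+load_info = pipeline.run(kafka_consumer(topics))
+print(load_info)
+```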
+
+The resource that can be loaded:
+
+| Name           | Description                         |
+| -------------- | ----------------------------------- |
+| kafka_consumer | Extracts messages from Kafka topics |
+
+## Setup Guide
+
+### Grab Kafka cluster credentials
+
+1. Follow the [Kafka Setup](https://developer.confluent.io/get-started/python/#kafka-setup) to set up a
+project.
+1. Follow the [Configuration](https://developer.confluent.io/get-started/python/#configuration) to
+get the project credentials.
+
+### Initialize the verified source
+
+To get started with your data pipeline, follow these steps:
+
+1. Enter the following command:
+
+   ```bash
+   dlt init kafka duckdb
+   ```
+
+   [This command](../../reference/command-line-interface) will initialize
+   [the pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/kafka_pipeline.py)
+   with Kafka as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md)
+   as the [destination](../destinations).
+
+1. If you'd like to use a different destination, simply replace `duckdb` with the name of your
+   preferred [destination](../destinations).
+
+1. After running this command, a new directory will be created with the necessary files and
+   configuration settings to get started.
+
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).
+
+### Add credentials
+
+1. In the `.dlt` folder, there's a file called `secrets.toml`. It's where you store sensitive
+   information securely, like access tokens. Keep this file safe.
+
+   Use the following format for the Kafka credentials:
+
+```toml
+[sources.kafka.credentials]
+bootstrap_servers="web.address.gcp.confluent.cloud:9092"
+group_id="test_group"
+security_protocol="SASL_SSL"
+sasl_mechanisms="PLAIN"
+sasl_username="example_username"
+sasl_password="example_secret"
+```
+
+2. Enter credentials for your chosen destination as per the [docs](../destinations/).
+
+## Run the pipeline
+
+1. Before running the pipeline, ensure that you have installed all the necessary dependencies by
+   running the command:
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+1. You're now ready to run the pipeline! To get started, run the following command:
+
+   ```bash
+   python kafka_pipeline.py
+   ```
+
+1. Once the pipeline has finished running, you can verify that everything loaded correctly by using
+   the following command:
+
+   ```bash
+   dlt pipeline <pipeline_name> show
+   ```
+
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).
+
+## Sources and resources
+
+`dlt` works on the principle of [sources](../../general-usage/source) and
+[resources](../../general-usage/resource).
+
+### Source `kafka_consumer`
+
+This function retrieves messages from the given Kafka topics.
+
+```python
+@dlt.resource(name="kafka_messages", table_name=lambda msg: msg["_kafka"]["topic"], standalone=True)
+def kafka_consumer(
+    topics: Union[str, List[str]],
+    credentials: Union[KafkaCredentials, Consumer] = dlt.secrets.value,
+    msg_processor: Optional[Callable[[Message], Dict[str, Any]]] = default_msg_processor,
+    batch_size: Optional[int] = 3000,
+    batch_timeout: Optional[int] = 3,
+    start_from: Optional[TAnyDateTime] = None,
+) -> Iterable[TDataItem]:
+```
+
+`topics`: A list of Kafka topics to be extracted.
+
+`credentials`: By default, it is initialized with the data from
+`secrets.toml`. It can also be used to pass an initialized
+Kafka `Consumer` object explicitly.
+
+`msg_processor`: A function that is used to process every message
+read from the given topics before saving it in the destination.
+Can be set explicitly to pass a custom processor. See the
+[default processor](https://github.com/dlt-hub/verified-sources/blob/fe8ed7abd965d9a0ca76d100551e7b64a0b95744/sources/kafka/helpers.py#L14-L50)
+as an example of how to implement processors.
+
+`batch_size`: The number of messages to extract from the cluster
+at once. Can be set to tweak performance.
+
+`batch_timeout`: The maximum timeout for a single batch reading
+operation. Can be set to tweak performance.
+
+`start_from`: A timestamp from which the messages are read. When
+passed, `dlt` asks the Kafka cluster for the offset corresponding
+to the given timestamp and starts reading messages from that
+offset.
+
+
+## Customization
+
+### Create your own pipeline
+
+
+1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows:
+
+   ```python
+   pipeline = dlt.pipeline(
+       pipeline_name="kafka",  # Use a custom name if desired
+       destination="duckdb",  # Choose the appropriate destination (e.g., duckdb, redshift, postgres)
+       dataset_name="kafka_data"  # Use a custom name if desired
+   )
+   ```
+
+1. To extract several topics:
+
+   ```python
+   topics = ["topic1", "topic2", "topic3"]
+
+   source = kafka_consumer(topics)
+   pipeline.run(source, write_disposition="replace")
+   ```
+
+1. To extract messages and process them in a custom way:
+
+   ```python
+   def custom_msg_processor(msg: confluent_kafka.Message) -> Dict[str, Any]:
+       return {
+           "_kafka": {
+               "topic": msg.topic(),  # required field
+               "key": msg.key().decode("utf-8"),
+               "partition": msg.partition(),
+           },
+           "data": msg.value().decode("utf-8"),
+       }
+
+   data = kafka_consumer("topic", msg_processor=custom_msg_processor)
+   pipeline.run(data)
+   ```
+
+1. To extract messages, starting from a timestamp:
+
+   ```python
+   data = kafka_consumer("topic", start_from=pendulum.datetime(2023, 12, 15))
+   pipeline.run(data)
+   ```
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md b/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md
index 0dc7579ab1..5063c19cdb 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md
@@ -59,8 +59,7 @@ To get started with your data pipeline, follow these steps:
 1. After running this command, a new directory will be created with the necessary files and
    configuration settings to get started.
 
-For more information, read the
-[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source)
+For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source).
 
 ### Add credential
 
@@ -118,7 +117,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage
 For example, the `pipeline_name` for the above pipeline example is `matomo`, you may also use any
 custom name instead.
 
-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline)
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline).
## Sources and resources diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md b/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md index d3f3d210b2..b2841e2e43 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md @@ -130,8 +130,7 @@ To get started with your data pipeline, follow these steps: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. -For more information, read the -[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source) +For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source). ### Add credentials @@ -190,7 +189,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage For example, the `pipeline_name` for the above pipeline example is `local_mongo`, you may also use any custom name instead. -For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline) +For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/mux.md b/docs/website/docs/dlt-ecosystem/verified-sources/mux.md index 11fe30134a..4c6e878176 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/mux.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/mux.md @@ -61,8 +61,7 @@ To get started with your data pipeline, follow these steps: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. -For more information, read the -[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source) +For more information, read the guide on [how to add a verified source.](../../walkthroughs/add-a-verified-source) ### Add credentials @@ -104,7 +103,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage For example, the `pipeline_name` for the above pipeline example is `mux`, you may also use any custom name instead. -For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline) +For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/notion.md b/docs/website/docs/dlt-ecosystem/verified-sources/notion.md index dde1f65743..36f809d902 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/notion.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/notion.md @@ -65,8 +65,7 @@ To get started with your data pipeline, follow these steps: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. -For more information, read the -[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source) +For more information, read the guide on [how to add a verified source.](../../walkthroughs/add-a-verified-source) ### Add credentials @@ -109,7 +108,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage For example, the `pipeline_name` for the above pipeline example is `notion`, you may also use any custom name instead. 
-For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline) +For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md index fef8a98705..7e26999f3c 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md @@ -68,8 +68,7 @@ To get started with your data pipeline, follow these steps: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. -For more information, read the -[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source) +For more information, read the guide on [how to add a verified source.](../../walkthroughs/add-a-verified-source) ### Add credentials @@ -109,7 +108,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage For example, the `pipeline_name` for the above pipeline example is `pipedrive`, you may also use any custom name instead. -For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline) +For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md b/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md index 689b117fbc..bb09e53cc5 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md @@ -80,8 +80,7 @@ To get started with your data pipeline, follow these steps: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. -For more information, read the -[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source) +For more information, read the guide on [how to add a verified source.](../../walkthroughs/add-a-verified-source) ### Add credentials @@ -127,7 +126,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage For example, the `pipeline_name` for the above pipeline example is `salesforce`, you may also use any custom name instead. -For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline) +For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md b/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md index d198c4427a..42cd795568 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md @@ -76,8 +76,7 @@ To get started with your data pipeline, follow these steps: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. -For more information, read the -[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source) +For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source). 
### Add credential @@ -141,7 +140,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage For example, the `pipeline_name` for the above pipeline example is `shopify_data`, you may also use any custom name instead. -For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs/run-a-pipeline) +For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md index 44638fb508..9de3ebe211 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md @@ -82,8 +82,7 @@ To get started with your data pipeline, follow these steps: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. -For more information, read the -[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source) +For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source). ### Add credentials @@ -128,7 +127,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage For example, the `pipeline_name` for the above pipeline example is `slack`, you may also use any custom name instead. - For more information, read the [Walkthrough: Run a pipeline](../../walkthroughs/run-a-pipeline). + For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md index 3c01b53925..bb23132e9b 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md @@ -132,8 +132,7 @@ To get started with your data pipeline, follow these steps: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. -For more information, read the -[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source) +For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source). ### Add credentials diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md b/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md index 0266a9eaac..9715885573 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md @@ -65,8 +65,7 @@ To get started with your data pipeline, follow these steps: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. -For more information, read the -[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source) +For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source). ### Add credentials @@ -121,7 +120,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage For example, the `pipeline_name` for the above pipeline example is `strapi`, you may also use any custom name instead. -For more information, read the [Walkthrough: Run a pipeline](../../walkthroughs/run-a-pipeline). 
+For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md index 181a2a60ac..985d51616c 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md @@ -71,8 +71,7 @@ To get started with your data pipeline, follow these steps: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. -For more information, read the -[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source) +For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source). ### Add credentials @@ -117,7 +116,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage For example, the `pipeline_name` for the above pipeline example is `stripe_analytics`, you may also use any custom name instead. -For more information, read the [Walkthrough: Run a pipeline](../../walkthroughs/run-a-pipeline). +For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md index 3f2eeb1724..ab561c936a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md @@ -80,8 +80,7 @@ To get started with your data pipeline, follow these steps: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. -For more information, read the -[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source) +For more information, read the guide on [how to add a verified source.](../../walkthroughs/add-a-verified-source). ### Add credentials @@ -138,7 +137,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage For example, the `pipeline_name` for the above pipeline example is `workable`, you may also use any custom name instead. -For more information, read the [Walkthrough: Run a pipeline](../../walkthroughs/run-a-pipeline). +For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md b/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md index e5556373cf..58153be31b 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md @@ -175,8 +175,7 @@ To get started with your data pipeline, follow these steps: 1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. -For more information, read the -[Walkthrough: Add a verified source.](../../walkthroughs/add-a-verified-source) +For more information, read the guide on [how to add a verified source.](../../walkthroughs/add-a-verified-source). ### Add credentials @@ -236,7 +235,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage For example, the `pipeline_name` for the above pipeline example is `dlt_zendesk_pipeline`, you may also use any custom name instead. 
-For more information, read the [Walkthrough: Run a pipeline](../../walkthroughs/run-a-pipeline). +For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources diff --git a/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md b/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md index 25b5aa3e1c..c61805423b 100644 --- a/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md +++ b/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md @@ -1,10 +1,10 @@ --- -title: Exploring the data -description: Exploring data that has been loaded +title: Explore the loaded data +description: How to explore the data that has been loaded keywords: [exploring, loaded data, data quality] --- -# Exploring the data +# Explore the loaded data Once you have run a pipeline locally, you can launch a web app that displays the loaded data. diff --git a/docs/website/docs/examples/chess_production/index.md b/docs/website/docs/examples/chess_production/index.md index b812e47ef8..f372d26d80 100644 --- a/docs/website/docs/examples/chess_production/index.md +++ b/docs/website/docs/examples/chess_production/index.md @@ -110,12 +110,12 @@ def load_data_with_retry(pipeline, data): load_info = pipeline.run(data) logger.info(str(load_info)) - # raise on failed jobs - load_info.raise_on_failed_jobs() - # send notification - send_slack_message( - pipeline.runtime_config.slack_incoming_hook, "Data was successfully loaded!" - ) + # raise on failed jobs + load_info.raise_on_failed_jobs() + # send notification + send_slack_message( + pipeline.runtime_config.slack_incoming_hook, "Data was successfully loaded!" + ) except Exception: # we get here after all the failed retries # send notification diff --git a/docs/website/docs/examples/connector_x_arrow/index.md b/docs/website/docs/examples/connector_x_arrow/index.md index 6702b8bbef..92941e1988 100644 --- a/docs/website/docs/examples/connector_x_arrow/index.md +++ b/docs/website/docs/examples/connector_x_arrow/index.md @@ -56,13 +56,13 @@ def read_sql_x( def genome_resource(): # create genome resource with merge on `upid` primary key genome = dlt.resource( - name="genome", + name="acanthochromis_polyacanthus", write_disposition="merge", - primary_key="upid", + primary_key="analysis_id", standalone=True, )(read_sql_x)( - "mysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", # type: ignore[arg-type] - "SELECT * FROM genome ORDER BY created LIMIT 1000", + "mysql://anonymous@ensembldb.ensembl.org:3306/acanthochromis_polyacanthus_core_100_1", # type: ignore[arg-type] + "SELECT * FROM analysis LIMIT 20", ) # add incremental on created at genome.apply_hints(incremental=dlt.sources.incremental("created")) diff --git a/docs/website/docs/examples/google_sheets/__init__.py b/docs/website/docs/examples/google_sheets/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/website/docs/examples/google_sheets/code/.dlt/config.toml b/docs/website/docs/examples/google_sheets/code/.dlt/config.toml new file mode 100644 index 0000000000..be627e6c11 --- /dev/null +++ b/docs/website/docs/examples/google_sheets/code/.dlt/config.toml @@ -0,0 +1,2 @@ +# @@@DLT_SNIPPET_START example +# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/google_sheets/code/.dlt/example.secrets.toml b/docs/website/docs/examples/google_sheets/code/.dlt/example.secrets.toml new file mode 100644 index 0000000000..cae98dc492 --- /dev/null +++ 
b/docs/website/docs/examples/google_sheets/code/.dlt/example.secrets.toml @@ -0,0 +1,18 @@ +# @@@DLT_SNIPPET_START example +# you can just paste services.json as credentials +[sources.google_sheets] +credentials=''' +{ + "type": "set me up!", + "project_id": "set me up!", + "private_key_id": "set me up!", + "private_key": "set me up!", + "client_email": "set me up!", + "client_id": "set me up!", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + "client_x509_cert_url": "set me up!" + } +''' +# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/google_sheets/code/__init__.py b/docs/website/docs/examples/google_sheets/code/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/website/docs/examples/google_sheets/code/google_sheets-snippets.py b/docs/website/docs/examples/google_sheets/code/google_sheets-snippets.py new file mode 100644 index 0000000000..f56861e9e9 --- /dev/null +++ b/docs/website/docs/examples/google_sheets/code/google_sheets-snippets.py @@ -0,0 +1,88 @@ +from tests.utils import skipifgithubfork + +__source_name__ = "google_sheets" + + +@skipifgithubfork +def google_sheets_snippet() -> None: + # @@@DLT_SNIPPET_START example + # @@@DLT_SNIPPET_START google_sheets + from typing import Any, Iterator, Sequence, Union, cast + + from googleapiclient.discovery import build + + import dlt + from dlt.common.configuration.specs import ( + GcpOAuthCredentials, + GcpServiceAccountCredentials, + ) + from dlt.common.typing import DictStrAny, StrAny + + def _initialize_sheets( + credentials: Union[GcpOAuthCredentials, GcpServiceAccountCredentials] + ) -> Any: + # Build the service object. 
+ service = build("sheets", "v4", credentials=credentials.to_native_credentials()) + return service + + @dlt.source + def google_spreadsheet( + spreadsheet_id: str, + sheet_names: Sequence[str], + credentials: Union[ + GcpServiceAccountCredentials, GcpOAuthCredentials, str, StrAny + ] = dlt.secrets.value, + ) -> Any: + sheets = _initialize_sheets(cast(GcpServiceAccountCredentials, credentials)) + + def get_sheet(sheet_name: str) -> Iterator[DictStrAny]: + # get list of list of typed values + result = ( + sheets.spreadsheets() + .values() + .get( + spreadsheetId=spreadsheet_id, + range=sheet_name, + # unformatted returns typed values + valueRenderOption="UNFORMATTED_VALUE", + # will return formatted dates + dateTimeRenderOption="FORMATTED_STRING", + ) + .execute() + ) + + # pprint.pprint(result) + values = result.get("values") + + # yield dicts assuming row 0 contains headers and following rows values and all rows have identical length + for v in values[1:]: + yield {h: v for h, v in zip(values[0], v)} + + # create resources from supplied sheet names + return [ + dlt.resource(get_sheet(name), name=name, write_disposition="replace") + for name in sheet_names + ] + + # @@@DLT_SNIPPET_END google_sheets + # @@@DLT_SNIPPET_START google_sheets_run + __name__ = "__main__" # @@@DLT_REMOVE + if __name__ == "__main__": + pipeline = dlt.pipeline(destination="duckdb") + # see example.secrets.toml to where to put credentials + sheet_id = "1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580" + range_names = ["hidden_columns_merged_cells", "Blank Columns"] + # "2022-05", "model_metadata" + info = pipeline.run( + google_spreadsheet( + spreadsheet_id=sheet_id, + sheet_names=range_names, + ) + ) + print(info) + # @@@DLT_SNIPPET_END google_sheets_run + # @@@DLT_SNIPPET_END example + row_counts = pipeline.last_trace.last_normalize_info.row_counts + print(row_counts.keys()) + assert row_counts["hidden_columns_merged_cells"] == 7 + assert row_counts["blank_columns"] == 21 diff --git a/docs/website/docs/examples/google_sheets/index.md b/docs/website/docs/examples/google_sheets/index.md new file mode 100644 index 0000000000..4af35f6dac --- /dev/null +++ b/docs/website/docs/examples/google_sheets/index.md @@ -0,0 +1,115 @@ +--- +title: Google Sheets minimal example +description: Learn how work with Google services +keywords: [google sheets, credentials, example] +--- + +import Header from '../_examples-header.md'; + +
+ +## Google Sheets data pipeline + +In this example, you'll find a Python script that demonstrates how to load Google Sheets data using the `dlt` library. + +We'll learn how to: +- use [built-in credentials](../../general-usage/credentials/config_specs#gcp-credentials); +- use [union of credentials](../../general-usage/credentials/config_specs#working-with-alternatives-of-credentials-union-types); +- create [dynamically generated resources](../../general-usage/source#create-resources-dynamically). + +:::tip +This example is for educational purposes. For best practices, we recommend using [Google Sheets verified source](../../dlt-ecosystem/verified-sources/google_sheets.md). +::: + +### Install Google client library + +```shell + pip install google-api-python-client +``` + +### Loading code + + +```py +from typing import Any, Iterator, Sequence, Union, cast + +from googleapiclient.discovery import build + +import dlt +from dlt.common.configuration.specs import ( + GcpOAuthCredentials, + GcpServiceAccountCredentials, +) +from dlt.common.typing import DictStrAny, StrAny + +def _initialize_sheets( + credentials: Union[GcpOAuthCredentials, GcpServiceAccountCredentials] +) -> Any: + # Build the service object. + service = build("sheets", "v4", credentials=credentials.to_native_credentials()) + return service + +@dlt.source +def google_spreadsheet( + spreadsheet_id: str, + sheet_names: Sequence[str], + credentials: Union[ + GcpServiceAccountCredentials, GcpOAuthCredentials, str, StrAny + ] = dlt.secrets.value, +) -> Any: + sheets = _initialize_sheets(cast(GcpServiceAccountCredentials, credentials)) + + def get_sheet(sheet_name: str) -> Iterator[DictStrAny]: + # get list of list of typed values + result = ( + sheets.spreadsheets() + .values() + .get( + spreadsheetId=spreadsheet_id, + range=sheet_name, + # unformatted returns typed values + valueRenderOption="UNFORMATTED_VALUE", + # will return formatted dates + dateTimeRenderOption="FORMATTED_STRING", + ) + .execute() + ) + + # pprint.pprint(result) + values = result.get("values") + + # yield dicts assuming row 0 contains headers and following rows values and all rows have identical length + for v in values[1:]: + yield {h: v for h, v in zip(values[0], v)} + + # create resources from supplied sheet names + return [ + dlt.resource(get_sheet(name), name=name, write_disposition="replace") + for name in sheet_names + ] +``` + + +### Run the pipeline + + +```py +if __name__ == "__main__": + pipeline = dlt.pipeline(destination="duckdb") + # see example.secrets.toml to where to put credentials + sheet_id = "1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580" + range_names = ["hidden_columns_merged_cells", "Blank Columns"] + # "2022-05", "model_metadata" + info = pipeline.run( + google_spreadsheet( + spreadsheet_id=sheet_id, + sheet_names=range_names, + ) + ) + print(info) +``` + diff --git a/docs/website/docs/examples/pdf_to_weaviate/__init__.py b/docs/website/docs/examples/pdf_to_weaviate/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/website/docs/examples/pdf_to_weaviate/code/__init__.py b/docs/website/docs/examples/pdf_to_weaviate/code/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/website/docs/assets/invoices/invoice_2.txt b/docs/website/docs/examples/pdf_to_weaviate/code/assets/invoices/invoice_2.txt similarity index 100% rename from docs/website/docs/assets/invoices/invoice_2.txt rename to docs/website/docs/examples/pdf_to_weaviate/code/assets/invoices/invoice_2.txt diff --git 
a/docs/website/docs/assets/invoices/invoice_20230831-p1.pdf b/docs/website/docs/examples/pdf_to_weaviate/code/assets/invoices/invoice_20230831-p1.pdf similarity index 100% rename from docs/website/docs/assets/invoices/invoice_20230831-p1.pdf rename to docs/website/docs/examples/pdf_to_weaviate/code/assets/invoices/invoice_20230831-p1.pdf diff --git a/docs/website/docs/examples/pdf_to_weaviate/code/pdf_to_weaviate-snippets.py b/docs/website/docs/examples/pdf_to_weaviate/code/pdf_to_weaviate-snippets.py new file mode 100644 index 0000000000..1ad7cc8159 --- /dev/null +++ b/docs/website/docs/examples/pdf_to_weaviate/code/pdf_to_weaviate-snippets.py @@ -0,0 +1,65 @@ +from tests.pipeline.utils import assert_load_info + + +def pdf_to_weaviate_snippet() -> None: + # @@@DLT_SNIPPET_START example + # @@@DLT_SNIPPET_START pdf_to_weaviate + import os + + import dlt + from dlt.destinations.impl.weaviate import weaviate_adapter + from PyPDF2 import PdfReader + + @dlt.resource(selected=False) + def list_files(folder_path: str): + folder_path = os.path.abspath(folder_path) + for filename in os.listdir(folder_path): + file_path = os.path.join(folder_path, filename) + yield { + "file_name": filename, + "file_path": file_path, + "mtime": os.path.getmtime(file_path), + } + + @dlt.transformer(primary_key="page_id", write_disposition="merge") + def pdf_to_text(file_item, separate_pages: bool = False): + if not separate_pages: + raise NotImplementedError() + # extract data from PDF page by page + reader = PdfReader(file_item["file_path"]) + for page_no in range(len(reader.pages)): + # add page content to file item + page_item = dict(file_item) + page_item["text"] = reader.pages[page_no].extract_text() + page_item["page_id"] = file_item["file_name"] + "_" + str(page_no) + yield page_item + + pipeline = dlt.pipeline(pipeline_name="pdf_to_text", destination="weaviate") + + # this constructs a simple pipeline that: (1) reads files from "invoices" folder (2) filters only those ending with ".pdf" + # (3) sends them to pdf_to_text transformer with pipe (|) operator + pdf_pipeline = list_files("assets/invoices").add_filter( + lambda item: item["file_name"].endswith(".pdf") + ) | pdf_to_text(separate_pages=True) + + # set the name of the destination table to receive pages + # NOTE: Weaviate, dlt's tables are mapped to classes + pdf_pipeline.table_name = "InvoiceText" + + # use weaviate_adapter to tell destination to vectorize "text" column + load_info = pipeline.run(weaviate_adapter(pdf_pipeline, vectorize="text")) + row_counts = pipeline.last_trace.last_normalize_info + print(row_counts) + print("------") + print(load_info) + # @@@DLT_SNIPPET_END pdf_to_weaviate + + # @@@DLT_SNIPPET_START pdf_to_weaviate_read + import weaviate + + client = weaviate.Client("http://localhost:8080") + # get text of all the invoices in InvoiceText class we just created above + print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do()) + # @@@DLT_SNIPPET_END pdf_to_weaviate_read + # @@@DLT_SNIPPET_END example + assert_load_info(load_info) diff --git a/docs/website/docs/examples/pdf_to_weaviate/index.md b/docs/website/docs/examples/pdf_to_weaviate/index.md new file mode 100644 index 0000000000..cc2ef01e33 --- /dev/null +++ b/docs/website/docs/examples/pdf_to_weaviate/index.md @@ -0,0 +1,113 @@ +--- +title: Load PDFs to Weaviate +description: Extract text from PDF and load it into a vector database +keywords: [pdf, weaviate, vector store, vector database, ] +--- + +import Header from '../_examples-header.md'; + +
+
+Additionally, we'll use PyPDF2 to extract text from PDFs. Make sure you have it installed:
+
+```shell
+pip install PyPDF2
+```
+
+## Example code
+
+
+```py
+import os
+
+import dlt
+from dlt.destinations.impl.weaviate import weaviate_adapter
+from PyPDF2 import PdfReader
+
+@dlt.resource(selected=False)
+def list_files(folder_path: str):
+    folder_path = os.path.abspath(folder_path)
+    for filename in os.listdir(folder_path):
+        file_path = os.path.join(folder_path, filename)
+        yield {
+            "file_name": filename,
+            "file_path": file_path,
+            "mtime": os.path.getmtime(file_path),
+        }
+
+@dlt.transformer(primary_key="page_id", write_disposition="merge")
+def pdf_to_text(file_item, separate_pages: bool = False):
+    if not separate_pages:
+        raise NotImplementedError()
+    # extract data from PDF page by page
+    reader = PdfReader(file_item["file_path"])
+    for page_no in range(len(reader.pages)):
+        # add page content to file item
+        page_item = dict(file_item)
+        page_item["text"] = reader.pages[page_no].extract_text()
+        page_item["page_id"] = file_item["file_name"] + "_" + str(page_no)
+        yield page_item
+
+pipeline = dlt.pipeline(pipeline_name="pdf_to_text", destination="weaviate")
+
+# this constructs a simple pipeline that: (1) reads files from "invoices" folder (2) filters only those ending with ".pdf"
+# (3) sends them to pdf_to_text transformer with pipe (|) operator
+pdf_pipeline = list_files("assets/invoices").add_filter(
+    lambda item: item["file_name"].endswith(".pdf")
+) | pdf_to_text(separate_pages=True)
+
+# set the name of the destination table to receive pages
+# NOTE: in Weaviate, dlt's tables are mapped to classes
+pdf_pipeline.table_name = "InvoiceText"
+
+# use weaviate_adapter to tell destination to vectorize "text" column
+load_info = pipeline.run(weaviate_adapter(pdf_pipeline, vectorize="text"))
+row_counts = pipeline.last_trace.last_normalize_info
+print(row_counts)
+print("------")
+print(load_info)
+```
+
+
+We start with a simple resource that lists files in a specified folder. To that we add a **filter** function that removes all files that are not PDFs.
+
+To parse PDFs, we use [PyPDF](https://pypdf2.readthedocs.io/en/3.0.0/user/extract-text.html) and return each page from a given PDF as a separate data item.
+
+Parsing happens in the `@dlt.transformer`, which receives data from the `list_files` resource. It splits each PDF into pages, extracts the text, and yields the pages separately,
+so each PDF corresponds to many items in the Weaviate `InvoiceText` class. We set the primary key and use the merge write disposition so that if the same PDF arrives twice,
+we just update the vectors instead of duplicating them.
+
+Note how we pipe data from the `list_files` resource into `pdf_to_text` using the **|** operator (the resource is deselected so we do not load raw file items to the destination).
+
+Just before loading, `weaviate_adapter` is used to tell the `weaviate` destination which fields to vectorize.
+
+Now it is time to query our documents.
+
+
+```py
+import weaviate
+
+client = weaviate.Client("http://localhost:8080")
+# get text of all the invoices in InvoiceText class we just created above
+print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do())
+```
+
+
+Above, we provide the URL of a local cluster. We also use `contextionary` to vectorize the data. You can find information on our setup in the links below.
+
+:::tip
+
+Change the destination to `duckdb` if you do not have access to a Weaviate cluster or are not able to run it locally.
+ +::: + +Learn more: + +- [Setup Weaviate destination - local or cluster](dlt-ecosystem/destinations/weaviate.md). +- [Connect the transformers to the resources](general-usage/resource#feeding-data-from-one-resource-into-another) +to load additional data or enrich it. +- [Transform your data before loading](general-usage/resource#customize-resources) and see some + [examples of customizations like column renames and anonymization](general-usage/customising-pipelines/renaming_columns). diff --git a/docs/website/docs/examples/transformers/index.md b/docs/website/docs/examples/transformers/index.md index 7ed8fd29c3..860e830aae 100644 --- a/docs/website/docs/examples/transformers/index.md +++ b/docs/website/docs/examples/transformers/index.md @@ -9,7 +9,7 @@ import Header from '../_examples-header.md';
diff --git a/docs/website/docs/general-usage/customising-pipelines/removing_columns.md b/docs/website/docs/general-usage/customising-pipelines/removing_columns.md new file mode 100644 index 0000000000..8493ffaec5 --- /dev/null +++ b/docs/website/docs/general-usage/customising-pipelines/removing_columns.md @@ -0,0 +1,91 @@ +--- +title: Removing columns +description: Removing columns by passing list of column names +keywords: [deleting, removing, columns, drop] +--- + +# Removing columns + +Removing columns before loading data into a database is a reliable method to eliminate sensitive or +unnecessary fields. For example, in the given scenario, a source is created with a "country_id" column, +which is then excluded from the database before loading. + +Let's create a sample pipeline demonstrating the process of removing a column. + +1. Create a source function that creates dummy data as follows: + + ```python + import dlt + + # This function creates a dummy data source. + @dlt.source + def dummy_source(): + @dlt.resource(write_disposition="replace") + def dummy_data(): + for i in range(3): + yield {"id": i, "name": f"Jane Washington {i}", "country_code": 40 + i} + + return dummy_data() + ``` + This function creates three columns `id`, `name` and `country_code`. + +1. Next, create a function to filter out columns from the data before loading it into a database as follows: + + ```python + from typing import Dict, List, Optional + + def remove_columns(doc: Dict, remove_columns: Optional[List[str]] = None) -> Dict: + if remove_columns is None: + remove_columns = [] + + # Iterating over the list of columns to be removed + for column_name in remove_columns: + # Removing the column if it exists in the document + if column_name in doc: + del doc[column_name] + + return doc + ``` + + `doc`: The document (dict) from which columns will be removed. + + `remove_columns`: List of column names to be removed, defaults to None. + +1. Next, declare the columns to be removed from the table, and then modify the source as follows: + + ```python + # Example columns to remove: + remove_columns_list = ["country_code"] + + # Create an instance of the source so you can edit it. + data_source = dummy_source() + + # Modify this source instance's resource + data_source = data_source.dummy_data.add_map( + lambda doc: remove_columns(doc, remove_columns_list) + ) + ``` +1. You can optionally inspect the result: + + ```python + for row in data_source: + print(row) + #{'id': 0, 'name': 'Jane Washington 0'} + #{'id': 1, 'name': 'Jane Washington 1'} + #{'id': 2, 'name': 'Jane Washington 2'} + ``` + +1. At last, create a pipeline: + + ```python + # Integrating with a DLT pipeline + pipeline = dlt.pipeline( + pipeline_name='example', + destination='bigquery', + dataset_name='filtered_data' + ) + # Run the pipeline with the transformed source + load_info = pipeline.run(data_source) + print(load_info) + ``` + diff --git a/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md new file mode 100644 index 0000000000..6b09510f68 --- /dev/null +++ b/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md @@ -0,0 +1,266 @@ +--- +title: Currency-conversion data enrichment +description: Converting the monetary value in one currency to another using the latest market rates. 
+keywords: [data enrichment, currency conversion, latest market rates]
+---
+
+# Data enrichment part two: Currency conversion data enrichment
+
+Currency conversion data enrichment means adding additional information to currency-related data.
+Often, you have a dataset of monetary values in one currency. For various reasons such as reporting,
+analysis, or global operations, it may be necessary to convert these amounts into different currencies.
+
+## Currency conversion process
+
+Here is the step-by-step process for currency conversion data enrichment:
+
+1. Define base and target currencies, e.g., USD (base) to EUR (target).
+1. Obtain current exchange rates from a reliable source like a financial data API.
+1. Convert the monetary values at the obtained exchange rates.
+1. Include metadata like conversion rate, date, and time.
+1. Save the updated dataset in a data warehouse or lake using a data pipeline.
+
+We use the [ExchangeRate-API](https://app.exchangerate-api.com/) to fetch the latest currency
+conversion rates. However, you can use any service you prefer.
+
+:::note
+The ExchangeRate-API free tier offers 1500 free calls monthly. For production, consider
+upgrading to a higher plan.
+:::
+
+## Creating data enrichment pipeline
+
+You can either follow the example in the linked Colab notebook or follow this documentation to
+create the currency conversion data enrichment pipeline.
+
+### A. Colab notebook
+
+The Colab notebook combines three data enrichment processes for a sample dataset; its second part
+contains "Data enrichment part two: Currency conversion data enrichment".
+
+Here's the link to the notebook:
+**[Colab Notebook](https://colab.research.google.com/drive/1ZKEkf1LRSld7CWQFS36fUXjhJKPAon7P?usp=sharing).**
+
+### B. Create a pipeline
+
+Alternatively, to create a data enrichment pipeline, you can start by creating the following
+directory structure:
+
+```text
+currency_conversion_enrichment/
+├── .dlt/
+│   └── secrets.toml
+└── currency_enrichment_pipeline.py
+```
+
+### 1. Creating resource
+
+`dlt` works on the principle of [sources](../../general-usage/source.md) and
+[resources.](../../general-usage/resource.md)
+
+1. The last part of our data enrichment ([part one](../../general-usage/data-enrichments/user_agent_device_data_enrichment.md))
+   involved enriching the data with user-agent device data. This included adding two new columns to the dataset as follows:
+
+   - `device_price_usd`: average price of the device in USD.
+
+   - `price_updated_at`: time at which the price was updated.
+
+1. The columns initially present prior to the data enrichment were:
+
+   - `user_id`: Web trackers typically assign a unique ID to users for tracking their journeys and
+     interactions over time.
+
+   - `device_name`: User device information helps in understanding the user base's devices.
+
+   - `page_referer`: The referer URL is tracked to analyze traffic sources and user navigation
+     behavior.
+
+1. Here's the resource that yields the sample data as discussed above:
+
+   ```python
+   @dlt.resource()
+   def enriched_data_part_two():
+       data_enrichment_part_one = [
+           {
+               "user_id": 1,
+               "device_name": "Sony Experia XZ",
+               "page_referer": "https://b2venture.lightning.force.com/",
+               "device_price_usd": 313.01,
+               "price_updated_at": "2024-01-15 04:08:45.088499+00:00"
+           },
+       ]
+       """
+       Similar data for the other users.
+       """
+       for user_data in data_enrichment_part_one:
+           yield user_data
+   ```
+
+   > `data_enrichment_part_one` holds the enriched data from part one.
It can also be directly used + > in part two as demonstrated in + > **[Colab Notebook](https://colab.research.google.com/drive/1ZKEkf1LRSld7CWQFS36fUXjhJKPAon7P?usp=sharing).** + +### 2. Create `converted_amount` function + +This function retrieves conversion rates for currency pairs that either haven't been fetched before +or were last updated more than 24 hours ago from the ExchangeRate-API, using information stored in +the `dlt` [state](../../general-usage/state.md). + +The first step is to register on [ExhangeRate-API](https://app.exchangerate-api.com/) and obtain the +API token. + +1. In the `.dlt`folder, there's a file called `secrets.toml`. It's where you store sensitive + information securely, like access tokens. Keep this file safe. Here's its format for service + account authentication: + + ```python + [sources] + api_key= "Please set me up!" #ExchangeRate-API key + ``` + +1. Create the `converted_amount` function as follows: + + ```python + # @transformer(data_from=enriched_data_part_two) + def converted_amount(record): + """ + Converts an amount from base currency to target currency using the latest exchange rate. + + This function retrieves the current exchange rate from an external API and + applies it to the specified amount in the record. It handles updates to the exchange rate + if the current rate is over 12 hours old. + + Args: + record (dict): A dictionary containing the 'amount' key with the value to be converted. + + Yields: + dict: A dictionary containing the original amount in USD, converted amount in EUR, + the exchange rate, and the last update time of the rate. + + Note: + The base currency (USD) and target currency (EUR) are hard coded in this function, + but that can be changed. + The API key is retrieved from the `dlt` secrets. + """ + # Hardcoded base and target currencies + base_currency = "USD" + target_currency = "EUR" + + # Retrieve the API key from DLT secrets + api_key = dlt.secrets.get("sources.api_key") + + # Initialize or retrieve the state for currency rates + rates_state = dlt.current.resource_state().setdefault("rates", {}) + currency_pair_key = f"{base_currency}-{target_currency}" + currency_pair_state = rates_state.setdefault(currency_pair_key, { + "last_update": datetime.min, + "rate": None + }) + + # Update the exchange rate if it's older than 12 hours + if (currency_pair_state.get("rate") is None or + (datetime.utcnow() - currency_pair_state["last_update"] >= timedelta(hours=12))): + url = f"https://v6.exchangerate-api.com/v6/{api_key}/pair/{base_currency}/{target_currency}" + response = requests.get(url) + if response.status_code == 200: + data = response.json() + currency_pair_state.update({ + "rate": data.get("conversion_rate"), + "last_update": datetime.fromtimestamp(data.get("time_last_update_unix")) + }) + print(f"The latest rate of {data.get('conversion_rate')} for the currency pair {currency_pair_key} is fetched and updated.") + else: + raise Exception(f"Error fetching the exchange rate: HTTP {response.status_code}") + + # Convert the amount using the retrieved or stored exchange rate + amount = record['device_price_usd'] # Assuming the key is 'amount' as per the function documentation + rate = currency_pair_state["rate"] + yield { + "actual_amount": amount, + "base_currency": base_currency, + "converted_amount": round(amount * rate, 2), + "target_currency": target_currency, + "rate": rate, + "rate_last_updated": currency_pair_state["last_update"], + } + ``` +1. 
Next, follow the instructions in + [Destinations](../../dlt-ecosystem/destinations/duckdb.md) to add credentials for + your chosen destination. This will ensure that your data is properly routed to its final + destination. + +### 3. Create your pipeline + +1. In creating the pipeline, the `converted_amount` can be used in the following ways: + + - Add map function + - Transformer function + + The `dlt` library's `transformer` and `add_map` functions serve distinct purposes in data + processing. + + `Transformers` are a form of `dlt resource` that takes input from other resources + via `data_from` argument to enrich or transform the data. + [Click here.](../../general-usage/resource.md#process-resources-with-dlttransformer) + + Conversely, `add_map` used to customize a resource applies transformations at an item level + within a resource. It's useful for tasks like anonymizing individual data records. More on this + can be found under [Customize resources](../../general-usage/resource.md#customize-resources) in the + documentation. + +1. Here, we create the pipeline and use the `add_map` functionality: + + ```python + # Create the pipeline + pipeline = dlt.pipeline( + pipeline_name="data_enrichment_two", + destination="duckdb", + dataset_name="currency_conversion_enrichment", + ) + + # Run the pipeline with the transformed source + load_info = pipeline.run(enriched_data_part_two.add_map(converted_amount)) + + print(load_info) + ``` + + :::info + Please note that the same outcome can be achieved by using the `@dlt.transformer` decorator function. + To do so, you need to add the transformer decorator at the top of the `converted_amount` function. + For `pipeline.run`, you can use the following code: + + ```python + # using fetch_average_price as a transformer function + load_info = pipeline.run( + enriched_data_part_two | converted_amount, + table_name="data_enrichment_part_two" + ) + ``` + + This will execute the `converted_amount` function with the data enriched in part one and return + the converted currencies. + ::: + +### Run the pipeline + +1. Install necessary dependencies for the preferred + [destination](../../dlt-ecosystem/destinations/), For example, duckdb: + + ``` + pip install dlt[duckdb] + ``` + +1. Run the pipeline with the following command: + + ``` + python currency_enrichment_pipeline.py + ``` + +1. To ensure that everything loads as expected, use the command: + + ``` + dlt pipeline show + ``` + + For example, the "pipeline_name" for the above pipeline example is `data_enrichment_two`; you can + use any custom name instead. diff --git a/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md b/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md new file mode 100644 index 0000000000..f4578d065f --- /dev/null +++ b/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md @@ -0,0 +1,250 @@ +--- +title: URL-parser data enrichment +description: Enriching the url with various parameters. +keywords: [data enrichment, url parser, referer data enrichment] +--- + +# Data enrichment part three: URL parser data enrichment + +URL parser data enrichment is extracting various URL components to gain additional insights and +context about the URL. This extracted information can be used for data analysis, marketing, SEO, and +more. + +## URL parsing process + +Here is step-by-step process for URL parser data enrichment : + +1. Get the URL data that is needed to be parsed from a source or create one. +1. 
Send the URL data to an API like [URL Parser API](https://urlparse.com/). +1. Get the parsed URL data. +1. Include metadata like conversion rate, date, and time. +1. Save the updated dataset in a data warehouse or lake using a data pipeline. + +We use **[URL Parse API](https://urlparse.com/)** to extract the information about the URL. However, +you can use any API you prefer. + +:::tip +`URL Parse API` is free, with 1000 requests/hour limit, which can be increased on request. +::: + +By default the URL Parse API will return a JSON response like: + +```text +{ + "authority": "urlparse.com", + "domain": "urlparse.com", + "domain_label": "urlparse", + "file": "/", + "fragment": null, + "host": "urlparse.com", + "href": "https://urlparse.com/", + "is_valid": true, + "origin": "https://urlparse.com", + "params": null, + "path": "/", + "port": null, + "query": null, + "request_url": "https://urlparse.com", + "scheme": "https", + "subdomains": null, + "tld": "com" +} +``` + +## Creating data enrichment pipeline + +You can either follow the example in the linked Colab notebook or follow this documentation to +create the URL-parser data enrichment pipeline. + +### A. Colab notebook + +This Colab notebook outlines a three-part data enrichment process for a sample dataset: + +- User-agent device data enrichment +- Currency conversion data enrichment +- URL-parser data enrichment + +This document focuses on the URL-Parser Data Enrichment (Part Three). For a comprehensive +understanding, you may explore all three enrichments sequentially in the notebook: +[Colab Notebook](https://colab.research.google.com/drive/1ZKEkf1LRSld7CWQFS36fUXjhJKPAon7P?usp=sharing). + +### B. Create a pipeline + +Alternatively, to create a data enrichment pipeline, you can start by creating the following +directory structure: + +```python +url_parser_enrichment/ +├── .dlt/ +│ └── secrets.toml +└── url_enrichment_pipeline.py +``` + +### 1. Creating resource + +`dlt` works on the principle of [sources](../../general-usage/source.md) and +[resources.](../../general-usage/resource.md) + +This data resource yields data typical of what many web analytics and tracking tools can collect. +However, the specifics of what data is collected and how it's used can vary significantly among +different tracking services. + +Let's examine a synthetic dataset created for this article. It includes: + +- `user_id`: Web trackers typically assign unique ID to users for tracking their journeys and + interactions over time. + +- `device_name`: User device information helps in understanding the user base's device. + +- `page_refer`: The referer URL is tracked to analyze traffic sources and user navigation behavior. + +Here's the resource that yields the sample data as discussed above: + +```python + import dlt + + @dlt.resource(write_disposition="append") + def tracked_data(): + """ + A generator function that yields a series of dictionaries, each representing + user tracking data. + + This function is decorated with `dlt.resource` to integrate into the DLT (Data + Loading Tool) pipeline. The `write_disposition` parameter is set to "append" to + ensure that data from this generator is appended to the existing data in the + destination table. + + Yields: + dict: A dictionary with keys 'user_id', 'device_name', and 'page_referer', + representing the user's tracking data including their device and the page + they were referred from. 
+ """ + + # Sample data representing tracked user data + sample_data = [ + { + "user_id": 1, + "device_name": "Sony Experia XZ", + "page_referer": "https://b2venture.lightning.force.com/" + }, + """ + Data for other users + """ + ] + + # Yielding each user's data as a dictionary + for user_data in sample_data: + yield user_data +``` + +### 2. Create `url_parser` function + +We use a free service called [URL Parse API](https://urlparse.com/), to parse the urls. You don’t +need to register to use this service neither get an API key. + +1. Create a `url_parser` function as follows: + ```python + # @dlt.transformer(data_from=tracked_data) + def url_parser(record): + """ + Send a URL to a parsing service and return the parsed data. + + This function sends a URL to a specified API endpoint for URL parsing. + + Parameters: + url (str): The URL to be parsed. + + Returns: + dict: Parsed URL data in JSON format if the request is successful. + None: If the request fails (e.g., an invalid URL or server error). + """ + # Define the API endpoint URL for the URL parsing service + api_url = "https://api.urlparse.com/v1/query" + url = record['page_referer'] + # Send a POST request to the API with the URL to be parsed + response = requests.post(api_url, json={"url": url}) + + # Check if the response from the API is successful (HTTP status code 200) + if response.status_code == 200: + # If successful, return the parsed data in JSON format + return response.json() + else: + # If the request failed, print an error message with the status code and return None + print(f"Request for {url} failed with status code: {response.status_code}") + return None + ``` + +### 3. Create your pipeline + +1. In creating the pipeline, the `url_parser` can be used in the following ways: + + - Add map function + - Transformer function + + The `dlt` library's `transformer` and `add_map` functions serve distinct purposes in data + processing. + + `Transformers` are a form of `dlt resource` that takes input from other resources + via `data_from` argument to enrich or transform the data. + [Click here.](../../general-usage/resource.md#process-resources-with-dlttransformer) + + Conversely, `add_map` used to customize a resource applies transformations at an item level + within a resource. It's useful for tasks like anonymizing individual data records. More on this + can be found under [Customize resources](../../general-usage/resource.md#customize-resources) in + the documentation. + +1. Here, we create the pipeline and use the `add_map` functionality: + + ```python + # Create the pipeline + pipeline = dlt.pipeline( + pipeline_name="data_enrichment_three", + destination="duckdb", + dataset_name="user_device_enrichment", + ) + + # Run the pipeline with the transformed source + load_info = pipeline.run(tracked_data.add_map(url_parser)) + + print(load_info) + ``` + + :::info + Please note that the same outcome can be achieved by using the transformer function. To + do so, you need to add the transformer decorator at the top of the `url_parser` function. For + `pipeline.run`, you can use the following code: + + ```python + # using fetch_average_price as a transformer function + load_info = pipeline.run( + tracked_data | url_parser, + table_name="url_parser" + ) + ``` + + This will execute the `url_parser` function with the tracked data and return parsed URL. + ::: + +### Run the pipeline + +1. 
Install necessary dependencies for the preferred
+   [destination](https://dlthub.com/docs/dlt-ecosystem/destinations/). For example, duckdb:
+
+   ```
+   pip install dlt[duckdb]
+   ```
+
+1. Run the pipeline with the following command:
+
+   ```
+   python url_enrichment_pipeline.py
+   ```
+
+1. To ensure that everything loads as expected, use the command:
+
+   ```
+   dlt pipeline <pipeline_name> show
+   ```
+
+   For example, the "pipeline_name" for the above pipeline example is `data_enrichment_three`; you
+   can use any custom name instead.
diff --git a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md
new file mode 100644
index 0000000000..8b33a852a8
--- /dev/null
+++ b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md
@@ -0,0 +1,305 @@
+---
+title: User-agent device data enrichment
+description: Enriching the user-agent device data with average device price.
+keywords: [data enrichment, user-agent data, device enrichment]
+---
+
+# Data enrichment part one: User-agent device data enrichment
+
+Data enrichment enhances raw data with valuable information from multiple sources, increasing its
+analytical and decision-making value.
+
+This part covers enriching sample data with device price. Understanding the price segment
+of the device that the user used to access your service can be helpful in personalized marketing,
+customer segmentation, and more.
+
+This documentation will discuss how to enrich the user device information with the average market
+price.
+
+## Setup Guide
+
+We use SerpAPI to retrieve device prices using Google Shopping, but alternative services or APIs are
+viable.
+
+:::note
+SerpAPI free tier offers 100 free calls monthly. For production, consider upgrading to a higher
+plan.
+:::
+
+
+## Creating data enrichment pipeline
+You can either follow the example in the linked Colab notebook or follow this documentation to
+create the user-agent device data enrichment pipeline.
+
+### A. Colab notebook
+The Colab notebook combines three data enrichment processes for a sample dataset, starting with "Data
+enrichment part one: User-agent device data".
+
+Here's the link to the notebook:
+**[Colab Notebook](https://colab.research.google.com/drive/1ZKEkf1LRSld7CWQFS36fUXjhJKPAon7P?usp=sharing).**
+
+### B. Create a pipeline
+Alternatively, to create a data enrichment pipeline, you can start by creating the following directory structure:
+
+```text
+user_device_enrichment/
+├── .dlt/
+│   └── secrets.toml
+└── device_enrichment_pipeline.py
+```
+### 1. Creating resource
+
+   `dlt` works on the principle of [sources](https://dlthub.com/docs/general-usage/source)
+   and [resources.](https://dlthub.com/docs/general-usage/resource)
+
+   This data resource yields data typical of what many web analytics and
+   tracking tools can collect. However, the specifics of what data is collected
+   and how it's used can vary significantly among different tracking services.
+
+   Let's examine a synthetic dataset created for this article. It includes:
+
+   `user_id`: Web trackers typically assign a unique ID to users for
+   tracking their journeys and interactions over time.
+
+   `device_name`: User device information helps in understanding the user base's device.
+
+   `page_referer`: The referer URL is tracked to analyze traffic sources and user navigation behavior.
+ + Here's the resource that yields the sample data as discussed above: + + ```python + import dlt + + @dlt.resource(write_disposition="append") + def tracked_data(): + """ + A generator function that yields a series of dictionaries, each representing + user tracking data. + + This function is decorated with `dlt.resource` to integrate into the DLT (Data + Loading Tool) pipeline. The `write_disposition` parameter is set to "append" to + ensure that data from this generator is appended to the existing data in the + destination table. + + Yields: + dict: A dictionary with keys 'user_id', 'device_name', and 'page_referer', + representing the user's tracking data including their device and the page + they were referred from. + """ + + # Sample data representing tracked user data + sample_data = [ + {"user_id": 1, "device_name": "Sony Experia XZ", "page_referer": + "https://b2venture.lightning.force.com/"}, + {"user_id": 2, "device_name": "Samsung Galaxy S23 Ultra 5G", + "page_referer": "https://techcrunch.com/2023/07/20/can-dlthub-solve-the-python-library-problem-for-ai-dig-ventures-thinks-so/"}, + {"user_id": 3, "device_name": "Apple iPhone 14 Pro Max", + "page_referer": "https://dlthub.com/success-stories/freelancers-perspective/"}, + {"user_id": 4, "device_name": "OnePlus 11R", + "page_referer": "https://www.reddit.com/r/dataengineering/comments/173kp9o/ideas_for_data_validation_on_data_ingestion/"}, + {"user_id": 5, "device_name": "Google Pixel 7 Pro", "page_referer": "https://pypi.org/"}, + ] + + # Yielding each user's data as a dictionary + for user_data in sample_data: + yield user_data + ``` + +### 2. Create `fetch_average_price` function + +This particular function retrieves the average price of a device by utilizing SerpAPI and Google +shopping listings. To filter the data, the function uses `dlt` state, and only fetches prices +from SerpAPI for devices that have not been updated in the most recent run or for those that were +loaded more than 180 days in the past. + +The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the API token key. + +1. In the `.dlt`folder, there's a file called `secrets.toml`. It's where you store sensitive + information securely, like access tokens. Keep this file safe. Here's its format for service + account authentication: + + ```python + [sources] + api_key= "Please set me up!" #Serp Api key. + ``` + +1. Replace the value of the `api_key`. + +1. Create `fetch_average_price()` function as follows: + ```python + import datetime + import requests + + # Uncomment transformer function if it is to be used as a transformer, + # otherwise, it is being used with the `add_map` functionality. + + # @dlt.transformer(data_from=tracked_data) + def fetch_average_price(user_tracked_data): + """ + Fetches the average price of a device from an external API and + updates the user_data dictionary. + + This function retrieves the average price of a device specified in the + user_data dictionary by making an API request. The price data is cached + in the device_info state to reduce API calls. If the data for the device + is older than 180 days, a new API request is made. + + Args: + user_tracked_data (dict): A dictionary containing user data, including + the device name. + + Returns: + dict: The updated user_data dictionary with added device price and + updated timestamp. 
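+
+        Note:
+            Price lookups are cached in the dlt resource state under the "devices" key,
+            so SerpAPI is called again for a given device only when its cached entry is
+            missing or older than 180 days.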
+ """ + + # Retrieve the API key from dlt secrets + api_key = dlt.secrets.get("sources.api_key") + + # Get the current resource state for device information + device_info = dlt.current.resource_state().setdefault("devices", {}) + + # Current timestamp for checking the last update + current_timestamp = datetime.datetime.now() + + # Print the current device information + # print(device_info) # if you need to check state + + # Extract the device name from user data + device = user_tracked_data['device_name'] + device_data = device_info.get(device, {}) + + # Calculate the time since the last update + last_updated = ( + current_timestamp - + device_data.get('timestamp', datetime.datetime.min) + ) + # Check if the device is not in state or data is older than 180 days + if device not in device_info or last_updated > datetime.timedelta(days=180): + try: + # Make an API request to fetch device prices + response = requests.get("https://serpapi.com/search", params={ + "engine": "google_shopping", "q": device, + "api_key": api_key, "num": 10 + }) + except requests.RequestException as e: + print(f"Request failed: {e}") + return None + + if response.status_code != 200: + print(f"Failed to retrieve data: {response.status_code}") + return None + + # Process the response to extract prices + results = response.json().get("shopping_results", []) + prices = [] + for r in results: + if r.get("price"): + # Split the price string and convert each part to float + price = r.get("price") + price_parts = price.replace('$', '').replace(',', '').split() + for part in price_parts: + try: + prices.append(float(part)) + except ValueError: + pass # Ignore parts that can't be converted to float + + # Calculate the average price and update the device_info + device_price = round(sum(prices) / len(prices), 2) if prices else None + device_info[device] = { + 'timestamp': current_timestamp, + 'price': device_price + } + + # Add the device price and timestamp to the user data + user_tracked_data['device_price_USD'] = device_price + user_tracked_data['price_updated_at'] = current_timestamp + + else: + # Use cached price data if available and not outdated + user_tracked_data['device_price_USD'] = device_data.get('price') + user_tracked_data['price_updated_at'] = device_data.get('timestamp') + + return user_tracked_data + ``` + +### 3. Create your pipeline + +1. In creating the pipeline, the `fetch_average_price` can be used in the following ways: + - Add map function + - Transformer function + + + The `dlt` library's `transformer` and `add_map` functions serve distinct purposes in data + processing. + + `Transformers` used to process a resource and are ideal for post-load data transformations in a + pipeline, compatible with tools like `dbt`, the `dlt SQL client`, or Pandas for intricate data + manipulation. To read more: + [Click here.](../../general-usage/resource#process-resources-with-dlttransformer) + + Conversely, `add_map` used to customize a resource applies transformations at an item level + within a resource. It's useful for tasks like anonymizing individual data records. More on this + can be found under + [Customize resources](../../general-usage/resource#customize-resources) in the + documentation. + + +1. 
Here, we create the pipeline and use the `add_map` functionality: + + ```python + # Create the pipeline + pipeline = dlt.pipeline( + pipeline_name="data_enrichment_one", + destination="duckdb", + dataset_name="user_device_enrichment", + ) + + # Run the pipeline with the transformed source + load_info = pipeline.run(tracked_data.add_map(fetch_average_price)) + + print(load_info) + ``` + + :::info + Please note that the same outcome can be achieved by using the transformer function. To + do so, you need to add the transformer decorator at the top of the `fetch_average_price` function. + For `pipeline.run`, you can use the following code: + + ```python + # using fetch_average_price as a transformer function + load_info = pipeline.run( + tracked_data | fetch_average_price, + table_name="tracked_data" + ) + ``` + + This will execute the `fetch_average_price` function with the tracked data and return the average + price. + ::: + +### Run the pipeline + +1. Install necessary dependencies for the preferred + [destination](https://dlthub.com/docs/dlt-ecosystem/destinations/), For example, duckdb: + + ``` + pip install dlt[duckdb] + ``` + +1. Run the pipeline with the following command: + + ``` + python device_enrichment_pipeline.py + ``` + +1. To ensure that everything loads as expected, use the command: + + ``` + dlt pipeline show + ``` + + For example, the "pipeline_name" for the above pipeline example is `data_enrichment_one`; you can use + any custom name instead. + + diff --git a/docs/website/docs/general-usage/destination.md b/docs/website/docs/general-usage/destination.md index be3f8d8296..c20aa62d16 100644 --- a/docs/website/docs/general-usage/destination.md +++ b/docs/website/docs/general-usage/destination.md @@ -27,6 +27,7 @@ Above we want to use **filesystem** built-in destination. You can use shorthand ```py import dlt + pipeline = dlt.pipeline("pipeline", destination="dlt.destinations.filesystem") ``` @@ -37,6 +38,7 @@ Above we use built in **filesystem** destination by providing a class type `file ```py import dlt from dlt.destinations import filesystem + pipeline = dlt.pipeline("pipeline", destination=filesystem) ``` @@ -50,6 +52,7 @@ You can instantiate **destination class** yourself to configure it explicitly. 
W ```py import dlt + azure_bucket = filesystem("az://dlt-azure-bucket", destination_name="production_az_bucket") pipeline = dlt.pipeline("pipeline", destination=azure_bucket) ``` @@ -99,7 +102,10 @@ import dlt from dlt.destinations import postgres # pass full credentials - together with the password (not recommended) -pipeline = dlt.pipeline("pipeline", destination=postgres(credentials="postgresql://loader:loader@localhost:5432/dlt_data")) +pipeline = dlt.pipeline( + "pipeline", + destination=postgres(credentials="postgresql://loader:loader@localhost:5432/dlt_data"), +) ``` @@ -126,7 +132,9 @@ from dlt.sources.credentials import AzureCredentials credentials = AzureCredentials() # fill only the account name, leave key to be taken from secrets credentials.azure_storage_account_name = "production_storage" -pipeline = dlt.pipeline("pipeline", destination=filesystem("az://dlt-azure-bucket", credentials=credentials)) +pipeline = dlt.pipeline( + "pipeline", destination=filesystem("az://dlt-azure-bucket", credentials=credentials) +) ``` diff --git a/docs/website/docs/general-usage/full-loading.md b/docs/website/docs/general-usage/full-loading.md index 92fdf064fd..4651d156f0 100644 --- a/docs/website/docs/general-usage/full-loading.md +++ b/docs/website/docs/general-usage/full-loading.md @@ -67,6 +67,4 @@ opportunities, you should use this strategy. The `staging-optimized` strategy be recreated with a [clone command](https://docs.snowflake.com/en/sql-reference/sql/create-clone) from the staging tables. This is a low cost and fast way to create a second independent table from the data of another. Learn more about [table cloning on snowflake](https://docs.snowflake.com/en/user-guide/object-clone). -For all other destinations the `staging-optimized` will fall back to the behavior of the `insert-from-staging` strategy. - - +For all other [destinations](../dlt-ecosystem/destinations/index.md), please look at their respective documentation pages to see if and how the `staging-optimized` strategy is implemented. If it is not implemented, `dlt` will fall back to the `insert-from-staging` strategy. diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index bd3bd4733a..09b8ca7a96 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -145,20 +145,19 @@ In example above we enforce the root key propagation with `fb_ads.root_key = Tru that correct data is propagated on initial `replace` load so the future `merge` load can be executed. You can achieve the same in the decorator `@dlt.source(root_key=True)`. -## Incremental loading with last value +## Incremental loading with a cursor field -In most of the APIs (and other data sources i.e. database tables) you can request only new or updated -data by passing a timestamp or id of the last record to a query. The API/database returns just the -new/updated records from which you take "last value" timestamp/id for the next load. +In most of the REST APIs (and other data sources i.e. database tables) you can request new or updated +data by passing a timestamp or id of the "last" record to a query. The API/database returns just the +new/updated records from which you take maximum/minimum timestamp/id for the next load. To do incremental loading this way, we need to -- figure which data element is used to get new/updated records (e.g. 
“last value”, “last updated - at”, etc.); -- request the new part only (how we do this depends on the source API). +- figure which field is used to track changes (the so called **cursor field**) (e.g. “inserted_at”, "updated_at”, etc.); +- how to past the "last" (maximum/minimum) value of cursor field to an API to get just new / modified data (how we do this depends on the source API). -Once you've figured that out, `dlt` takes care of the loading of the incremental, removing -duplicates and managing the state with last values. Take a look at GitHub example below, where we +Once you've figured that out, `dlt` takes care of finding maximum/minimum cursor field values, removing +duplicates and managing the state with last values of cursor. Take a look at GitHub example below, where we request recently created issues. ```python @@ -166,36 +165,38 @@ request recently created issues. def repo_issues( access_token, repository, - created_at = dlt.sources.incremental("created_at", initial_value="1970-01-01T00:00:00Z") + updated_at = dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") ): - # get issues since "created_at" stored in state on previous run (or initial_value on first run) - for page in _get_issues_page(access_token, repository, since=created_at.start_value): + # get issues since "updated_at" stored in state on previous run (or initial_value on first run) + for page in _get_issues_page(access_token, repository, since=updated_at.start_value): yield page # last_value is updated after every page - print(created_at.last_value) + print(updated_at.last_value) ``` -Here we add `created_at` argument that will receive incremental state, initialized to -`1970-01-01T00:00:00Z`. It is configured to track `created_at` field in issues returned by -`_get_issues_page` and then yielded. It will store the newest `created_at` value in `dlt` -[state](state.md) and make it available in `created_at.start_value` on next pipeline -run. This value is used to request only issues newer (or equal) via GitHub API. +Here we add `updated_at` argument that will receive incremental state, initialized to +`1970-01-01T00:00:00Z`. It is configured to track `updated_at` field in issues yielded by +`repo_issues` resource. It will store the newest `updated_at` value in `dlt` +[state](state.md) and make it available in `updated_at.start_value` on next pipeline +run. This value is inserted in `_get_issues_page` function into request query param **since** to [Github API](https://docs.github.com/en/rest/issues/issues?#list-repository-issues) In essence, `dlt.sources.incremental` instance above -* **created_at.initial_value** which is always equal to "1970-01-01T00:00:00Z" passed in constructor -* **created_at.start_value** a maximum `created_at` value from the previous run or the **initial_value** on first run -* **created_at.last_value** a "real time" `created_at` value updated with each yielded item or page. before first yield it equals **start_value** -* **created_at.end_value** (here not used) [marking end of backfill range](#using-dltsourcesincremental-for-backfill) +* **updated_at.initial_value** which is always equal to "1970-01-01T00:00:00Z" passed in constructor +* **updated_at.start_value** a maximum `updated_at` value from the previous run or the **initial_value** on first run +* **updated_at.last_value** a "real time" `updated_at` value updated with each yielded item or page. 
before first yield it equals **start_value** +* **updated_at.end_value** (here not used) [marking end of backfill range](#using-dltsourcesincremental-for-backfill) When paginating you probably need **start_value** which does not change during the execution of the resource, however most paginators will return a **next page** link which you should use. Behind the scenes, `dlt` will deduplicate the results ie. in case the last issue is returned again -(`created_at` filter is inclusive) and skip already loaded ones. In the example below we +(`updated_at` filter is inclusive) and skip already loaded ones. + + +In the example below we incrementally load the GitHub events, where API does not let us filter for the newest events - it always returns all of them. Nevertheless, `dlt` will load only the new items, filtering out all the duplicates and past issues. - ```python # use naming function in table name to generate separate tables for each event @dlt.resource(primary_key="id", table_name=lambda i: i['type']) # type: ignore @@ -289,7 +290,7 @@ def stripe(): yield data # create resources for several endpoints on a single decorator function - for endpoint in Endpoints: + for endpoint in endpoints: yield dlt.resource( get_resource, name=endpoint.value, @@ -301,13 +302,15 @@ def stripe(): Please note that in the example above, `get_resource` is passed as a function to `dlt.resource` to which we bind the endpoint: **dlt.resource(...)(endpoint)**. -> 🛑 The typical mistake is to pass a generator (not a function) as below: -> -> `yield dlt.resource(get_resource(endpoint), name=endpoint.value, write_disposition="merge", primary_key="id")`. -> -> Here we call **get_resource(endpoint)** and that creates un-evaluated generator on which resource -> is created. That prevents `dlt` from controlling the **created** argument during runtime and will -> result in `IncrementalUnboundError` exception. +:::caution +The typical mistake is to pass a generator (not a function) as below: + +`yield dlt.resource(get_resource(endpoint), name=endpoint.value, write_disposition="merge", primary_key="id")`. + +Here we call **get_resource(endpoint)** and that creates un-evaluated generator on which resource +is created. That prevents `dlt` from controlling the **created** argument during runtime and will +result in `IncrementalUnboundError` exception. +::: ### Using `dlt.sources.incremental` for backfill You can specify both initial and end dates when defining incremental loading. Let's go back to our Github example: @@ -457,10 +460,13 @@ when using `min()` "higher" and "lower" are inverted. You can use these flags when both: 1. The source does **not** offer start/end filtering of results (e.g. there is no `start_time/end_time` query parameter or similar) -2. The source returns results ordered by the cursor field +2. The source returns results **ordered by the cursor field** -**Note**: These flags should not be used for unordered sources, e.g. if an API returns results both higher and lower +:::caution +If you use those flags, **make sure that the data source returns record ordered** (ascending / descending) on the cursor field, +e.g. if an API returns results both higher and lower than the given `end_value` in no particular order, the `end_out_of_range` flag can be `True` but you'll still want to keep loading. +::: The github events example above demonstrates how to use `start_out_of_range` as a stop condition. 
This approach works in any case where the API returns items in descending order and we're incrementally loading newer data. diff --git a/docs/website/docs/general-usage/pipeline.md b/docs/website/docs/general-usage/pipeline.md index 4850027f24..095e03e96d 100644 --- a/docs/website/docs/general-usage/pipeline.md +++ b/docs/website/docs/general-usage/pipeline.md @@ -7,9 +7,9 @@ keywords: [pipeline, source, full refresh] # Pipeline A [pipeline](glossary.md#pipeline) is a connection that moves the data from your Python code to a -[destination](glossary.md#destination). Typically, you pass the `dlt` [sources](source.md) or -[resources](resource.md) to the pipeline. You can also pass generators, lists and other iterables to -it. When the pipeline runs, the resources get executed and the data is loaded at destination. +[destination](glossary.md#destination). The pipeline accepts `dlt` [sources](source.md) or +[resources](resource.md) as well as generators, async generators, lists and any iterables. +Once the pipeline runs, all resources get evaluated and the data is loaded at destination. Example: diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md index 96b10bdc86..3b08a0b8ab 100644 --- a/docs/website/docs/general-usage/resource.md +++ b/docs/website/docs/general-usage/resource.md @@ -8,7 +8,7 @@ keywords: [resource, api endpoint, dlt.resource] ## Declare a resource -A [resource](glossary.md#resource) is a function that yields data. To create a +A [resource](glossary.md#resource) is a ([optionally async](https://dlthub.com/docs/reference/performance#parallelism)) function that yields data. To create a resource, we add the `@dlt.resource` decorator to that function. Commonly used arguments: @@ -136,7 +136,6 @@ behaviour of creating child tables for these fields. We do not support `RootModel` that validate simple types. You can add such validator yourself, see [data filtering section](#filter-transform-and-pivot-data). - ### Dispatch data to many tables You can load data to many tables from a single resource. The most common case is a stream of events @@ -227,7 +226,7 @@ pipeline.run(users(limit=100) | user_details) ### Declare a standalone resource A standalone resource is defined on a function that is top level in a module (not inner function) that accepts config and secrets values. Additionally if `standalone` flag is specified, the decorated function signature and docstring will be preserved. `dlt.resource` will just wrap the -function decorated function and user must call the wrapper to get the actual resource. Below we declare a `filesystem` resource that must be called before use. +decorated function and user must call the wrapper to get the actual resource. Below we declare a `filesystem` resource that must be called before use. ```python @dlt.resource(standalone=True) def filesystem(bucket_url=dlt.config.value): @@ -339,6 +338,45 @@ tables = sql_database() tables.users.table_name = "other_users" ``` +### Adjust schema when you yield data + +You can set or update the table name, columns and other schema elements when your resource is executed and you already yield data. Such changes will be merged +with the existing schema in the same way `apply_hints` method above works. There are many reason to adjust schema at runtime. For example when using Airflow, you +should avoid lengthy operations (ie. reflecting database tables) during creation of the DAG so it is better do do it when DAG executes. You may also emit partial +hints (ie. 
precision and scale for decimal types) for column to help `dlt` type inference. + +```python +@dlt.resource +def sql_table(credentials, schema, table): + # create sql alchemy engine + engine = engine_from_credentials(credentials) + engine.execution_options(stream_results=True) + metadata = MetaData(schema=schema) + # reflect the table schema + table_obj = Table(table, metadata, autoload_with=engine) + + for idx, batch in enumerate(table_rows(engine, table_obj)): + if idx == 0: + # emit first row with hints, table_to_columns and get_primary_key are helpers that extract dlt schema from + # SqlAlchemy model + yield dlt.mark.with_hints( + batch, + dlt.mark.make_hints(columns=table_to_columns(table_obj), primary_key=get_primary_key(table_obj)), + ) + else: + # just yield all the other rows + yield batch + +``` + +In the example above we use `dlt.mark.with_hints` and `dlt.mark.make_hints` to emit columns and primary key with the first extracted item. Table schema will +be adjusted after the `batch` is processed in the extract pipeline but before any schema contracts are applied and data is persisted in load package. + +:::tip +You can emit columns as Pydantic model and use dynamic hints (ie. lambda for table name) as well. You should avoid redefining `Incremental` this way. +::: + + ### Duplicate and rename resources There are cases when you your resources are generic (ie. bucket filesystem) and you want to load several instances of it (ie. files from different folders) to separate tables. In example below we use `filesystem` source to load csvs from two different folders into separate tables: ```python diff --git a/docs/website/docs/general-usage/schema.md b/docs/website/docs/general-usage/schema.md index 13347b952b..7ce1d959c9 100644 --- a/docs/website/docs/general-usage/schema.md +++ b/docs/website/docs/general-usage/schema.md @@ -127,7 +127,46 @@ Postgres ignore it when creating tables. ### Variant columns Variant columns are generated by a normalizer when it encounters data item with type that cannot be -coerced in existing column. +coerced in existing column. Please see our [`coerce_row`](https://github.com/dlt-hub/dlt/blob/7d9baf1b8fdf2813bcf7f1afe5bb3558993305ca/dlt/common/schema/schema.py#L205) if you are interested to see how internally it works. + +Let's consider our [getting started](../getting-started#quick-start) example with slightly different approach, +where `id` is an integer type at the beginning + +```py +data = [ + {"id": 1, "human_name": "Alice"} +] +``` + +once pipeline runs we will have the following schema: + +| name | data_type | nullable | +| ------------- | ------------- | -------- | +| id | bigint | true | +| human_name | text | true | + +Now imagine the data has changed and `id` field also contains strings + +```py +data = [ + {"id": 1, "human_name": "Alice"} + {"id": "idx-nr-456", "human_name": "Bob"} +] +``` + +So after you run the pipeline `dlt` will automatically infer type changes and will add a new field in the schema `id__v_text` +to reflect that new data type for `id` so for any type which is not compatible with integer it will create a new field. + +| name | data_type | nullable | +| ------------- | ------------- | -------- | +| id | bigint | true | +| human_name | text | true | +| id__v_text | text | true | + +On the other hand if `id` field was already a string then introducing new data with `id` containing other types +will not change schema because they can be coerced to string. 
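+
+If you want to see this end to end, below is a minimal sketch (it assumes the `duckdb` extra is
+installed; the pipeline and dataset names are just examples):
+
+```py
+import dlt
+
+data = [
+    {"id": 1, "human_name": "Alice"},
+    {"id": "idx-nr-456", "human_name": "Bob"},
+]
+
+pipeline = dlt.pipeline(pipeline_name="variant_demo", destination="duckdb", dataset_name="mydata")
+pipeline.run(data, table_name="users")
+
+# the inferred "users" table now has id, human_name and the id__v_text variant column
+print(pipeline.default_schema.to_pretty_yaml())
+```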
+ +Now go ahead and try to add a new record where `id` is float number, you should see a new field `id__v_double` in the schema. ### Data types @@ -139,7 +178,7 @@ coerced in existing column. | timestamp | `'2023-07-26T14:45:00Z'`, `datetime.datetime.now()` | Supports precision expressed as parts of a second | | date | `datetime.date(2023, 7, 26)` | | | time | `'14:01:02'`, `datetime.time(14, 1, 2)` | Supports precision - see **timestamp** | -| bigint | `9876543210` | Support precision as number of bytes | +| bigint | `9876543210` | Supports precision as number of bits | | binary | `b'\x00\x01\x02\x03'` | Supports precision, like **text** | | complex | `[4, 5, 6]`, `{'a': 1}` | | | decimal | `Decimal('4.56')` | Supports precision and scale | @@ -160,7 +199,7 @@ do not support precision for a given data type will ignore it. The precision for **timestamp** is useful when creating **parquet** files. Use 3 - for milliseconds, 6 for microseconds, 9 for nanoseconds -The precision for **bigint** is mapped to available integer types ie. TINYINT, INT, BIGINT. The default is 8 bytes precision (BIGINT) +The precision for **bigint** is mapped to available integer types ie. TINYINT, INT, BIGINT. The default is 64 bits (8 bytes) precision (BIGINT) ::: ## Schema settings @@ -232,7 +271,7 @@ settings: ## Export and import schema files -Please follow this [walkthrough](../walkthroughs/adjust-a-schema.md) to export and import `yaml` +Please follow the guide on [how to adjust a schema](../walkthroughs/adjust-a-schema.md) to export and import `yaml` schema files in your pipeline. ## Attaching schemas to sources diff --git a/docs/website/docs/general-usage/source.md b/docs/website/docs/general-usage/source.md index 1f520336df..17c87f6b5f 100644 --- a/docs/website/docs/general-usage/source.md +++ b/docs/website/docs/general-usage/source.md @@ -19,7 +19,7 @@ single API. The most common approach is to define it in a separate Python module ## Declare sources You declare source by decorating a function returning one or more resource with `dlt.source`. Our -[Create a pipeline](../walkthroughs/create-a-pipeline.md) walkthrough teaches you how to do that. +[Create a pipeline](../walkthroughs/create-a-pipeline.md) how to guide teaches you how to do that. 
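+For orientation, a minimal sketch of such a declaration (the source, resource and data below are
+only illustrative; the `duckdb` extra is assumed for the run):
+
+```python
+import dlt
+
+@dlt.source
+def my_source():
+    @dlt.resource(write_disposition="append")
+    def users():
+        # a real source would call an API here; static data keeps the sketch self-contained
+        yield [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
+
+    return users
+
+pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination="duckdb", dataset_name="my_data")
+pipeline.run(my_source())
+```
+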
### Create resources dynamically diff --git a/docs/website/docs/getting-started-snippets.py b/docs/website/docs/getting-started-snippets.py index 618ba1a406..eb00df9986 100644 --- a/docs/website/docs/getting-started-snippets.py +++ b/docs/website/docs/getting-started-snippets.py @@ -263,66 +263,3 @@ def repo_events(last_created_at=dlt.sources.incremental("created_at")): # @@@DLT_SNIPPET_END table_dispatch assert_load_info(load_info) - - -def pdf_to_weaviate_snippet() -> None: - # @@@DLT_SNIPPET_START pdf_to_weaviate - import os - - import dlt - from dlt.destinations.impl.weaviate import weaviate_adapter - from PyPDF2 import PdfReader - - @dlt.resource(selected=False) - def list_files(folder_path: str): - folder_path = os.path.abspath(folder_path) - for filename in os.listdir(folder_path): - file_path = os.path.join(folder_path, filename) - yield { - "file_name": filename, - "file_path": file_path, - "mtime": os.path.getmtime(file_path), - } - - @dlt.transformer(primary_key="page_id", write_disposition="merge") - def pdf_to_text(file_item, separate_pages: bool = False): - if not separate_pages: - raise NotImplementedError() - # extract data from PDF page by page - reader = PdfReader(file_item["file_path"]) - for page_no in range(len(reader.pages)): - # add page content to file item - page_item = dict(file_item) - page_item["text"] = reader.pages[page_no].extract_text() - page_item["page_id"] = file_item["file_name"] + "_" + str(page_no) - yield page_item - - pipeline = dlt.pipeline(pipeline_name="pdf_to_text", destination="weaviate") - - # this constructs a simple pipeline that: (1) reads files from "invoices" folder (2) filters only those ending with ".pdf" - # (3) sends them to pdf_to_text transformer with pipe (|) operator - pdf_pipeline = list_files("assets/invoices").add_filter( - lambda item: item["file_name"].endswith(".pdf") - ) | pdf_to_text(separate_pages=True) - - # set the name of the destination table to receive pages - # NOTE: Weaviate, dlt's tables are mapped to classes - pdf_pipeline.table_name = "InvoiceText" - - # use weaviate_adapter to tell destination to vectorize "text" column - load_info = pipeline.run(weaviate_adapter(pdf_pipeline, vectorize="text")) - row_counts = pipeline.last_trace.last_normalize_info - print(row_counts) - print("------") - print(load_info) - # @@@DLT_SNIPPET_END pdf_to_weaviate - - assert_load_info(load_info) - - # @@@DLT_SNIPPET_START pdf_to_weaviate_read - import weaviate - - client = weaviate.Client("http://localhost:8080") - # get text of all the invoices in InvoiceText class we just created above - print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do()) - # @@@DLT_SNIPPET_END pdf_to_weaviate_read diff --git a/docs/website/docs/getting-started.md b/docs/website/docs/getting-started.md index d0f9f29e48..cd121b0ad5 100644 --- a/docs/website/docs/getting-started.md +++ b/docs/website/docs/getting-started.md @@ -1,18 +1,18 @@ --- -title: Getting Started +title: Getting started description: quick start with dlt keywords: [getting started, quick start, basic examples] --- import snippets from '!!raw-loader!./getting-started-snippets.py'; -# Getting Started +# Getting started ## Overview `dlt` is an open-source library that you can add to your Python scripts to load data from various and often messy data sources into well-structured, live datasets. 
-Below we give you a preview how you can get data from APIs, files, Python objects or -pandas dataframes and move it into a local or remote database, data lake or a vector data store. +This guide will show you how to start using `dlt` with a simple example: loading data +from a list of Python dictionaries into DuckDB. Let's get started! @@ -24,21 +24,21 @@ Install dlt using `pip`: pip install -U dlt ``` -Command above installs (or upgrades) library core, in example below we use `duckdb` as a destination so let's add it: +The command above installs (or upgrades) the library core, in the example below we use DuckDB as a destination so let's add a `duckdb` dependency: ```bash pip install "dlt[duckdb]" ``` :::tip -Use clean virtual environment for your experiments! Here are [detailed instructions](reference/installation). +Use a clean virtual environment for your experiments! Here are [detailed instructions](reference/installation). Make sure that your `dlt` version is **0.3.15** or above. Check it in the terminal with `dlt --version`. ::: ## Quick start -Let's load a list of Python objects (dictionaries) into `duckdb` and inspect the created dataset: +For starters, let's load a list of Python dictionaries into DuckDB and inspect the created dataset. Here is the code: ```py @@ -55,7 +55,13 @@ print(load_info) ``` -Save this python script with the name `quick_start_pipeline.py` and run the following command: +When you look at the code above, you can see that we: +1. Import the `dlt` library. +2. Define our data to load. +3. Create a pipeline that loads data into DuckDB. Here we also specify the `pipeline_name` and `dataset_name`. We'll use both in a moment. +4. Run the pipeline. + +Save this Python script with the name `quick_start_pipeline.py` and run the following command: ```bash python quick_start_pipeline.py @@ -71,12 +77,17 @@ Load package 1692364844.460054 is LOADED and contains no failed jobs ``` `dlt` just created a database schema called **mydata** (the `dataset_name`) with a table **users** in it. -[Take a look at it using built-in Streamlit app](reference/command-line-interface#show-tables-and-data-in-the-destination): + +### Explore the data + +To allow sneak peek and basic discovery you can take advantage of [built-in integration with Strealmit](reference/command-line-interface#show-tables-and-data-in-the-destination): ```bash dlt pipeline quick_start show ``` + **quick_start** is the name of the pipeline from the script above. If you do not have Streamlit installed yet do: + ```bash pip install streamlit ``` @@ -92,533 +103,17 @@ Streamlit Explore data. Schema and data for a test pipeline “quick_start”. Looking for source code of all the snippets? You can find and run them [from this repository](https://github.com/dlt-hub/dlt/blob/devel/docs/website/docs/getting-started-snippets.py). ::: -Learn more: -- What is a data pipeline? - [General usage: Pipeline.](general-usage/pipeline) -- [Walkthrough: Create a pipeline](walkthroughs/create-a-pipeline). -- [Walkthrough: Run a pipeline.](walkthroughs/run-a-pipeline) -- How to configure DuckDB? - [Destinations: DuckDB.](dlt-ecosystem/destinations/duckdb) -- [The full list of available destinations.](dlt-ecosystem/destinations/) -- [Exploring the data](dlt-ecosystem/visualizations/exploring-the-data). -- What happens after loading? - [Destination tables](general-usage/destination-tables). 
- -## Load your data - -### Load data from a variety of sources - -Use dlt to load practically any data you deal with in your Python scripts into a dataset. -The library will create/update tables, infer data types and deal with nested data automatically: - - - - - -```py -import dlt - -from dlt.common import json - -with open("./assets/json_file.json", "rb") as file: - data = json.load(file) - -pipeline = dlt.pipeline( - pipeline_name="from_json", - destination="duckdb", - dataset_name="mydata", -) - -# NOTE: test data that we load is just a dictionary so we enclose it in a list -# if your JSON contains a list of objects you do not need to do that -load_info = pipeline.run([data], table_name="json_data") - -print(load_info) -``` - - -We import **json** from `dlt` namespace. It defaults to `orjson`(otherwise `simplejson`). It can also encode date times, dates, dataclasses and few more data types. - - - - - Pass anything that you can load with Pandas to `dlt` - - -```py -import dlt -import pandas as pd - -owid_disasters_csv = "https://raw.githubusercontent.com/owid/owid-datasets/master/datasets/Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020)/Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020).csv" -df = pd.read_csv(owid_disasters_csv) -data = df.to_dict(orient="records") - -pipeline = dlt.pipeline( - pipeline_name="from_csv", - destination="duckdb", - dataset_name="mydata", -) -load_info = pipeline.run(data, table_name="natural_disasters") - -print(load_info) -``` - - - - - - -```py -import dlt -from dlt.sources.helpers import requests - -# url to request dlt-hub/dlt issues -url = "https://api.github.com/repos/dlt-hub/dlt/issues" -# make the request and check if succeeded -response = requests.get(url) -response.raise_for_status() - -pipeline = dlt.pipeline( - pipeline_name="from_api", - destination="duckdb", - dataset_name="github_data", -) -# the response contains a list of issues -load_info = pipeline.run(response.json(), table_name="issues") - -print(load_info) -``` - - - - - -:::tip -Use our verified [sql database source](dlt-ecosystem/verified-sources/sql_database) -to sync your databases with warehouses, data lakes, or vector stores. 
-::: - - -```py -import dlt -from sqlalchemy import create_engine - -# use any sql database supported by SQLAlchemy, below we use a public mysql instance to get data -# NOTE: you'll need to install pymysql with "pip install pymysql" -# NOTE: loading data from public mysql instance may take several seconds -engine = create_engine("mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam") -with engine.connect() as conn: - # select genome table, stream data in batches of 100 elements - rows = conn.execution_options(yield_per=100).exec_driver_sql( - "SELECT * FROM genome LIMIT 1000" - ) - - pipeline = dlt.pipeline( - pipeline_name="from_database", - destination="duckdb", - dataset_name="genome_data", - ) - - # here we convert the rows into dictionaries on the fly with a map function - load_info = pipeline.run(map(lambda row: dict(row._mapping), rows), table_name="genome") - -print(load_info) -``` - - -Install **pymysql** driver: -```sh -pip install pymysql -``` - -Have some fun and run this snippet [with progress bar enabled](walkthroughs/run-a-pipeline.md#2-see-the-progress-during-loading) (we like **enlighten** the best, **tqdm** works as well): -```bash -pip install enlighten -PROGRESS=enlighten python load_from_db.py -``` - - - - -### Append or replace your data - -Run any of the previous examples twice to notice that each time a copy of the data is added to your tables. -We call this load mode `append`. It is very useful when i.e. you have a new folder created daily with `json` file logs, and you want to ingest them. - -Perhaps this is not what you want to do in the examples above. -For example, if the CSV file is updated, how we can refresh it in the database? -One method is to tell `dlt` to replace the data in existing tables by using `write_disposition`: - - -```py -import dlt - -data = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}] - -pipeline = dlt.pipeline( - pipeline_name="replace_data", - destination="duckdb", - dataset_name="mydata", -) -load_info = pipeline.run(data, table_name="users", write_disposition="replace") - -print(load_info) -``` - - -Run this script twice to see that **users** table still contains only one copy of your data. - -:::tip -What if you added a new column to your CSV? -`dlt` will migrate your tables! -See the `replace` mode and table schema migration in action in our [Colab Demo](https://colab.research.google.com/drive/1H6HKFi-U1V4p0afVucw_Jzv1oiFbH2bu#scrollTo=e4y4sQ78P_OM). -::: - -Learn more: - -- [Full load - how to replace your data](general-usage/full-loading). -- [Append, replace and merge your tables](general-usage/incremental-loading). - - -## Declare loading behavior - -You can define the loading process by decorating Python functions with `@dlt.resource`. - -### Load only new data (incremental loading) - -We can improve the GitHub API example above and get only issues that were created since last load. -Instead of using `replace` write_disposition and downloading all issues each time the pipeline is run, we do the following: - - - -```py -import dlt -from dlt.sources.helpers import requests - -@dlt.resource(table_name="issues", write_disposition="append") -def get_issues( - created_at=dlt.sources.incremental("created_at", initial_value="1970-01-01T00:00:00Z") -): - # NOTE: we read only open issues to minimize number of calls to the API. 
-    # There's a limit of ~50 calls for unauthenticated GitHub users.
-    url = "https://api.github.com/repos/dlt-hub/dlt/issues?per_page=100&sort=created&direction=desc&state=open"
-
-    while True:
-        response = requests.get(url)
-        response.raise_for_status()
-        yield response.json()
-
-        # stop requesting pages if the last element was already older than initial value
-        # note: incremental will skip those items anyway, we just do not want to use the api limits
-        if created_at.start_out_of_range:
-            break
-
-        # get next page
-        if "next" not in response.links:
-            break
-        url = response.links["next"]["url"]
-
-pipeline = dlt.pipeline(
-    pipeline_name="github_issues_incremental",
-    destination="duckdb",
-    dataset_name="github_data_append",
-)
-load_info = pipeline.run(get_issues)
-row_counts = pipeline.last_trace.last_normalize_info
-
-print(row_counts)
-print("------")
-print(load_info)
-```
-
-
-
-We request issues for the dlt-hub/dlt repository, ordered by the **created_at** field (descending), and yield them page by page in the `get_issues` generator function.
-
-We use the `@dlt.resource` decorator to declare the table name to which the data will be loaded and the write disposition, which is `append`.
-
-We also use `dlt.sources.incremental` to track the `created_at` field present in each issue and filter in only the newly created ones.
-
-Now run the script. It loads all the issues from our repo to `duckdb`. Run it again, and you can see that no issues got added (if no issues were created in the meantime).
-
-Now you can run this script on a daily schedule and each day you’ll load only the issues created after the time of the previous pipeline run.
-
-:::tip
-Between pipeline runs, `dlt` keeps the state in the same database it loaded data to.
-Peek into that state, the loaded tables and other information with:
-
-```shell
-dlt pipeline -v github_issues_incremental info
-```
-:::
-
-Learn more:
-
-- Declare your [resources](general-usage/resource) and group them in [sources](general-usage/source) using Python decorators.
-- [Set up "last value" incremental loading.](general-usage/incremental-loading#incremental_loading-with-last-value)
-- [Inspect pipeline after loading.](walkthroughs/run-a-pipeline#4-inspect-a-load-process)
-- [`dlt` command line interface.](reference/command-line-interface)
-
-### Update and deduplicate your data
+## What's next?
-The script above finds **new** issues and adds them to the database.
-It will ignore any updates to **existing** issue text, emoji reactions etc.
-To always get fresh content for all the issues, combine incremental load with the `merge` write disposition,
-like in the script below.
+Now that you have a basic understanding of how to get started with dlt, you might be eager to dive deeper. A great next step is to walk through our detailed tutorial: a step-by-step guide to building a pipeline that loads data from the GitHub API into DuckDB, which also teaches you how to use some of the most important features of dlt.
-
-```py
-import dlt
-from dlt.sources.helpers import requests
-
-@dlt.resource(
-    table_name="issues",
-    write_disposition="merge",
-    primary_key="id",
-)
-def get_issues(
-    updated_at=dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z")
-):
-    # NOTE: we read only open issues to minimize the number of calls to the API.
-    # There's a limit of ~50 calls for unauthenticated GitHub users.
-    url = f"https://api.github.com/repos/dlt-hub/dlt/issues?since={updated_at.last_value}&per_page=100&sort=updated&direction=desc&state=open"
-
-    while True:
-        response = requests.get(url)
-        response.raise_for_status()
-        yield response.json()
-
-        # get next page
-        if "next" not in response.links:
-            break
-        url = response.links["next"]["url"]
+[Follow the tutorial →](tutorial/intro)
-
-pipeline = dlt.pipeline(
-    pipeline_name="github_issues_merge",
-    destination="duckdb",
-    dataset_name="github_data_merge",
-)
-load_info = pipeline.run(get_issues)
-row_counts = pipeline.last_trace.last_normalize_info
-
-print(row_counts)
-print("------")
-print(load_info)
-```
-
-
-Above we add a `primary_key` hint that tells `dlt` how to identify the issues in the database and find duplicates whose content it will merge.
-
-Note that we now track the `updated_at` field - so we filter in all issues **updated** since the last pipeline run (which also includes those newly created).
-
-Pay attention to how we use the **since** parameter from the [GitHub API](https://docs.github.com/en/rest/issues/issues?apiVersion=2022-11-28#list-repository-issues)
-and `updated_at.last_value` to tell GitHub to return only issues updated **after** the date we pass. `updated_at.last_value` holds the last `updated_at` value from the previous run.
-
-Learn more:
-
-- [You can do way more with merge.](general-usage/incremental-loading#merge-incremental_loading)
-
-### Dispatch stream of events to tables by event type
-
-This is a fun but practical example that reads GitHub events from the **dlt** repository (such as issue or pull request created, comment added, etc.).
-Each event type is sent to a different table in `duckdb`.
-
-
-```py
-import dlt
-from dlt.sources.helpers import requests
-
-@dlt.resource(primary_key="id", table_name=lambda i: i["type"], write_disposition="append")
-def repo_events(last_created_at=dlt.sources.incremental("created_at")):
-    url = "https://api.github.com/repos/dlt-hub/dlt/events?per_page=100"
-
-    while True:
-        response = requests.get(url)
-        response.raise_for_status()
-        yield response.json()
-
-        # stop requesting pages if the last element was already older than initial value
-        # note: incremental will skip those items anyway, we just do not want to use the api limits
-        if last_created_at.start_out_of_range:
-            break
-
-        # get next page
-        if "next" not in response.links:
-            break
-        url = response.links["next"]["url"]
-
-pipeline = dlt.pipeline(
-    pipeline_name="github_events",
-    destination="duckdb",
-    dataset_name="github_events_data",
-)
-load_info = pipeline.run(repo_events)
-row_counts = pipeline.last_trace.last_normalize_info
-
-print(row_counts)
-print("------")
-print(load_info)
-```
-
-
-Event content never changes, so we can use the `append` write disposition and track new events using the `created_at` field.
-
-We name the tables using a function that receives the event data and returns the table name: `table_name=lambda i: i["type"]`
-
-Now run the script:
-
-```shell
-python github_events_dispatch.py
-```
-
-Peek at the created tables:
-
-```shell
-dlt pipeline -v github_events info
-dlt pipeline github_events trace
-```
-
-And preview the data:
-
-```shell
-dlt pipeline -v github_events show
-```
-
-:::tip
-Some of the events produce tables with a very large number of child tables. You can [control the level of table nesting](general-usage/source.md#reduce-the-nesting-level-of-generated-tables) with a decorator.
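As a sketch of what that could look like — the `max_table_nesting` argument below is the knob the linked page describes, while the wrapper source and the value `2` are illustrative assumptions:

```py
import dlt

@dlt.source(max_table_nesting=2)
def github_events_source():
    # reuse the `repo_events` resource defined above; data nested deeper than
    # two levels is kept as JSON instead of being exploded into child tables
    return repo_events

pipeline = dlt.pipeline(
    pipeline_name="github_events_flat",
    destination="duckdb",
    dataset_name="github_events_flat",
)
load_info = pipeline.run(github_events_source())
print(load_info)
```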
-
-
-Another fun [Colab Demo](https://colab.research.google.com/drive/1BXvma_9R9MX8p_iSvHE4ebg90sUroty2#scrollTo=a3OcZolbaWGf) - we analyze reactions on the duckdb repo!
-
-:::
-
-Learn more:
-* [Change the nesting of the tables](general-usage/source.md#reduce-the-nesting-level-of-generated-tables) with a decorator.
-
-### Transform the data before the load
-
-Below we extract text from PDFs and load it into the [Weaviate](dlt-ecosystem/destinations/weaviate) vector store.
-
-```py
-import os
-
-import dlt
-from dlt.destinations.impl.weaviate import weaviate_adapter
-from PyPDF2 import PdfReader
-
-@dlt.resource(selected=False)
-def list_files(folder_path: str):
-    folder_path = os.path.abspath(folder_path)
-    for filename in os.listdir(folder_path):
-        file_path = os.path.join(folder_path, filename)
-        yield {
-            "file_name": filename,
-            "file_path": file_path,
-            "mtime": os.path.getmtime(file_path),
-        }
-
-@dlt.transformer(primary_key="page_id", write_disposition="merge")
-def pdf_to_text(file_item, separate_pages: bool = False):
-    if not separate_pages:
-        raise NotImplementedError()
-    # extract data from PDF page by page
-    reader = PdfReader(file_item["file_path"])
-    for page_no in range(len(reader.pages)):
-        # add page content to file item
-        page_item = dict(file_item)
-        page_item["text"] = reader.pages[page_no].extract_text()
-        page_item["page_id"] = file_item["file_name"] + "_" + str(page_no)
-        yield page_item
-
-pipeline = dlt.pipeline(pipeline_name="pdf_to_text", destination="weaviate")
-
-# this constructs a simple pipeline that: (1) reads files from "invoices" folder (2) filters only those ending with ".pdf"
-# (3) sends them to pdf_to_text transformer with pipe (|) operator
-pdf_pipeline = list_files("assets/invoices").add_filter(
-    lambda item: item["file_name"].endswith(".pdf")
-) | pdf_to_text(separate_pages=True)
-
-# set the name of the destination table to receive pages
-# NOTE: in Weaviate, dlt's tables are mapped to classes
-pdf_pipeline.table_name = "InvoiceText"
-
-# use weaviate_adapter to tell destination to vectorize "text" column
-load_info = pipeline.run(weaviate_adapter(pdf_pipeline, vectorize="text"))
-row_counts = pipeline.last_trace.last_normalize_info
-print(row_counts)
-print("------")
-print(load_info)
-```
-
-
-We start with a simple resource that lists files in a specified folder. To that we add a **filter** function that removes all files that are not PDFs.
-
-To parse PDFs we use [PyPDF](https://pypdf2.readthedocs.io/en/3.0.0/user/extract-text.html) and return each page from a given PDF as a separate data item.
-
-Parsing happens in the `@dlt.transformer`, which receives data from the `list_files` resource. It splits each PDF into pages, extracts the text and yields the pages separately,
-so each PDF will correspond to many items in the Weaviate `InvoiceText` class. We set the primary key and use the merge disposition so if the same PDF comes twice
-we'll just update the vectors, and not duplicate.
-
-Note how we pipe data from the `list_files` resource (the resource is deselected so we do not load raw file items into the destination) into `pdf_to_text` using the **|** operator.
-
-Just before the load, the `weaviate_adapter` is used to tell the `weaviate` destination which fields to vectorize.
-
-To run this example, you need additional dependencies:
-
-```shell
-pip install PyPDF2 "dlt[weaviate]"
-python pdf_to_weaviate.py
-```
-
-Now it is time to query our documents.
-
-```py
-import weaviate
-
-client = weaviate.Client("http://localhost:8080")
-# get text of all the invoices in InvoiceText class we just created above
-print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do())
-```
-
-
-Above we provide the URL to a local cluster. We also use `contextionary` to vectorize data. You may find information on our setup in the links below.
-
-:::tip
-
-Change the destination to `duckdb` if you do not have access to a Weaviate cluster or are not able to run it locally.
-
-:::
-
-Learn more:
-
-- [Setup Weaviate destination - local or cluster](dlt-ecosystem/destinations/weaviate.md).
-- [Connect the transformers to the resources](general-usage/resource#feeding-data-from-one-resource-into-another)
-to load additional data or enrich it.
-- [Transform your data before loading](general-usage/resource#customize-resources) and see some
-  [examples of customizations like column renames and anonymization](general-usage/customising-pipelines/renaming_columns).
-
-## Next steps
-
-If you want to take full advantage of the `dlt` library, then we strongly suggest that you build your sources out of existing **building blocks:**
-
-- Pick your [destinations](dlt-ecosystem/destinations/).
-- Check [verified sources](dlt-ecosystem/verified-sources/) provided by us and the community.
-- Access your data with [SQL](dlt-ecosystem/transformations/sql) or [Pandas](dlt-ecosystem/transformations/pandas).
-- Declare your [resources](general-usage/resource) and group them in [sources](general-usage/source) using Python decorators.
-- [Connect the transformers to the resources](general-usage/resource#feeding-data-from-one-resource-into-another) to load additional data or enrich it.
-- [Create your resources dynamically from data](general-usage/source#create-resources-dynamically).
-- [Append, replace and merge your tables](general-usage/incremental-loading).
-- [Transform your data before loading](general-usage/resource#customize-resources) and see some [examples of customizations like column renames and anonymization](general-usage/customising-pipelines/renaming_columns).
-- [Set up "last value" incremental loading](general-usage/incremental-loading#incremental_loading-with-last-value).
-- [Set primary and merge keys, define the columns nullability and data types](general-usage/resource#define-schema).
-- [Pass config and credentials into your sources and resources](general-usage/credentials).
-- [Use built-in requests client](reference/performance#using-the-built-in-requests-client).
-- [Run in production: inspecting, tracing, retry policies and cleaning up](running-in-production/running).
-- [Run resources in parallel, optimize buffers and local storage](reference/performance.md)
+More resources:
+- [What is a data pipeline in dlt?](general-usage/pipeline)
+- [How to create a pipeline](walkthroughs/create-a-pipeline)
+- [How to run a pipeline](walkthroughs/run-a-pipeline)
+- [How to configure DuckDB](dlt-ecosystem/destinations/duckdb)
+- [The full list of available destinations](dlt-ecosystem/destinations/)
+- [Exploring the data](dlt-ecosystem/visualizations/exploring-the-data)
+- [Destination tables: what happens after loading?](general-usage/destination-tables) \ No newline at end of file diff --git a/docs/website/docs/intro-snippets.py b/docs/website/docs/intro-snippets.py index bef758d0aa..f270dcee6e 100644 --- a/docs/website/docs/intro-snippets.py +++ b/docs/website/docs/intro-snippets.py @@ -2,7 +2,7 @@ def intro_snippet() -> None: - # @@@DLT_SNIPPET_START index + # @@@DLT_SNIPPET_START api import dlt from dlt.sources.helpers import requests @@ -19,6 +19,63 @@ def intro_snippet() -> None: data.append(response.json()) # Extract, normalize, and load the data load_info = pipeline.run(data, table_name="player") - # @@@DLT_SNIPPET_END index + # @@@DLT_SNIPPET_END api + + assert_load_info(load_info) + + +def csv_snippet() -> None: + # @@@DLT_SNIPPET_START csv + import dlt + import pandas as pd + + owid_disasters_csv = ( + "https://raw.githubusercontent.com/owid/owid-datasets/master/datasets/" + "Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020)/" + "Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020).csv" + ) + df = pd.read_csv(owid_disasters_csv) + data = df.to_dict(orient="records") + + pipeline = dlt.pipeline( + pipeline_name="from_csv", + destination="duckdb", + dataset_name="mydata", + ) + load_info = pipeline.run(data, table_name="natural_disasters") + + print(load_info) + # @@@DLT_SNIPPET_END csv + + assert_load_info(load_info) + + +def db_snippet() -> None: + # @@@DLT_SNIPPET_START db + import dlt + from sqlalchemy import create_engine + + # Use any SQL database supported by SQLAlchemy, below we use a public + # MySQL instance to get data. + # NOTE: you'll need to install pymysql with `pip install pymysql` + # NOTE: loading data from public mysql instance may take several seconds + engine = create_engine("mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam") + + with engine.connect() as conn: + # Select genome table, stream data in batches of 100 elements + query = "SELECT * FROM genome LIMIT 1000" + rows = conn.execution_options(yield_per=100).exec_driver_sql(query) + + pipeline = dlt.pipeline( + pipeline_name="from_database", + destination="duckdb", + dataset_name="genome_data", + ) + + # Convert the rows into dictionaries on the fly with a map function + load_info = pipeline.run(map(lambda row: dict(row._mapping), rows), table_name="genome") + + print(load_info) + # @@@DLT_SNIPPET_END db assert_load_info(load_info) diff --git a/docs/website/docs/intro.md b/docs/website/docs/intro.md index 8dca0f2edb..37f03544d0 100644 --- a/docs/website/docs/intro.md +++ b/docs/website/docs/intro.md @@ -8,20 +8,31 @@ import snippets from '!!raw-loader!./intro-snippets.py'; # Introduction - ![dlt pacman](/img/dlt-pacman.gif) ## What is `dlt`? + `dlt` is an open-source library that you can add to your Python scripts to load data -from various and often messy data sources into well-structured, live datasets. Install it with: +from various and often messy data sources into well-structured, live datasets. To get started, install it with: ```sh pip install dlt ``` -There's no need to start any backends or containers. Simply import `dlt` in a Python file or Jupyter Notebook cell, and write a simple pipeline like this: - - - - +Unlike other solutions, with dlt, there's no need to use any backends or containers. Simply import `dlt` in a Python file or a Jupyter Notebook cell, and create a pipeline to load data into any of the [supported destinations](dlt-ecosystem/destinations/). 
You can load data from any source that produces Python data structures, including APIs, files, databases, and more.
+
+The library will create or update tables, infer data types and handle nested data automatically. Here are a few example pipelines:
+
+
+
+
```py
import dlt
from dlt.sources.helpers import requests
@@ -40,24 +51,130 @@ for player in ["magnuscarlsen", "rpragchess"]:
    # Extract, normalize, and load the data
    load_info = pipeline.run(data, table_name="player")
```
-
+
-If you don't have `duckdb`, you can install it as an extra:
+Copy this example to a file or a Jupyter Notebook and run it. To make it work with the DuckDB destination, you'll need to install the **duckdb** dependency (the default `dlt` installation is really minimal):

```sh
pip install "dlt[duckdb]"
```

Now **run** your Python file or Notebook cell.

-What that code does:
+How does it work? The library extracts data from a [source](general-usage/glossary.md#source) (here: **chess.com REST API**), inspects its structure to create a
+[schema](general-usage/glossary.md#schema), structures, normalizes and verifies the data, and then
+loads it into a [destination](general-usage/glossary.md#destination) (here: **duckdb**, into a database schema **player_data** and table name **player**).
+
+
+
+
+
+
+Initialize the [Slack source](dlt-ecosystem/verified-sources/slack) with the `dlt init` command:
+
+```sh
+dlt init slack duckdb
+```
+
+Create and run a pipeline:
+
+```py
+import dlt
+from datetime import datetime
+
+from slack import slack_source
+
+pipeline = dlt.pipeline(
+    pipeline_name="slack",
+    destination="duckdb",
+    dataset_name="slack_data"
+)
+
+source = slack_source(
+    start_date=datetime(2023, 9, 1),
+    end_date=datetime(2023, 9, 8),
+    page_size=100,
+)
+
+load_info = pipeline.run(source)
+print(load_info)
+```
+
+
+
+
 Pass anything that you can load with Pandas to `dlt`
-1. Extracts data from a
-[source](general-usage/glossary.md#source) (here: **chess.com REST API**).
-2. `dlt` inspects the data's structure to create a
-[schema](general-usage/glossary.md#schema).
-3. `dlt` structures, normalizes and verifies the data.
-4. `dlt` loads data into a [destination](general-usage/glossary.md#destination) (here: a **duckdb** database schema **player_data** and table name **player**).
+
+```py
+import dlt
+import pandas as pd
+
+owid_disasters_csv = (
+    "https://raw.githubusercontent.com/owid/owid-datasets/master/datasets/"
+    "Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020)/"
+    "Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020).csv"
+)
+df = pd.read_csv(owid_disasters_csv)
+data = df.to_dict(orient="records")
+
+pipeline = dlt.pipeline(
+    pipeline_name="from_csv",
+    destination="duckdb",
+    dataset_name="mydata",
+)
+load_info = pipeline.run(data, table_name="natural_disasters")
+
+print(load_info)
+```
+
+
+
+
+
+:::tip
+Use our verified [SQL database source](dlt-ecosystem/verified-sources/sql_database)
+to sync your databases with warehouses, data lakes, or vector stores.
+:::
+
+
+```py
+import dlt
+from sqlalchemy import create_engine
+
+# Use any SQL database supported by SQLAlchemy, below we use a public
+# MySQL instance to get data.
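# `yield_per=100` below makes SQLAlchemy stream the result in batches of 100 rows
# instead of fetching everything into memory; the `map(lambda row: dict(row._mapping), rows)`
# passed to `pipeline.run()` turns each SQLAlchemy Row into a plain dict that dlt can normalize.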
+# NOTE: you'll need to install pymysql with `pip install pymysql`
+# NOTE: loading data from public mysql instance may take several seconds
+engine = create_engine(
+    "mysql+pymysql://anonymous@ensembldb.ensembl.org:3306/acanthochromis_polyacanthus_core_100_1"
+)
+
+with engine.connect() as conn:
+    # Select the analysis table, stream data in batches of 100 elements
+    query = "SELECT * FROM analysis LIMIT 1000"
+    rows = conn.execution_options(yield_per=100).exec_driver_sql(query)
+
+    pipeline = dlt.pipeline(
+        pipeline_name="from_database",
+        destination="duckdb",
+        dataset_name="acanthochromis_polyacanthus_data",
+    )
+
+    # Convert the rows into dictionaries on the fly with a map function
+    load_info = pipeline.run(
+        map(lambda row: dict(row._mapping), rows), table_name="acanthochromis_polyacanthus"
+    )
+
+print(load_info)
+```
+
+Install **pymysql** driver:
+```sh
+pip install pymysql
+```
+
+
+
-See below for other easy ways to try `dlt`.

## Why use `dlt`?

@@ -69,18 +186,17 @@ external APIs, backends or containers, scales on micro and large infra alike.
while empowering senior professionals.

## Getting started with `dlt`
-
-1. Play with the
+1. Dive into our [Getting started guide](getting-started.md) for a quick intro to the essentials of `dlt`.
+2. Play with the
[Google Colab demo](https://colab.research.google.com/drive/1NfSB1DpwbbHX9_t5vlalBTf13utwpMGx?usp=sharing).
This is the simplest way to see `dlt` in action.
-2. Run [Getting Started snippets](getting-started.md) and load data from python objects, files, data frames, databases, APIs or PDFs into any [destination](dlt-ecosystem/destinations/).
-3. Read [Pipeline Tutorial](build-a-pipeline-tutorial.md) to start building E(t)LT pipelines from ready components.
-4. We have many interesting [walkthroughs](walkthroughs/) where you create, run, customize and deploy pipelines.
+3. Read the [Tutorial](tutorial/intro) to learn how to build a pipeline that loads data from an API.
+4. Check out the [How-to guides](walkthroughs/) for recipes on common use cases for creating, running and deploying pipelines.
5. Ask us on [Slack](https://dlthub.com/community) if you have any questions about use cases or the library.

-## Become part of the `dlt` community
+## Join the `dlt` community

1. Give the library a ⭐ and check out the code on [GitHub](https://github.com/dlt-hub/dlt).
1. Ask questions and share how you use the library on
diff --git a/docs/website/docs/reference/command-line-interface.md b/docs/website/docs/reference/command-line-interface.md
index d774d5faa6..b37a3a118e 100644
--- a/docs/website/docs/reference/command-line-interface.md
+++ b/docs/website/docs/reference/command-line-interface.md
@@ -58,8 +58,7 @@ schedule into quotation marks as in the example above.
For the chess.com API example above, you could deploy it with `dlt deploy
chess.py github-action --schedule "*/30 * * * *"`.

-Follow the [Deploy a pipeline with Github Actions](../walkthroughs/deploy-a-pipeline/deploy-with-github-actions)
-walkthrough to learn more.
+Follow the guide on [how to deploy a pipeline with GitHub Actions](../walkthroughs/deploy-a-pipeline/deploy-with-github-actions) to learn more.

### airflow-composer

@@ -69,8 +68,7 @@ dlt deploy