Commit

Merge pull request #987 from dlt-hub/d#/update_pipeline_links_script
add update pipeline links script
sh-rp authored Feb 21, 2024
2 parents 4ab54d0 + 4b118b0 commit b228ce6
Showing 54 changed files with 308 additions and 49 deletions.
16 changes: 16 additions & 0 deletions .github/workflows/deploy_docs.yml
@@ -0,0 +1,16 @@
name: deploy docs

on:
schedule:
- cron: '0 2 * * *'
workflow_dispatch:

env:
NETLIFY_DOCS_PRODUCTION_DEPLOY_HOOK: ${{ secrets.NETLIFY_DOCS_PRODUCTION_DEPLOY_HOOK }}

jobs:
deploy:
runs-on: ubuntu-latest
steps:
- name: Trigger deploy hook
run: curl ${{ env.NETLIFY_DOCS_PRODUCTION_DEPLOY_HOOK }} -X POST
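For reference, the workflow step above just sends an empty POST request to the Netlify build hook. A minimal Python sketch of the same call, assuming the hook URL is exposed through the `NETLIFY_DOCS_PRODUCTION_DEPLOY_HOOK` environment variable:

```python
import os

import requests

# Read the Netlify build hook URL from the environment (populated from a repo
# secret in the workflow above); fail loudly if it is missing.
hook_url = os.environ["NETLIFY_DOCS_PRODUCTION_DEPLOY_HOOK"]

# Netlify build hooks are triggered with a plain POST; no body is required.
response = requests.post(hook_url, timeout=30)
response.raise_for_status()
print(f"Triggered docs deploy, HTTP {response.status_code}")
```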
12 changes: 7 additions & 5 deletions docs/examples/chess_production/chess.py
@@ -6,7 +6,6 @@
from dlt.common.typing import StrAny, TDataItems
from dlt.sources.helpers.requests import client


@dlt.source
def chess(
chess_url: str = dlt.config.value,
@@ -60,7 +59,6 @@ def players_games(username: Any) -> Iterator[TDataItems]:

MAX_PLAYERS = 5


def load_data_with_retry(pipeline, data):
try:
for attempt in Retrying(
@@ -70,7 +68,9 @@ def load_data_with_retry(pipeline, data):
reraise=True,
):
with attempt:
logger.info(f"Running the pipeline, attempt={attempt.retry_state.attempt_number}")
logger.info(
f"Running the pipeline, attempt={attempt.retry_state.attempt_number}"
)
load_info = pipeline.run(data)
logger.info(str(load_info))

@@ -92,7 +92,9 @@ def load_data_with_retry(pipeline, data):
# print the information on the first load package and all jobs inside
logger.info(f"First load package info: {load_info.load_packages[0]}")
# print the information on the first completed job in first load package
logger.info(f"First completed job info: {load_info.load_packages[0].jobs['completed_jobs'][0]}")
logger.info(
f"First completed job info: {load_info.load_packages[0].jobs['completed_jobs'][0]}"
)

# check for schema updates:
schema_updates = [p.schema_update for p in load_info.load_packages]
@@ -150,4 +152,4 @@ def load_data_with_retry(pipeline, data):
)
# get data for a few famous players
data = chess(chess_url="https://api.chess.com/pub/", max_players=MAX_PLAYERS)
load_data_with_retry(pipeline, data)
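The example above wraps `pipeline.run()` in tenacity's `Retrying` loop. A stripped-down sketch of the same pattern — the stop/wait policy below is illustrative, not necessarily the example's exact settings:

```python
import dlt
from tenacity import Retrying, stop_after_attempt, wait_exponential


def run_with_retry(pipeline: dlt.Pipeline, data, attempts: int = 5):
    # Retry the whole load on transient failures and re-raise the last error
    # once all attempts are exhausted.
    for attempt in Retrying(
        stop=stop_after_attempt(attempts),
        wait=wait_exponential(multiplier=1.5, min=4, max=10),
        reraise=True,
    ):
        with attempt:
            load_info = pipeline.run(data)
            # Surface failed load jobs as exceptions so they also trigger a retry.
            load_info.raise_on_failed_jobs()
    return load_info
```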
10 changes: 4 additions & 6 deletions docs/examples/connector_x_arrow/load_arrow.py
@@ -3,7 +3,6 @@
import dlt
from dlt.sources.credentials import ConnectionStringCredentials


def read_sql_x(
conn_str: ConnectionStringCredentials = dlt.secrets.value,
query: str = dlt.config.value,
@@ -15,17 +14,16 @@ def read_sql_x(
protocol="binary",
)


def genome_resource():
# create genome resource with merge on `upid` primary key
genome = dlt.resource(
name="acanthochromis_polyacanthus",
name="genome",
write_disposition="merge",
primary_key="analysis_id",
primary_key="upid",
standalone=True,
)(read_sql_x)(
"mysql://[email protected]:3306/acanthochromis_polyacanthus_core_100_1", # type: ignore[arg-type]
"SELECT * FROM analysis LIMIT 20",
"mysql://[email protected]:4497/Rfam", # type: ignore[arg-type]
"SELECT * FROM genome ORDER BY created LIMIT 1000",
)
# add incremental on created at
genome.apply_hints(incremental=dlt.sources.incremental("created"))
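The resource above gets its incremental cursor via `apply_hints(incremental=dlt.sources.incremental("created"))`. A small sketch of the same hint on a toy resource — the resource name, columns, and data are made up for illustration:

```python
import dlt


@dlt.resource(name="events", write_disposition="merge", primary_key="id")
def events():
    # Toy rows standing in for query results; "created" acts as the cursor column.
    yield {"id": 1, "created": "2024-02-01T00:00:00Z"}
    yield {"id": 2, "created": "2024-02-02T00:00:00Z"}


# On subsequent runs only rows with a "created" value above the stored
# watermark are extracted.
events.apply_hints(incremental=dlt.sources.incremental("created"))
```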
5 changes: 1 addition & 4 deletions docs/examples/google_sheets/google_sheets.py
@@ -9,15 +9,13 @@
)
from dlt.common.typing import DictStrAny, StrAny


def _initialize_sheets(
credentials: Union[GcpOAuthCredentials, GcpServiceAccountCredentials]
) -> Any:
# Build the service object.
service = build("sheets", "v4", credentials=credentials.to_native_credentials())
return service


@dlt.source
def google_spreadsheet(
spreadsheet_id: str,
@@ -57,7 +55,6 @@ def get_sheet(sheet_name: str) -> Iterator[DictStrAny]:
for name in sheet_names
]


if __name__ == "__main__":
pipeline = dlt.pipeline(destination="duckdb")
# see example.secrets.toml to where to put credentials
@@ -70,4 +67,4 @@ def get_sheet(sheet_name: str) -> Iterator[DictStrAny]:
sheet_names=range_names,
)
)
print(info)
8 changes: 4 additions & 4 deletions docs/examples/incremental_loading/zendesk.py
@@ -6,11 +6,12 @@
from dlt.common.typing import TAnyDateTime
from dlt.sources.helpers.requests import client


@dlt.source(max_table_nesting=2)
def zendesk_support(
credentials: Dict[str, str] = dlt.secrets.value,
start_date: Optional[TAnyDateTime] = pendulum.datetime(year=2000, month=1, day=1), # noqa: B008
start_date: Optional[TAnyDateTime] = pendulum.datetime( # noqa: B008
year=2000, month=1, day=1
),
end_date: Optional[TAnyDateTime] = None,
):
"""
@@ -112,12 +113,11 @@ def get_pages(
if not response_json["end_of_stream"]:
get_url = response_json["next_page"]


if __name__ == "__main__":
# create dlt pipeline
pipeline = dlt.pipeline(
pipeline_name="zendesk", destination="duckdb", dataset_name="zendesk_data"
)

load_info = pipeline.run(zendesk_support())
print(load_info)
2 changes: 0 additions & 2 deletions docs/examples/nested_data/nested_data.py
@@ -13,7 +13,6 @@

CHUNK_SIZE = 10000


# You can limit how deep dlt goes when generating child tables.
# By default, the library will descend and generate child tables
# for all nested lists, without a limit.
@@ -82,7 +81,6 @@ def load_documents(self) -> Iterator[TDataItem]:
while docs_slice := list(islice(cursor, CHUNK_SIZE)):
yield map_nested_in_place(convert_mongo_objs, docs_slice)


def convert_mongo_objs(value: Any) -> Any:
if isinstance(value, (ObjectId, Decimal128)):
return str(value)
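The comment block in this example notes that the depth of generated child tables can be capped. A minimal sketch of that setting using the `max_table_nesting` argument seen elsewhere in these examples — the source and data below are invented for illustration:

```python
import dlt


# With max_table_nesting=1, lists nested deeper than one level are kept as
# JSON instead of being unpacked into further child tables.
@dlt.source(max_table_nesting=1)
def movies_source():
    @dlt.resource(name="movies")
    def movies():
        yield {
            "title": "Arrival",
            "cast": [{"name": "Amy Adams", "awards": [{"year": 2017}]}],
        }

    return movies


pipeline = dlt.pipeline(pipeline_name="nesting_demo", destination="duckdb")
print(pipeline.run(movies_source()))
```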
5 changes: 1 addition & 4 deletions docs/examples/pdf_to_weaviate/pdf_to_weaviate.py
@@ -4,7 +4,6 @@
from dlt.destinations.impl.weaviate import weaviate_adapter
from PyPDF2 import PdfReader


@dlt.resource(selected=False)
def list_files(folder_path: str):
folder_path = os.path.abspath(folder_path)
@@ -16,7 +15,6 @@ def list_files(folder_path: str):
"mtime": os.path.getmtime(file_path),
}


@dlt.transformer(primary_key="page_id", write_disposition="merge")
def pdf_to_text(file_item, separate_pages: bool = False):
if not separate_pages:
@@ -30,7 +28,6 @@ def pdf_to_text(file_item, separate_pages: bool = False):
page_item["page_id"] = file_item["file_name"] + "_" + str(page_no)
yield page_item


pipeline = dlt.pipeline(pipeline_name="pdf_to_text", destination="weaviate")

# this constructs a simple pipeline that: (1) reads files from "invoices" folder (2) filters only those ending with ".pdf"
@@ -54,4 +51,4 @@ def pdf_to_text(file_item, separate_pages: bool = False):

client = weaviate.Client("http://localhost:8080")
# get text of all the invoices in InvoiceText class we just created above
print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do())
9 changes: 4 additions & 5 deletions docs/examples/qdrant_zendesk/qdrant.py
@@ -10,12 +10,13 @@

from dlt.common.configuration.inject import with_config


# function from: https://github.com/dlt-hub/verified-sources/tree/master/sources/zendesk
@dlt.source(max_table_nesting=2)
def zendesk_support(
credentials: Dict[str, str] = dlt.secrets.value,
start_date: Optional[TAnyDateTime] = pendulum.datetime(year=2000, month=1, day=1), # noqa: B008
start_date: Optional[TAnyDateTime] = pendulum.datetime( # noqa: B008
year=2000, month=1, day=1
),
end_date: Optional[TAnyDateTime] = None,
):
"""
@@ -79,15 +80,13 @@ def _parse_date_or_none(value: Optional[str]) -> Optional[pendulum.DateTime]:
return None
return ensure_pendulum_datetime(value)


# modify dates to return datetime objects instead
def _fix_date(ticket):
ticket["updated_at"] = _parse_date_or_none(ticket["updated_at"])
ticket["created_at"] = _parse_date_or_none(ticket["created_at"])
ticket["due_at"] = _parse_date_or_none(ticket["due_at"])
return ticket


# function from: https://github.com/dlt-hub/verified-sources/tree/master/sources/zendesk
def get_pages(
url: str,
@@ -128,7 +127,6 @@ def get_pages(
if not response_json["end_of_stream"]:
get_url = response_json["next_page"]


if __name__ == "__main__":
# create a pipeline with an appropriate name
pipeline = dlt.pipeline(
@@ -148,6 +146,7 @@ def get_pages(

print(load_info)


# running the Qdrant client to connect to your Qdrant database

@with_config(sections=("destination", "qdrant", "credentials"))
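The snippet above decorates a helper with `@with_config(sections=("destination", "qdrant", "credentials"))` so that dlt injects the Qdrant credentials from config and secrets. A hypothetical sketch of such a helper — the parameter names and the `QdrantClient` construction are assumptions, not taken from the example:

```python
import dlt
from dlt.common.configuration.inject import with_config
from qdrant_client import QdrantClient


@with_config(sections=("destination", "qdrant", "credentials"))
def get_qdrant_client(
    location: str = dlt.config.value,
    api_key: str = dlt.secrets.value,
) -> QdrantClient:
    # location and api_key are resolved from [destination.qdrant.credentials]
    # in secrets.toml / config.toml (or the matching environment variables).
    return QdrantClient(url=location, api_key=api_key)
```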
4 changes: 1 addition & 3 deletions docs/examples/transformers/pokemon.py
@@ -1,7 +1,6 @@
import dlt
from dlt.sources.helpers import requests


@dlt.source(max_table_nesting=2)
def source(pokemon_api_url: str):
""""""
@@ -47,7 +46,6 @@ def species(pokemon_details):

return (pokemon_list | pokemon, pokemon_list | pokemon | species)


if __name__ == "__main__":
# build duck db pipeline
pipeline = dlt.pipeline(
@@ -56,4 +54,4 @@ def species(pokemon_details):

# the pokemon_list resource does not need to be loaded
load_info = pipeline.run(source("https://pokeapi.co/api/v2/pokemon"))
print(load_info)
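The pokemon example returns piped resources such as `pokemon_list | pokemon | species`. A tiny sketch of that piping mechanism with made-up resources:

```python
import dlt


@dlt.resource
def numbers():
    yield from [1, 2, 3]


@dlt.transformer
def squared(n):
    # Receives the items yielded by whatever resource is piped into it.
    yield n * n


# Piping binds the transformer to the resource; iterating evaluates the chain.
print(list(numbers | squared))  # [1, 4, 9]
```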
12 changes: 11 additions & 1 deletion docs/website/docs/dlt-ecosystem/destinations/athena.md
@@ -160,4 +160,14 @@ aws_data_catalog="awsdatacatalog"
You can choose the following file formats:
* [parquet](../file-formats/parquet.md) is used by default

------
<!--@@@DLT_SNIPPET_START tuba::athena-->
## Additional Setup guides

- [Load data from Chess.com to AWS Athena in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-athena)
- [Load data from Notion to AWS Athena in python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-athena)
- [Load data from HubSpot to AWS Athena in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-athena)
- [Load data from GitHub to AWS Athena in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-athena)
- [Load data from Google Analytics to AWS Athena in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-athena)
- [Load data from Google Sheets to AWS Athena in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-athena)
- [Load data from Stripe to AWS Athena in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-athena)
<!--@@@DLT_SNIPPET_END tuba::athena-->
12 changes: 12 additions & 0 deletions docs/website/docs/dlt-ecosystem/destinations/bigquery.md
@@ -173,3 +173,15 @@ This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-b

### Syncing of `dlt` state
This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination)

<!--@@@DLT_SNIPPET_START tuba::bigquery-->
## Additional Setup guides

- [Load data from Notion to BigQuery in python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-bigquery)
- [Load data from Google Analytics to BigQuery in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-bigquery)
- [Load data from Chess.com to BigQuery in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-bigquery)
- [Load data from HubSpot to BigQuery in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-bigquery)
- [Load data from GitHub to BigQuery in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-bigquery)
- [Load data from Google Sheets to BigQuery in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-bigquery)
- [Load data from Stripe to BigQuery in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-bigquery)
<!--@@@DLT_SNIPPET_END tuba::bigquery-->
12 changes: 12 additions & 0 deletions docs/website/docs/dlt-ecosystem/destinations/databricks.md
@@ -184,3 +184,15 @@ This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-d

### Syncing of `dlt` state
This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination).

<!--@@@DLT_SNIPPET_START tuba::databricks-->
## Additional Setup guides

- [Load data from GitHub to Databricks in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-databricks)
- [Load data from Notion to Databricks in python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-databricks)
- [Load data from Stripe to Databricks in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-databricks)
- [Load data from HubSpot to Databricks in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-databricks)
- [Load data from Google Analytics to Databricks in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-databricks)
- [Load data from Google Sheets to Databricks in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-databricks)
- [Load data from Chess.com to Databricks in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-databricks)
<!--@@@DLT_SNIPPET_END tuba::databricks-->
12 changes: 12 additions & 0 deletions docs/website/docs/dlt-ecosystem/destinations/duckdb.md
@@ -113,3 +113,15 @@ This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-d

### Syncing of `dlt` state
This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination)

<!--@@@DLT_SNIPPET_START tuba::duckdb-->
## Additional Setup guides

- [Load data from Google Analytics to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-duckdb)
- [Load data from Google Sheets to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-duckdb)
- [Load data from Stripe to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-duckdb)
- [Load data from Notion to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-duckdb)
- [Load data from Chess.com to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-duckdb)
- [Load data from HubSpot to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-duckdb)
- [Load data from GitHub to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-duckdb)
<!--@@@DLT_SNIPPET_END tuba::duckdb-->
3 changes: 3 additions & 0 deletions docs/website/docs/dlt-ecosystem/destinations/filesystem.md
@@ -234,3 +234,6 @@ You can choose the following file formats:
## Syncing of `dlt` state
This destination does not support restoring the `dlt` state. You can change that by requesting the [feature](https://github.com/dlt-hub/dlt/issues/new/choose) or contributing to the core library 😄
You can however easily [backup and restore the pipeline working folder](https://gist.github.com/rudolfix/ee6e16d8671f26ac4b9ffc915ad24b6e) - reusing the bucket and credentials used to store files.

<!--@@@DLT_SNIPPET_START tuba::filesystem-->
<!--@@@DLT_SNIPPET_END tuba::filesystem-->
3 changes: 3 additions & 0 deletions docs/website/docs/dlt-ecosystem/destinations/motherduck.md
@@ -99,3 +99,6 @@ My observation is that if you write a lot of data into the database then close t

### Invalid Input Error: Initialization function "motherduck_init" from file
Use `duckdb 0.8.1` or above.

<!--@@@DLT_SNIPPET_START tuba::motherduck-->
<!--@@@DLT_SNIPPET_END tuba::motherduck-->
11 changes: 11 additions & 0 deletions docs/website/docs/dlt-ecosystem/destinations/mssql.md
@@ -110,3 +110,14 @@ destination.mssql.credentials="mssql://loader:<password>@loader.database.windows
### dbt support
No dbt support yet

<!--@@@DLT_SNIPPET_START tuba::mssql-->
## Additional Setup guides

- [Load data from Stripe to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-mssql)
- [Load data from Google Analytics to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-mssql)
- [Load data from Google Sheets to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-mssql)
- [Load data from Chess.com to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-mssql)
- [Load data from GitHub to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-mssql)
- [Load data from Notion to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-mssql)
- [Load data from HubSpot to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-mssql)
<!--@@@DLT_SNIPPET_END tuba::mssql-->
12 changes: 12 additions & 0 deletions docs/website/docs/dlt-ecosystem/destinations/postgres.md
@@ -96,3 +96,15 @@ This destination [integrates with dbt](../transformations/dbt/dbt.md) via dbt-po

### Syncing of `dlt` state
This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination)

<!--@@@DLT_SNIPPET_START tuba::postgres-->
## Additional Setup guides

- [Load data from HubSpot to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-postgres)
- [Load data from GitHub to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-postgres)
- [Load data from Chess.com to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-postgres)
- [Load data from Notion to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-postgres)
- [Load data from Google Analytics to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-postgres)
- [Load data from Google Sheets to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-postgres)
- [Load data from Stripe to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-postgres)
<!--@@@DLT_SNIPPET_END tuba::postgres-->
3 changes: 3 additions & 0 deletions docs/website/docs/dlt-ecosystem/destinations/qdrant.md
@@ -220,3 +220,6 @@ You can find the setup instructions to run Qdrant [here](https://qdrant.tech/doc
### Syncing of `dlt` state

Qdrant destination supports syncing of the `dlt` state.

<!--@@@DLT_SNIPPET_START tuba::qdrant-->
<!--@@@DLT_SNIPPET_END tuba::qdrant-->
(The remaining changed files in this commit are not shown here.)