Skip to content

Commit

Permalink
add intro examples
Browse files Browse the repository at this point in the history
  • Loading branch information
sh-rp committed Sep 6, 2024
1 parent 5ef6672 commit 0830f56
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 2 deletions.
3 changes: 2 additions & 1 deletion dlt/cli/init_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,8 @@ def init_command(
(known_sections.SOURCES, source_name),
)

if len(checked_sources) == 0:
# the intro template does not use sources, for now allow it to pass here
if len(checked_sources) == 0 and source_name != "intro":
raise CliCommandException(
"init",
f"The pipeline script {source_configuration.src_pipeline_script} is not creating or"
Expand Down
82 changes: 82 additions & 0 deletions dlt/sources/pipeline_templates/intro_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""The Intro Pipeline Template contains the example from the docs intro page"""

# mypy: disable-error-code="no-untyped-def,arg-type"

import pandas as pd
import sqlalchemy as sa

import dlt
from dlt.sources.helpers import requests


def load_api_data() -> None:
    """Fetch chess player profiles from the Chess.com API and load them into DuckDB.

    For more complex API ingestion scenarios, prefer the rest_api source.
    """
    # Pipeline that writes chess player data into a DuckDB dataset.
    pipeline = dlt.pipeline(
        pipeline_name="chess_pipeline", destination="duckdb", dataset_name="player_data"
    )

    def fetch_profile(username: str) -> dict:
        # One GET per player; fail fast on any HTTP error status.
        resp = requests.get(f"https://api.chess.com/pub/player/{username}")
        resp.raise_for_status()
        return resp.json()

    # Extract, normalize, and load the fetched profiles.
    load_info = pipeline.run(
        [fetch_profile(player) for player in ["magnuscarlsen", "rpragchess"]],
        table_name="player",
    )
    print(load_info)  # noqa: T201


def load_pandas_data() -> None:
    """Read a public OWID natural-disasters CSV with pandas and load it into DuckDB."""
    csv_url = (
        "https://raw.githubusercontent.com/owid/owid-datasets/master/datasets/"
        "Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020)/"
        "Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020).csv"
    )
    # One dict per CSV row, ready for dlt to normalize.
    records = pd.read_csv(csv_url).to_dict(orient="records")

    pipeline = dlt.pipeline(
        pipeline_name="from_csv",
        destination="duckdb",
        dataset_name="mydata",
    )
    load_info = pipeline.run(records, table_name="natural_disasters")

    print(load_info)  # noqa: T201


def load_sql_data() -> None:
    """Stream rows from a public MySQL database via SQLAlchemy and load them into DuckDB.

    For more complex database ingestion scenarios, prefer the sql_database source.

    NOTE: requires pymysql (`pip install pymysql`); reading from the public
    instance may take several seconds.
    """
    # Any SQL database supported by SQLAlchemy works; this public Rfam
    # MySQL instance provides read-only sample data.
    engine = sa.create_engine("mysql+pymysql://[email protected]:4497/Rfam")

    with engine.connect() as conn:
        # Stream the genome table in batches of 100 rows; the result must be
        # consumed while the connection is still open.
        query = "SELECT * FROM genome LIMIT 1000"
        rows = conn.execution_options(yield_per=100).exec_driver_sql(query)

        pipeline = dlt.pipeline(
            pipeline_name="from_database",
            destination="duckdb",
            dataset_name="genome_data",
        )

        # Convert each row to a plain dict lazily as the pipeline consumes it.
        load_info = pipeline.run(
            (dict(row._mapping) for row in rows), table_name="genome"
        )

        print(load_info)  # noqa: T201


if __name__ == "__main__":
    # Run every intro example in order.
    for example in (load_api_data, load_pandas_data, load_sql_data):
        example()
2 changes: 1 addition & 1 deletion tests/cli/test_init_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
CORE_SOURCES = ["filesystem", "rest_api", "sql_database"]

# we also hardcode all the templates here for testing
TEMPLATES = ["debug", "default", "arrow", "requests", "dataframe"]
TEMPLATES = ["debug", "default", "arrow", "requests", "dataframe", "intro"]

# a few verified sources we know to exist
SOME_KNOWN_VERIFIED_SOURCES = ["chess", "sql_database", "google_sheets", "pipedrive"]
Expand Down
10 changes: 10 additions & 0 deletions tests/sources/test_pipeline_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,13 @@ def test_requests_pipeline(example_name: str) -> None:
from dlt.sources.pipeline_templates import requests_pipeline

getattr(requests_pipeline, example_name)()


@pytest.mark.parametrize(
    "example_name",
    ["load_api_data", "load_sql_data", "load_pandas_data"],
)
def test_intro_pipeline(example_name: str) -> None:
    """Every example in the intro template must run end to end."""
    from dlt.sources.pipeline_templates import intro_pipeline

    example = getattr(intro_pipeline, example_name)
    example()

0 comments on commit 0830f56

Please sign in to comment.