Merge branch 'devel' into airflow_clear

IlyaFaer committed Jul 5, 2024
2 parents 4694401 + 60c7327 · commit b3bf58f
Showing 362 changed files with 13,311 additions and 4,924 deletions.
81 changes: 81 additions & 0 deletions .github/workflows/test_destination_lancedb.yml
@@ -0,0 +1,81 @@
name: dest | lancedb

on:
pull_request:
branches:
- master
- devel
workflow_dispatch:
schedule:
- cron: '0 2 * * *'

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

env:
DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }}

RUNTIME__SENTRY_DSN: https://[email protected]/4504819859914752
RUNTIME__LOG_LEVEL: ERROR
RUNTIME__DLTHUB_TELEMETRY_ENDPOINT: ${{ secrets.RUNTIME__DLTHUB_TELEMETRY_ENDPOINT }}

ACTIVE_DESTINATIONS: "[\"lancedb\"]"
ALL_FILESYSTEM_DRIVERS: "[\"memory\"]"

jobs:
get_docs_changes:
name: docs changes
uses: ./.github/workflows/get_docs_changes.yml
if: ${{ !github.event.pull_request.head.repo.fork || contains(github.event.pull_request.labels.*.name, 'ci from fork')}}

run_loader:
name: dest | lancedb tests
needs: get_docs_changes
if: needs.get_docs_changes.outputs.changes_outside_docs == 'true'
defaults:
run:
shell: bash
runs-on: "ubuntu-latest"

steps:
- name: Check out
uses: actions/checkout@master

- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: "3.11.x"

- name: Install Poetry
uses: snok/[email protected]
with:
virtualenvs-create: true
virtualenvs-in-project: true
installer-parallel: true

- name: Load cached venv
id: cached-poetry-dependencies
uses: actions/cache@v3
with:
path: .venv
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml

- name: Install dependencies
run: poetry install --no-interaction -E lancedb -E parquet --with sentry-sdk --with pipeline

- name: Install embedding provider dependencies
run: poetry run pip install openai

- run: |
poetry run pytest tests/load -m "essential"
name: Run essential tests Linux
if: ${{ ! (contains(github.event.pull_request.labels.*.name, 'ci full') || github.event_name == 'schedule')}}
- run: |
poetry run pytest tests/load
name: Run all tests Linux
if: ${{ contains(github.event.pull_request.labels.*.name, 'ci full') || github.event_name == 'schedule'}}
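For reference, the workflow above ends by running `tests/load` against the new LanceDB destination. A minimal sketch of the kind of pipeline such tests exercise, assuming the standard `dlt.pipeline` API; the pipeline, dataset, and table names are illustrative, not taken from the test suite:

```python
import dlt

# Hypothetical smoke test: load a couple of rows into the lancedb destination
# (the one enabled above via ACTIVE_DESTINATIONS: ["lancedb"]).
pipeline = dlt.pipeline(
    pipeline_name="lancedb_smoke",  # illustrative name
    destination="lancedb",
    dataset_name="lancedb_data",
)
info = pipeline.run(
    [{"id": 1, "text": "hello"}, {"id": 2, "text": "world"}],
    table_name="items",
)
print(info)
```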
24 changes: 23 additions & 1 deletion .github/workflows/test_doc_snippets.yml
@@ -21,6 +21,8 @@ env:

# Slack hook for chess in production example
RUNTIME__SLACK_INCOMING_HOOK: ${{ secrets.RUNTIME__SLACK_INCOMING_HOOK }}
+ # Path to local qdrant database
+ DESTINATION__QDRANT__CREDENTIALS__PATH: zendesk.qdb
# detect if the workflow is executed in a repo fork
IS_FORK: ${{ github.event.pull_request.head.repo.fork }}

@@ -32,6 +34,26 @@ jobs:
# Do not run on forks, unless allowed, secrets are used here
if: ${{ !github.event.pull_request.head.repo.fork || contains(github.event.pull_request.labels.*.name, 'ci from fork')}}

+ # Service containers to run with `container-job`
+ services:
+   # Label used to access the service container
+   postgres:
+     # Docker Hub image
+     image: postgres
+     # Provide the password for postgres
+     env:
+       POSTGRES_DB: dlt_data
+       POSTGRES_USER: loader
+       POSTGRES_PASSWORD: loader
+     ports:
+       - 5432:5432
+     # Set health checks to wait until postgres has started
+     options: >-
+       --health-cmd pg_isready
+       --health-interval 10s
+       --health-timeout 5s
+       --health-retries 5
steps:

- name: Check out
@@ -61,7 +83,7 @@

- name: Install dependencies
# if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
- run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres --with docs,sentry-sdk --without airflow
+ run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres -E lancedb --with docs,sentry-sdk --without airflow

- name: create secrets.toml for examples
run: pwd && echo "$DLT_SECRETS_TOML" > docs/examples/.dlt/secrets.toml
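The Postgres service added above is reachable from the job at localhost:5432 with the credentials defined in its env block. A sketch of how a docs snippet could target it, assuming dlt's usual env-var credential resolution; the connection string mirrors the one used in test_local_destinations.yml below, while the pipeline and dataset names are illustrative:

```python
import os

import dlt

# Credentials for the service container defined above
# (database dlt_data, user/password loader).
os.environ["DESTINATION__POSTGRES__CREDENTIALS"] = (
    "postgresql://loader:loader@localhost:5432/dlt_data"
)

pipeline = dlt.pipeline(
    pipeline_name="snippets_pg",  # illustrative name
    destination="postgres",
    dataset_name="snippets_data",
)
pipeline.run([{"id": 1, "note": "hello"}], table_name="demo")
```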
10 changes: 8 additions & 2 deletions .github/workflows/test_local_destinations.yml
@@ -21,7 +21,7 @@ env:
RUNTIME__SENTRY_DSN: https://[email protected]/4504819859914752
RUNTIME__LOG_LEVEL: ERROR
RUNTIME__DLTHUB_TELEMETRY_ENDPOINT: ${{ secrets.RUNTIME__DLTHUB_TELEMETRY_ENDPOINT }}
- ACTIVE_DESTINATIONS: "[\"duckdb\", \"postgres\", \"filesystem\", \"weaviate\"]"
+ ACTIVE_DESTINATIONS: "[\"duckdb\", \"postgres\", \"filesystem\", \"weaviate\", \"qdrant\"]"
ALL_FILESYSTEM_DRIVERS: "[\"memory\", \"file\"]"

DESTINATION__WEAVIATE__VECTORIZER: text2vec-contextionary
@@ -63,6 +63,11 @@ jobs:
--health-timeout 5s
--health-retries 5
+ qdrant:
+   image: qdrant/qdrant:v1.8.4
+   ports:
+     - 6333:6333

steps:
- name: Check out
uses: actions/checkout@master
@@ -90,7 +95,7 @@
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations

- name: Install dependencies
- run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate --with sentry-sdk --with pipeline -E deltalake
+ run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant --with sentry-sdk --with pipeline -E deltalake

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
@@ -100,6 +105,7 @@ jobs:
name: Run tests Linux
env:
DESTINATION__POSTGRES__CREDENTIALS: postgresql://loader:loader@localhost:5432/dlt_data
+ DESTINATION__QDRANT__CREDENTIALS__location: http://localhost:6333

- name: Stop weaviate
if: always()
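The new `DESTINATION__QDRANT__CREDENTIALS__location` variable follows dlt's config naming convention: double underscores separate sections, so it maps to `location` under `destination.qdrant.credentials`. A sketch of the same wiring done in-process, assuming that convention; the pipeline and dataset names are illustrative:

```python
import os

import dlt

# Equivalent of the workflow env var: points the qdrant destination at the
# service container published on port 6333 above.
os.environ["DESTINATION__QDRANT__CREDENTIALS__LOCATION"] = "http://localhost:6333"

pipeline = dlt.pipeline(
    pipeline_name="qdrant_smoke",  # illustrative name
    destination="qdrant",
    dataset_name="qdrant_data",
)
pipeline.run([{"id": 1, "title": "hello"}], table_name="docs")
```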
4 changes: 2 additions & 2 deletions Makefile
@@ -67,9 +67,9 @@ lint-and-test-snippets:
cd docs/website/docs && poetry run pytest --ignore=node_modules

lint-and-test-examples:
- poetry run mypy --config-file mypy.ini docs/examples
- poetry run flake8 --max-line-length=200 docs/examples
cd docs/tools && poetry run python prepare_examples_tests.py
+ poetry run flake8 --max-line-length=200 docs/examples
+ poetry run mypy --config-file mypy.ini docs/examples
cd docs/examples && poetry run pytest


10 changes: 1 addition & 9 deletions README.md
@@ -30,20 +30,12 @@ Be it a Google Colab notebook, AWS Lambda function, an Airflow DAG, your local laptop…

dlt supports Python 3.8+.

- **pip:**
```sh
pip install dlt
```

- **pixi:**
- ```sh
- pixi add dlt
- ```
+ More options: [Install via Conda or Pixi](https://dlthub.com/docs/reference/installation#install-dlt-via-pixi-and-conda)

- **conda:**
- ```sh
- conda install -c conda-forge dlt
- ```

## Quick Start

2 changes: 1 addition & 1 deletion dlt/cli/_dlt.py
@@ -164,7 +164,7 @@ def schema_command_wrapper(file_path: str, format_: str, remove_defaults: bool)
schema_str = json.dumps(s.to_dict(remove_defaults=remove_defaults), pretty=True)
else:
schema_str = s.to_pretty_yaml(remove_defaults=remove_defaults)
- print(schema_str)
+ fmt.echo(schema_str)
return 0


44 changes: 41 additions & 3 deletions dlt/cli/pipeline_command.py
@@ -8,7 +8,12 @@
from dlt.common.destination.reference import TDestinationReferenceArg
from dlt.common.runners import Venv
from dlt.common.runners.stdout import iter_stdout
- from dlt.common.schema.utils import group_tables_by_resource, remove_defaults
+ from dlt.common.schema.utils import (
+     group_tables_by_resource,
+     has_table_seen_data,
+     is_complete_column,
+     remove_defaults,
+ )
from dlt.common.storages import FileStorage, PackageStorage
from dlt.pipeline.helpers import DropCommand
from dlt.pipeline.exceptions import CannotRestorePipelineException
@@ -180,6 +185,35 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]:
fmt.bold(str(res_state_slots)),
)
)
+ if verbosity > 0:
+     for table in tables:
+         incomplete_columns = len(
+             [
+                 col
+                 for col in table["columns"].values()
+                 if not is_complete_column(col)
+             ]
+         )
+         fmt.echo(
+             "\t%s table %s column(s) %s %s"
+             % (
+                 fmt.bold(table["name"]),
+                 fmt.bold(str(len(table["columns"]))),
+                 (
+                     fmt.style("received data", fg="green")
+                     if has_table_seen_data(table)
+                     else fmt.style("not yet received data", fg="yellow")
+                 ),
+                 (
+                     fmt.style(
+                         f"{incomplete_columns} incomplete column(s)",
+                         fg="yellow",
+                     )
+                     if incomplete_columns > 0
+                     else ""
+                 ),
+             )
+         )
fmt.echo()
fmt.echo("Working dir content:")
_display_pending_packages()
@@ -272,7 +306,7 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]:
fmt.echo(package_info.asstr(verbosity))
if len(package_info.schema_update) > 0:
if verbosity == 0:
- print("Add -v option to see schema update. Note that it could be large.")
+ fmt.echo("Add -v option to see schema update. Note that it could be large.")
else:
tables = remove_defaults({"tables": package_info.schema_update}) # type: ignore
fmt.echo(fmt.bold("Schema update:"))
@@ -316,7 +350,7 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]:
fmt.echo(
"About to drop the following data in dataset %s in destination %s:"
% (
- fmt.bold(drop.info["dataset_name"]),
+ fmt.bold(p.dataset_name),
fmt.bold(p.destination.destination_name),
)
)
@@ -329,6 +363,10 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]:
)
)
fmt.echo("%s: %s" % (fmt.style("Table(s) to drop", fg="green"), drop.info["tables"]))
+ fmt.echo(
+     "%s: %s"
+     % (fmt.style("\twith data in destination", fg="green"), drop.info["tables_with_data"])
+ )
fmt.echo(
"%s: %s"
% (
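The new verbose output hinges on the two predicates now imported from `dlt.common.schema.utils`. A hedged sketch of their semantics, inferred from their names and dlt's schema hints; the real implementations may differ in detail:

```python
from typing import Any, Dict

def is_complete_column(col: Dict[str, Any]) -> bool:
    # A column counts as complete once both its name and data type are known;
    # columns inferred from sparse data may lack a data_type for a while.
    return bool(col.get("name")) and bool(col.get("data_type"))

def has_table_seen_data(table: Dict[str, Any]) -> bool:
    # dlt's normalizer marks tables that have received rows with an
    # "x-normalizer" hint; its absence means no load has produced data yet.
    return bool(table.get("x-normalizer", {}).get("seen-data"))
```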
24 changes: 12 additions & 12 deletions dlt/common/configuration/paths.py
@@ -1,16 +1,16 @@
import os
import tempfile

- # dlt settings folder
- DOT_DLT = ".dlt"
+ from dlt.common import known_env


- # dlt data dir is by default not set, see get_dlt_data_dir for details
- DLT_DATA_DIR: str = None
+ # dlt settings folder
+ DOT_DLT = os.environ.get(known_env.DLT_CONFIG_FOLDER, ".dlt")


def get_dlt_project_dir() -> str:
"""The dlt project dir is the current working directory but may be overridden by DLT_PROJECT_DIR env variable."""
-     return os.environ.get("DLT_PROJECT_DIR", ".")
+     return os.environ.get(known_env.DLT_PROJECT_DIR, ".")


def get_dlt_settings_dir() -> str:
@@ -27,14 +27,14 @@ def make_dlt_settings_path(path: str) -> str:


def get_dlt_data_dir() -> str:
"""Gets default directory where pipelines' data will be stored
1. in user home directory: ~/.dlt/
2. if current user is root: in /var/dlt/
3. if current user does not have a home directory: in /tmp/dlt/
4. if DLT_DATA_DIR is set in env then it is used
"""Gets default directory where pipelines' data (working directories) will be stored
1. if DLT_DATA_DIR is set in env then it is used
2. in user home directory: ~/.dlt/
3. if current user is root: in /var/dlt/
4. if current user does not have a home directory: in /tmp/dlt/
"""
if "DLT_DATA_DIR" in os.environ:
return os.environ["DLT_DATA_DIR"]
if known_env.DLT_DATA_DIR in os.environ:
return os.environ[known_env.DLT_DATA_DIR]

# geteuid not available on Windows
if hasattr(os, "geteuid") and os.geteuid() == 0:
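The reordered docstring now matches the code: the `DLT_DATA_DIR` env var, documented as checked first, really is the first branch. A small usage sketch, assuming `known_env.DLT_DATA_DIR` is simply the string "DLT_DATA_DIR"; the path is illustrative:

```python
import os

from dlt.common.configuration.paths import get_dlt_data_dir

os.environ["DLT_DATA_DIR"] = "/srv/dlt"  # illustrative path
assert get_dlt_data_dir() == "/srv/dlt"  # env var wins over all defaults

del os.environ["DLT_DATA_DIR"]
# Falls back to ~/.dlt (or /var/dlt for root, /tmp/dlt without a home dir).
print(get_dlt_data_dir())
```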
4 changes: 3 additions & 1 deletion dlt/common/configuration/specs/run_configuration.py
@@ -17,7 +17,9 @@ class RunConfiguration(BaseConfiguration):
dlthub_telemetry: bool = True # enable or disable dlthub telemetry
dlthub_telemetry_endpoint: Optional[str] = "https://telemetry.scalevector.ai"
dlthub_telemetry_segment_write_key: Optional[str] = None
- log_format: str = "{asctime}|[{levelname:<21}]|{process}|{thread}|{name}|{filename}|{funcName}:{lineno}|{message}"
+ log_format: str = (
+     "{asctime}|[{levelname}]|{process}|{thread}|{name}|{filename}|{funcName}:{lineno}|{message}"
+ )
log_level: str = "WARNING"
request_timeout: float = 60
"""Timeout for http requests"""
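The default `log_format` uses `str.format`-style fields (note `{levelname}` rather than `%(levelname)s`), which Python's `logging` accepts with `style="{"`. A sketch of wiring the new default up by hand; the logger name is illustrative:

```python
import logging

fmt = "{asctime}|[{levelname}]|{process}|{thread}|{name}|{filename}|{funcName}:{lineno}|{message}"

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(fmt, style="{"))  # "{" selects str.format-style fields

logger = logging.getLogger("dlt_demo")  # illustrative name
logger.addHandler(handler)
logger.setLevel(logging.WARNING)
logger.warning("logging configured with the new default format")
```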
6 changes: 4 additions & 2 deletions dlt/common/data_writers/__init__.py
@@ -3,6 +3,7 @@
DataWriterMetrics,
TDataItemFormat,
FileWriterSpec,
+     create_import_spec,
resolve_best_writer_spec,
get_best_writer_spec,
is_native_writer,
…
from dlt.common.data_writers.escape import (
escape_redshift_literal,
escape_redshift_identifier,
-     escape_bigquery_identifier,
+     escape_hive_identifier,
)

__all__ = [
"DataWriter",
"FileWriterSpec",
"create_import_spec",
"resolve_best_writer_spec",
"get_best_writer_spec",
"is_native_writer",
…
"new_file_id",
"escape_redshift_literal",
"escape_redshift_identifier",
"escape_bigquery_identifier",
"escape_hive_identifier",
]
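The rename from `escape_bigquery_identifier` to `escape_hive_identifier` reflects that BigQuery and Hive share the same backtick-quoted identifier style, so one escaper can serve both. A hedged sketch of that style; the escaping rule shown is illustrative, and the real implementation lives in `dlt.common.data_writers.escape`:

```python
def escape_hive_identifier(name: str) -> str:
    # Backtick-quote the identifier, escaping any embedded backticks.
    # Illustrative rule; the real escaper may handle more edge cases.
    return "`" + name.replace("`", "``") + "`"

print(escape_hive_identifier("my table"))  # `my table`
```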