Skip to content

Commit

Permalink
Accept :memory: mode for credentials parameter in duckdb factory (#1297)
Browse files Browse the repository at this point in the history
* Accept :memory: mode for credentials parameter in duckdb factory

* Check instance of native_value and add check in test for in-memory mode

* Add a separate test for in-memory mode when using duckdb

* Fix failing test, bind dataset_name to configuration

* Adjust test, take ownership when :memory: has been passed

* Revert changes

* Adjust documentation of duckdb

* Add a new exception for duckdb

* Adjust error message

* Remove backticks

* Fix typo

* Update docs

* Update duckdb docs, merge examples

* Remove the mention of :memory: from docstrings

* Adjust the message in exception

* Catch :memory: in DuckDbCredentials.on_resolve

* Update tests

* Rename exception

* Update docs

* Use Destination.from_reference in code snippet

* Add one more test for Destination.from_reference and update docs

* Format code and ignore mypy error

* Use standard way to initialize destination via factory

* Preserve environment

* Cleanup duckdb docs code snippets

* Adjust memo about :pipeline: connection string

* Show tables from in-memory schema

* Explicitly mention python script instead of pipeline

* Fix typo

* Reword the description of :pipeline: value

* Update docs/website/docs/dlt-ecosystem/destinations/duckdb.md

Co-authored-by: VioletM <[email protected]>

* Re-arrange text blocks

* Add example on how to use :pipeline: connection string

* Fix typo

* Adjust warning message

---------

Co-authored-by: VioletM <[email protected]>
  • Loading branch information
sultaniman and VioletM authored May 7, 2024
1 parent e329ab9 commit 30f0416
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 10 deletions.
7 changes: 6 additions & 1 deletion dlt/destinations/impl/duckdb/configuration.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import os
import dataclasses
import threading
from pathvalidate import is_valid_filepath

from typing import Any, ClassVar, Dict, Final, List, Optional, Tuple, Type, Union

from pathvalidate import is_valid_filepath
from dlt.common import logger
from dlt.common.configuration import configspec
from dlt.common.configuration.specs import ConnectionStringCredentials
from dlt.common.configuration.specs.exceptions import InvalidConnectionString
from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration
from dlt.common.typing import TSecretValue
from dlt.destinations.impl.duckdb.exceptions import InvalidInMemoryDuckdbCredentials

try:
from duckdb import DuckDBPyConnection
Expand Down Expand Up @@ -117,6 +119,9 @@ def is_partial(self) -> bool:
return self.database == ":pipeline:"

def on_resolved(self) -> None:
if isinstance(self.database, str) and self.database == ":memory:":
raise InvalidInMemoryDuckdbCredentials()

# do not set any paths for external database
if self.database == ":external:":
return
Expand Down
11 changes: 11 additions & 0 deletions dlt/destinations/impl/duckdb/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from dlt.common.destination.exceptions import DestinationTerminalException


class InvalidInMemoryDuckdbCredentials(DestinationTerminalException):
    """Signals that the `:memory:` string was given as duckdb credentials.

    The error message tells the user to open the in-memory connection
    themselves and pass the connection object to the destination factory.
    """

    def __init__(self) -> None:
        # The message embeds a copy-pasteable snippet; keep its parentheses
        # balanced so the example is valid Python.
        super().__init__(
            "To use in-memory instance of duckdb, "
            "please instantiate it first and then pass to destination factory\n"
            '\nconn = duckdb.connect(":memory:")\n'
            'dlt.pipeline(pipeline_name="...", destination=dlt.destinations.duckdb(conn))'
        )
2 changes: 1 addition & 1 deletion dlt/destinations/impl/duckdb/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(
Args:
credentials: Credentials to connect to the duckdb database. Can be an instance of `DuckDbCredentials` or
a path to a database file. Use `:memory:` to create an in-memory database or :pipeline: to create a duckdb
a path to a database file. Use :pipeline: to create a duckdb
in the working folder of the pipeline
create_indexes: Should unique indexes be created, defaults to False
**kwargs: Additional arguments passed to the destination config
Expand Down
54 changes: 48 additions & 6 deletions docs/website/docs/dlt-ecosystem/destinations/duckdb.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,30 +102,72 @@ p = dlt.pipeline(
)
```

The destination accepts a `duckdb` connection instance via `credentials`, so you can also open a database connection yourself and pass it to `dlt` to use. `:memory:` databases are supported.
The destination accepts a `duckdb` connection instance via `credentials`, so you can also open a database connection yourself and pass it to `dlt` to use.

```py
import duckdb

db = duckdb.connect()
p = dlt.pipeline(
pipeline_name='chess',
pipeline_name="chess",
destination=dlt.destinations.duckdb(db),
dataset_name='chess_data',
dataset_name="chess_data",
full_refresh=False,
)

# Or if you would like to use in-memory duckdb instance
db = duckdb.connect(":memory:")
p = pipeline_one = dlt.pipeline(
pipeline_name="in_memory_pipeline",
destination=dlt.destinations.duckdb(db),
dataset_name="chess_data",
)

print(db.sql("DESCRIBE;"))

# Example output
# ┌──────────┬───────────────┬─────────────────────┬──────────────────────┬───────────────────────┬───────────┐
# │ database │ schema │ name │ column_names │ column_types │ temporary │
# │ varchar │ varchar │ varchar │ varchar[] │ varchar[] │ boolean │
# ├──────────┼───────────────┼─────────────────────┼──────────────────────┼───────────────────────┼───────────┤
# │ memory │ chess_data │ _dlt_loads │ [load_id, schema_n… │ [VARCHAR, VARCHAR, … │ false │
# │ memory │ chess_data │ _dlt_pipeline_state │ [version, engine_v… │ [BIGINT, BIGINT, VA… │ false │
# │ memory │ chess_data │ _dlt_version │ [version, engine_v… │ [BIGINT, BIGINT, TI… │ false │
# │ memory │ chess_data │ my_table │ [a, _dlt_load_id, … │ [BIGINT, VARCHAR, V… │ false │
# └──────────┴───────────────┴─────────────────────┴──────────────────────┴───────────────────────┴───────────┘
```

:::note
Be careful! The in-memory instance of the database will be destroyed once your Python script exits.
:::

This destination accepts database connection strings in the format used by [duckdb-engine](https://github.com/Mause/duckdb_engine#configuration).

You can configure a DuckDB destination with [secret / config values](../../general-usage/credentials) (e.g., using a `secrets.toml` file):
```toml
destination.duckdb.credentials="duckdb:///_storage/test_quack.duckdb"
```

The **duckdb://** URL above creates a **relative** path to `_storage/test_quack.duckdb`. To define an **absolute** path, you need to specify four slashes, i.e., `duckdb:////_storage/test_quack.duckdb`.

A few special connection strings are supported:
* **:pipeline:** creates the database in the working directory of the pipeline with the name `quack.duckdb`.
* **:memory:** creates an in-memory database. This may be useful for testing.
`dlt` supports a unique connection string that triggers specific behavior for the `duckdb` destination:
* **:pipeline:** creates the database in the working directory of the pipeline, naming it `quack.duckdb`.

Please see the code snippets below showing how to use it:

1. Via `config.toml`
```toml
destination.duckdb.credentials=":pipeline:"
```

2. In Python code
```py
p = pipeline_one = dlt.pipeline(
pipeline_name="my_pipeline",
destination="duckdb",
credentials=":pipeline:",
)
```

### Additional configuration
Unique indexes may be created during loading if the following config value is set:
Expand Down
44 changes: 42 additions & 2 deletions tests/load/duckdb/test_duckdb_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@
from dlt.common.configuration.resolve import resolve_configuration
from dlt.common.configuration.utils import get_resolved_traces

from dlt.common.destination.reference import Destination
from dlt.destinations.impl.duckdb.configuration import (
DUCK_DB_NAME,
DuckDbClientConfiguration,
DuckDbCredentials,
DEFAULT_DUCK_DB_NAME,
)
from dlt.destinations import duckdb

from dlt.destinations.impl.duckdb.exceptions import InvalidInMemoryDuckdbCredentials
from dlt.pipeline.exceptions import PipelineStepFailed
from tests.load.pipeline.utils import drop_pipeline
from tests.pipeline.utils import assert_table
from tests.utils import patch_home_dir, autouse_test_storage, preserve_environ, TEST_STORAGE_ROOT
Expand Down Expand Up @@ -56,6 +57,44 @@ def test_duckdb_open_conn_default() -> None:
delete_quack_db()


def test_duckdb_in_memory_mode_via_factory(preserve_environ):
    """Check how the duckdb destination handles in-memory databases.

    An externally opened `:memory:` connection passed to the factory must be
    accepted, while the literal `:memory:` credentials string must fail the
    pipeline run with `InvalidInMemoryDuckdbCredentials`, whether it arrives
    via the `credentials` argument, the environment, or
    `Destination.from_reference`.

    The test mutates `os.environ`; `preserve_environ` is requested for that
    reason (presumably it restores the environment afterwards - confirm
    against the fixture definition).
    """
    delete_quack_db()
    try:
        import duckdb

        # Check if passing external duckdb connection works fine
        db = duckdb.connect(":memory:")
        dlt.pipeline(pipeline_name="booboo", destination=dlt.destinations.duckdb(db))

        # Check if passing :memory: to factory fails
        with pytest.raises(PipelineStepFailed) as exc:
            p = dlt.pipeline(pipeline_name="booboo", destination="duckdb", credentials=":memory:")
            p.run([1, 2, 3])

        assert isinstance(exc.value.exception, InvalidInMemoryDuckdbCredentials)

        # Check if :memory: provided through the environment fails as well
        os.environ["DESTINATION__DUCKDB__CREDENTIALS"] = ":memory:"
        # Bind `exc` again: without it the assertion below would inspect the
        # exception captured by the previous case instead of this one.
        with pytest.raises(PipelineStepFailed) as exc:
            p = dlt.pipeline(
                pipeline_name="booboo",
                destination="duckdb",
            )
            p.run([1, 2, 3])

        assert isinstance(exc.value.exception, InvalidInMemoryDuckdbCredentials)

        # Check if :memory: passed through Destination.from_reference fails
        with pytest.raises(PipelineStepFailed) as exc:
            p = dlt.pipeline(
                pipeline_name="booboo",
                destination=Destination.from_reference("duckdb", credentials=":memory:"),  # type: ignore[arg-type]
            )
            p.run([1, 2, 3], table_name="numbers")

        assert isinstance(exc.value.exception, InvalidInMemoryDuckdbCredentials)
    finally:
        delete_quack_db()


def test_duckdb_database_path() -> None:
# resolve without any path provided
c = resolve_configuration(
Expand Down Expand Up @@ -257,6 +296,7 @@ def test_external_duckdb_database() -> None:
assert c.credentials._conn_owner is False
assert hasattr(c.credentials, "_conn")
conn.close()
assert not os.path.exists(":memory:")


def test_default_duckdb_dataset_name() -> None:
Expand Down

0 comments on commit 30f0416

Please sign in to comment.