diff --git a/.github/actions/install-env/action.yml b/.github/actions/install-env/action.yml index b99d328..4e18e78 100644 --- a/.github/actions/install-env/action.yml +++ b/.github/actions/install-env/action.yml @@ -35,7 +35,7 @@ runs: - name: Install dependencies shell: bash if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction --no-ansi + run: poetry install --no-interaction --no-ansi --group duckdb - name: Activate environment shell: bash diff --git a/README.md b/README.md index e5220f4..f8e01c8 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,7 @@ Right now lea is compatible with BigQuery (used at Carbonfact) and DuckDB (quack - [Jaffle shop 🥪](examples/jaffle_shop/) - [Compare development to production 👯‍♀️](examples/diff/) +- [Using MotherDuck 🦆](examples/motherduck/) ## Teaser @@ -63,8 +64,12 @@ Right now lea is compatible with BigQuery (used at Carbonfact) and DuckDB (quack ## Installation +Use one of the following commands, depending on which warehouse you wish to use: + ```sh -pip install lea-cli +pip install lea-cli[duckdb] +pip install lea-cli[motherduck] +pip install lea-cli[bigquery] ``` This installs the `lea` command. It also makes the `lea` Python library available. @@ -78,12 +83,13 @@ lea is configured by setting environment variables. The following variables are ```sh # General configuration LEA_USERNAME=max -LEA_WAREHOUSE=bigquery # DuckDB 🦆 +LEA_WAREHOUSE=duckdb LEA_DUCKDB_PATH=duckdb.db # BigQuery 🦏 +LEA_WAREHOUSE=bigquery LEA_BQ_LOCATION=EU LEA_BQ_PROJECT_ID=carbonfact-dwh LEA_BQ_DATASET_NAME=kaya @@ -127,17 +133,17 @@ views/ Each view will be named according to its location, following the warehouse convention: -| Warehouse | Dataset | Username | Schema | Table | Name | -| ----------- | --------- | -------- | -------- | ------- | -------------------------------------------- | -| DuckDB 🦆 | `dataset` | `user` | `schema` | `table` | `schema.table` (stored in `dataset_user.db`) | -| BigQuery 🦏 | `dataset` | `user` | `schema` | `table` | `dataset_user.schema__table` | +| Warehouse | Dataset | Username | Schema | Table | Name | +| --------- | --------- | -------- | -------- | ------- | -------------------------------------------- | +| DuckDB | `dataset` | `user` | `schema` | `table` | `schema.table` (stored in `dataset_user.db`) | +| BigQuery | `dataset` | `user` | `schema` | `table` | `dataset_user.schema__table` | The convention in lea to reference a table in a sub-schema is to use a double underscore `__`: -| Warehouse | Dataset | Username | Schema | Sub-schema | Table | Name | -| ----------- | --------- | -------- | -------- | ---------- | ------- | ------------------------------------------------- | -| DuckDB 🦆 | `dataset` | `user` | `schema` | `sub` | `table` | `schema.sub__table` (stored in `dataset_user.db`) | -| BigQuery 🦏 | `dataset` | `user` | `schema` | `sub` | `table` | `dataset_user.schema__sub__table` | +| Warehouse | Dataset | Username | Schema | Sub-schema | Table | Name | +| --------- | --------- | -------- | -------- | ---------- | ------- | ------------------------------------------------- | +| DuckDB | `dataset` | `user` | `schema` | `sub` | `table` | `schema.sub__table` (stored in `dataset_user.db`) | +| BigQuery | `dataset` | `user` | `schema` | `sub` | `table` | `dataset_user.schema__sub__table` | Schemas are expected to be placed under a `views` directory. This can be changed by providing an argument to the `run` command: @@ -220,10 +226,10 @@ This checkpointing logic can be disabled with the `--fresh` flag. lea run --fresh ``` -The `--raise-exceptions` flag can be used to immediately stop if a query fails: +The `--fail-fast` flag can be used to immediately stop if a query fails: ```sh -lea run --raise-exceptions +lea run --fail-fast ``` For debugging purposes, it is possible to print out a query and copy it to the clipboard: diff --git a/examples/motherduck/README.md b/examples/motherduck/README.md new file mode 100644 index 0000000..ad13539 --- /dev/null +++ b/examples/motherduck/README.md @@ -0,0 +1,34 @@ +# Using MotherDuck + +lea works with DuckDB, and thus can be used with [MotherDuck](https://motherduck.com/) too. + +Here is an example `.env` file: + +```sh +echo " +LEA_USERNAME=max +LEA_WAREHOUSE=duckdb +LEA_DUCKDB_PATH=md:jaffle_shop +LEA_MOTHERDUCK_TOKEN= +" > .env +``` + +The token can be obtained by logging into MotherDuck from the terminal, as documented [here](https://motherduck.com/docs/getting-started/connect-query-from-python/installation-authentication#authenticating-to-motherduck). + +Then, you can run the usual commands. For the sake of example, let's re-use the jaffle shop views: + +```sh +lea prepare ../jaffle_shop/views +``` + +``` +Created schema analytics +Created schema staging +Created schema core +``` + +```sh +lea run ../jaffle_shop/views +``` + +You should see the views in your MotherDuck UI: diff --git a/lea/app/__init__.py b/lea/app/__init__.py index 688b203..e50fe2c 100644 --- a/lea/app/__init__.py +++ b/lea/app/__init__.py @@ -60,7 +60,7 @@ def run( production: bool = False, threads: int = 8, show: int = 20, - raise_exceptions: bool = False, + fail_fast: bool = False, env: str = EnvPath, ): from lea.app.run import run @@ -81,7 +81,7 @@ def run( fresh=fresh, threads=threads, show=show, - raise_exceptions=raise_exceptions, + fail_fast=fail_fast, console=console, ) @@ -92,7 +92,7 @@ def test( only: list[str] = typer.Option(None), threads: int = 8, production: bool = False, - raise_exceptions: bool = False, + fail_fast: bool = False, env: str = EnvPath, ): from lea.app.test import test @@ -105,7 +105,7 @@ def test( views_dir=views_dir, only=only, threads=threads, - raise_exceptions=raise_exceptions, + fail_fast=fail_fast, console=console, ) diff --git a/lea/app/run.py b/lea/app/run.py index 3b03486..2fa00c7 100644 --- a/lea/app/run.py +++ b/lea/app/run.py @@ -175,7 +175,7 @@ def run( fresh: bool, threads: int, show: int, - raise_exceptions: bool, + fail_fast: bool, console: rich.console.Console, ): # If print_to_cli, it means we only want to print out the view definitions, nothing else @@ -195,16 +195,15 @@ def run( console_log(f"{len(whitelist):,d} view(s) selected") # Remove orphan views - for schema, table in client.list_existing_view_names(): - if (schema, table) in dag: + for key, (schema, table) in client.list_existing_view_names().items(): + if key in dag: continue - console_log(f"Removing {schema}.{table}") if not dry: view_to_delete = lea.views.GenericSQLView( schema=schema, name=table, query="", sqlglot_dialect=client.sqlglot_dialect ) client.delete_view(view=view_to_delete) - console_log(f"Removed {schema}.{table}") + console_log(f"Removed {'.'.join(key)}") def display_progress() -> rich.table.Table: if print_to_cli: @@ -322,5 +321,5 @@ def display_progress() -> rich.table.Table: console.print(str(dag[node]), style="bold red") console.print(exception) - if raise_exceptions: + if fail_fast: raise Exception("Some views failed to build") diff --git a/lea/app/test.py b/lea/app/test.py index f2de262..ef9617b 100644 --- a/lea/app/test.py +++ b/lea/app/test.py @@ -12,7 +12,7 @@ def test( views_dir: str, only: list[str], threads: int, - raise_exceptions: bool, + fail_fast: bool, console: rich.console.Console, ): # List all the columns @@ -53,5 +53,5 @@ def test( else: console.log(f"FAILURE {test}", style="bold red") console.log(conflicts.head()) - if raise_exceptions: + if fail_fast: raise RuntimeError(f"Test {test} failed") diff --git a/lea/clients/duckdb.py b/lea/clients/duckdb.py index 4e531d4..455b9af 100644 --- a/lea/clients/duckdb.py +++ b/lea/clients/duckdb.py @@ -14,13 +14,20 @@ class DuckDB(Client): def __init__(self, path: str, username: str | None): - if username is not None: - _path = pathlib.Path(path) - path = str((_path.parent / f"{_path.stem}_{username}{_path.suffix}").absolute()) + if path.startswith("md:"): + path = f"{path}_{username}" if username is not None else path + else: + if username is not None: + _path = pathlib.Path(path) + path = str((_path.parent / f"{_path.stem}_{username}{_path.suffix}").absolute()) self.path = path self.username = username self.con = duckdb.connect(self.path) + @property + def is_motherduck(self): + return self.path.startswith("md:") + @property def sqlglot_dialect(self): return sqlglot.dialects.Dialects.DUCKDB @@ -53,20 +60,21 @@ def teardown(self): def list_existing_view_names(self) -> list[tuple[str, str]]: query = """ - SELECT - CASE - WHEN POSITION('_' IN table_schema) > 0 - THEN SUBSTRING(table_schema FROM 1 FOR POSITION('_' IN table_schema) - 1) - ELSE table_schema - END table_schema, - table_name - FROM information_schema.tables - + SELECT + table_schema, + table_name + FROM information_schema.tables """ - return [ - (r["table_schema"], r["table_name"]) + if self.is_motherduck: + database = self.path.split(":")[1] + query += f"\nWHERE table_catalog = '{database}'" + return { + (r["table_schema"], *r["table_name"].split(lea._SEP)): ( + r["table_schema"], + r["table_name"], + ) for r in self.con.sql(query).df().to_dict(orient="records") - ] + } def get_tables(self): query = """ diff --git a/pyproject.toml b/pyproject.toml index 283f2e4..c567dde 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "lea-cli" -version = "0.2.3" +version = "0.2.4" description = "A minimalist alternative to dbt" authors = ["Max Halford "] packages = [ @@ -12,15 +12,20 @@ python = ">=3.10" sqlglot = "^18.10.0" Jinja2 = "^3.1.2" typer = "^0.9.0" -google-cloud-bigquery = "^3.11.4" -db-dtypes = "^1.1.1" requests = "^2.31.0" python-dotenv = "^1.0.0" rich = "^13.5.3" tabulate = "^0.9.0" -pandas-gbq = "^0.19.2" duckdb = "^0.9.0" +[tool.poetry.group.duckdb.dependencies] +duckdb = "^0.9.1" + +[tool.poetry.group.bigquery.dependencies] +google-cloud-bigquery = "^3.11.4" +db-dtypes = "^1.1.1" +pandas-gbq = "^0.19.2" + [tool.poetry.group.dev.dependencies] ipykernel = "^6.21.2" pytest = "^7.4.2"