From e0998406c7e02a4019e25face9a5bb4eac3e58cf Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Fri, 8 Dec 2023 19:45:13 +0100 Subject: [PATCH] Improve documentation --- .gitignore | 1 + README.md | 180 +++++++++++++++++++++++++++++++++--- docs/backlog.md | 25 +++++ pyproject.toml | 7 +- target_cratedb/connector.py | 21 +++++ 5 files changed, 220 insertions(+), 14 deletions(-) create mode 100644 docs/backlog.md diff --git a/.gitignore b/.gitignore index c79d55d..f1eb116 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ .idea .venv* *.egg-info +build coverage.xml diff --git a/README.md b/README.md index 3e0c5c0..c2bedbc 100644 --- a/README.md +++ b/README.md @@ -9,26 +9,184 @@ [![PyPI](https://img.shields.io/pypi/v/meltano-target-cratedb.svg)](https://pypi.org/project/meltano-target-cratedb/) [![Downloads](https://pepy.tech/badge/meltano-target-cratedb/month)](https://pepy.tech/project/meltano-target-cratedb/) + ## About A [Singer] target for [CrateDB], built with the [Meltano SDK] for custom extractors -and loaders, and based on the PostgreSQL target [meltanolabs-target-postgres]. +and loaders, and based on the [Meltano PostgreSQL target]. It connects a library of +[600+ connectors] with CrateDB, and vice versa. +In Singer ELT jargon, a "target" conceptually wraps a data sink, where you +"load" data into. -## Details +Singer, Meltano, and PipelineWise provide foundational components and +an integration engine for composable Open Source ETL with [600+ connectors]. +On the database integration side, they are heavily based on [SQLAlchemy]. -In Singer ELT jargon, a "target" conceptionally wraps a data sink, where you -"load" data into. -CrateDB is a distributed and scalable SQL database for storing and analyzing +### CrateDB + +[CrateDB] is a distributed and scalable SQL database for storing and analyzing massive amounts of data in near real-time, even with complex queries. It is -PostgreSQL-compatible, and based on Apache Lucene. 
+PostgreSQL-compatible, and based on [Apache Lucene]. + +CrateDB offers a Python SQLAlchemy dialect, in order to plug into the +comprehensive Python data-science and -wrangling ecosystems. + +### Singer + +_The open-source standard for writing scripts that move data._ + +[Singer] is an open source specification and software framework for [ETL]/[ELT] +data exchange between a range of different systems. For talking to SQL databases, +it employs a metadata subsystem based on SQLAlchemy. + +Singer reads and writes Singer-formatted messages, following the [Singer Spec]. +Effectively, those are JSONL files. + +### Meltano + +_Unlock all the data that powers your data platform._ + +_Say goodbye to writing, maintaining, and scaling your own API integrations +with Meltano's declarative code-first data integration engine, bringing +600+ APIs and DBs to the table._ + +[Meltano] builds upon Singer technologies, uses configuration files in YAML +syntax instead of JSON, adds an improved SDK and other components, and runs +the central addon registry, [meltano | Hub]. + +### PipelineWise + +[PipelineWise] is another Data Pipeline Framework using the Singer.io +specification to ingest and replicate data from various sources to +various destinations. The list of [PipelineWise Taps] includes another +20+ high-quality data-source and -sink components. + +### SQLAlchemy + +[SQLAlchemy] is the leading Python SQL toolkit and Object Relational Mapper +that gives application developers the full power and flexibility of SQL. + +It provides a full suite of well known enterprise-level persistence patterns, +designed for efficient and high-performing database access, adapted into a +simple and Pythonic domain language. + + +## Install + +Usually, you will not install this package directly, but as part +of a pipeline definition instead, for example a Meltano project. A corresponding snippet +is outlined in the next section. 
After adding it to your `meltano.yml` +configuration file, you can install all defined components and their +dependencies. +```shell +meltano install +``` + + +## Usage + +This section demonstrates two ways to use the CrateDB Singer target. + +### Meltano + +Usually, `meltano-target-cratedb` will be used within a whole pipeline +definition, for example as part of a Meltano project. Within your +`meltano.yml` configuration file, this is an appropriate snippet to +configure `target-cratedb`. + +```yaml +- name: target-cratedb + namespace: cratedb + variant: cratedb + pip_url: meltano-target-cratedb + config: + sqlalchemy_url: crate://crate@localhost/ + add_record_metadata: true +``` + +Then, you would invoke the pipeline by using `meltano run`. +```shell +meltano run tap-xyz target-cratedb +``` + +### Standalone + +You can also invoke it standalone by using the `target-cratedb` program. +This example demonstrates how to load a file into the database. + +First, acquire an example file in Singer format, containing the list of +countries of the world. +```shell +wget https://github.com/MeltanoLabs/target-postgres/raw/v0.0.9/target_postgres/tests/data_files/tap_countries.singer +``` + +Now, define the database connection string including credentials in +SQLAlchemy format. +```shell +echo '{"sqlalchemy_url": "crate://crate@localhost/"}' > settings.json +``` + +By using Unix pipes, load the data file into the database, referencing +the path to the configuration file. +```shell +cat tap_countries.singer | target-cratedb --config=settings.json +``` + +Using the interactive terminal program, `crash`, you can run SQL +statements on CrateDB. +```shell +pip install crash +crash --hosts localhost:4200 +``` + +Now, you can verify that the data has been loaded properly. 
+```sql +SELECT + "code", "name", "capital", "emoji", "languages[1]" +FROM + "melty"."countries" +ORDER BY + "name" +LIMIT + 42; +``` + + +## Development + +In order to work on this adapter dialect on behalf of a real pipeline definition, +link your sandbox to a development installation of [meltano-target-cratedb], and +configure the `pip_url` of the component to point to a different location than the +[vanilla package on PyPI]. + +Use this URL to directly point to a specific Git repository reference. +```yaml +pip_url: git+https://github.com/crate-workbench/meltano-target-cratedb.git@main +``` -Singer and Meltano provide foundational components and an integration engine for -composable Open Source ETL with 600+ connectors +Use a `pip`-like notation to link the CrateDB Singer target in development mode, +so you can work on it at the same time while running the pipeline, and iterating +on its definition. +```yaml +pip_url: --editable=/path/to/sources/meltano-target-cratedb +``` -[CrateDB]: https://github.com/crate/crate +[600+ connectors]: https://hub.meltano.com/ +[Apache Lucene]: https://lucene.apache.org/ +[CrateDB]: https://cratedb.com/product +[ELT]: https://en.wikipedia.org/wiki/Extract,_load,_transform +[ETL]: https://en.wikipedia.org/wiki/Extract,_transform,_load +[Meltano]: https://meltano.com/ +[meltano | Hub]: https://hub.meltano.com/ [Meltano SDK]: https://github.com/meltano/sdk -[meltanolabs-target-postgres]: https://pypi.org/project/meltanolabs-target-postgres/ -[Singer]: https://github.com/singer-io +[Meltano PostgreSQL target]: https://pypi.org/project/meltanolabs-target-postgres/ +[meltano-target-cratedb]: https://github.com/crate-workbench/meltano-target-cratedb +[Singer]: https://www.singer.io/ +[Singer Spec]: https://hub.meltano.com/singer/spec/ +[PipelineWise]: https://transferwise.github.io/pipelinewise/ +[PipelineWise Taps]: https://transferwise.github.io/pipelinewise/user_guide/yaml_config.html +[SQLAlchemy]: https://www.sqlalchemy.org/ 
+[vanilla package on PyPI]: https://pypi.org/project/meltano-target-cratedb/ diff --git a/docs/backlog.md b/docs/backlog.md new file mode 100644 index 0000000..428a3c9 --- /dev/null +++ b/docs/backlog.md @@ -0,0 +1,25 @@ +# Backlog + +## Iteration +1 +- Unlock configuring dedicated database schema, not just `melty`. + `"crate://crate@localhost/?schema=foo"` +- Document use with CrateDB Cloud. +- Venerable schema name propagation flaw hits again, but differently? + ``` + TypeError: PostgresConnector.get_table_columns() got an unexpected keyword argument 'full_table_name' + ``` +- Submit a few patches to `meltanolabs-target-postgres`, about proper + quoting of schema and table names. +- Submit a few other patches to crate-python, in order to clean up here. + +## Obstacles +Upstream some workarounds to crate-python. +- `TypeError: Invalid argument(s) 'json_serializer','json_deserializer' sent to create_engine(), using configuration CrateDialect/QueuePool/Engine. Please check that the keyword arguments are appropriate for this combination of components.` +- `UnsupportedFeatureException[Cannot use columns of type "object" as primary key]` +- `NotImplementedError: Default TypeEngine.as_generic() heuristic method was unsuccessful for crate.client.sqlalchemy.types._ObjectArray. A custom as_generic() method must be implemented for this type class.` +- `sqlalchemy.exc.DBAPIError: (crate.client.exceptions.TimezoneUnawareException) Timezone aware datetime objects are not supported` +- `NotImplementedError: This backend does not support multiple-table criteria within UPDATE` +- `ColumnValidationException[Validation failed for code: Updating a primary key is not supported]` + +## Notes +- Missing `CREATE SCHEMA` is tedious, and currently needs a workaround. 
diff --git a/pyproject.toml b/pyproject.toml index 0c26e0b..d5e7670 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,21 +11,22 @@ default-tag = "0.0.0" [project] name = "meltano-target-cratedb" -description = "A Singer target for CrateDB, built with the Meltano SDK for Singer Targets, based on the PostgreSQL target" +description = "A Singer target for CrateDB, built with the Meltano SDK, and based on the Meltano PostgreSQL target." readme = "README.md" keywords = [ "cratedb", "data-loading", "data-processing", + "data-toolkit", "data-transfer", + "data-transformation", "ELT", "ETL", "io", "Meltano", "Meltano SDK", - "Postgres", + "PostgreSQL", "Singer", - "toolkit", ] license = { text = "MIT" } authors = [ diff --git a/target_cratedb/connector.py b/target_cratedb/connector.py index c7445fe..63a0745 100644 --- a/target_cratedb/connector.py +++ b/target_cratedb/connector.py @@ -35,6 +35,11 @@ class CrateDBConnector(PostgresConnector): allow_temp_tables: bool = False # Whether temp tables are supported. def create_engine(self) -> sqlalchemy.Engine: + """ + Create an SQLAlchemy engine object. + + Note: Needs to be patched to establish a polyfill which will synchronize write operations. + """ engine = super().create_engine() polyfill_refresh_after_dml_engine(engine) return engine @@ -43,6 +48,8 @@ def create_engine(self) -> sqlalchemy.Engine: def to_sql_type(jsonschema_type: dict) -> sqlalchemy.types.TypeEngine: """Return a JSON Schema representation of the provided type. + Note: Needs to be patched to invoke other static methods on `CrateDBConnector`. + By default will call `typing.to_sql_type()`. Developers may override this method to accept additional input argument types, @@ -89,6 +96,8 @@ def to_sql_type(jsonschema_type: dict) -> sqlalchemy.types.TypeEngine: def pick_individual_type(jsonschema_type: dict): """Select the correct sql type assuming jsonschema_type has only a single type. + Note: Needs to be patched to supply handlers for `object` and `array`. 
+ Args: jsonschema_type: A jsonschema_type array containing only a single type. @@ -115,6 +124,8 @@ def pick_individual_type(jsonschema_type: dict): def pick_best_sql_type(sql_type_array: list): """Select the best SQL type from an array of instances of SQL type classes. + Note: Needs to be patched to supply handler for `ObjectTypeImpl`. + Args: sql_type_array: The array of instances of SQL type classes. @@ -152,6 +163,8 @@ def _sort_types( ) -> list[sqlalchemy.types.TypeEngine]: """Return the input types sorted from most to least compatible. + Note: Needs to be patched to supply handlers for `_ObjectArray` and `NOTYPE`. + For example, [Smallint, Integer, Datetime, String, Double] would become [Unicode, String, Double, Integer, Smallint, Datetime]. @@ -201,6 +214,8 @@ def copy_table_structure( ) -> sqlalchemy.Table: """Copy table structure. + Note: Needs to be patched to prevent `Primary key columns cannot be nullable` errors. + Args: full_table_name: the target table name potentially including schema from_table: the source table @@ -224,3 +239,9 @@ def copy_table_structure( new_table = sqlalchemy.Table(table_name, meta, *columns) new_table.create(bind=connection) return new_table + + def prepare_schema(self, schema_name: str) -> None: + """ + Don't emit `CREATE SCHEMA` statements to CrateDB. + """ + pass