diff --git a/docs/runtime_suite_applications/data-catalog/10_overview.md b/docs/runtime_suite_applications/data-catalog/10_overview.md new file mode 100644 index 0000000000..be142be357 --- /dev/null +++ b/docs/runtime_suite_applications/data-catalog/10_overview.md @@ -0,0 +1,19 @@ +--- +id: overview +title: Overview +sidebar_label: Overview +--- + + + +_Data Catalog_ is a Mia-Platform Marketplace application designed to configure in your Console project the +components of the [Data Catalog](/data_catalog/overview.mdx) solution. +It streamlines adding the necessary microservices, endpoints and configuration maps, providing a blueprint +from which further customization can be carried out to build and deploy an ad-hoc Data Catalog solution. + +An in-depth explanation of what Mia-Platform Data Catalog is, what its components are and how to configure them +can be found in the documentation section dedicated to the [product](/data_catalog/overview.mdx). diff --git a/docs/runtime_suite_applications/data-catalog/_category_.json b/docs/runtime_suite_applications/data-catalog/_category_.json new file mode 100644 index 0000000000..8632b78793 --- /dev/null +++ b/docs/runtime_suite_applications/data-catalog/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Data Catalog", + "position": 10 +} \ No newline at end of file diff --git a/docs/runtime_suite_applications/data-catalog/changelog.md b/docs/runtime_suite_applications/data-catalog/changelog.md new file mode 100644 index 0000000000..e73b5b1200 --- /dev/null +++ b/docs/runtime_suite_applications/data-catalog/changelog.md @@ -0,0 +1,20 @@ +--- +id: changelog +title: Changelog +sidebar_label: CHANGELOG +--- + + + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+ +## Unreleased + +Initial release of Data Catalog application diff --git a/docs/runtime_suite_templates/data-catalog/20_configuration.md b/docs/runtime_suite_templates/data-catalog/20_configuration.md index 8e59639c09..0157aff189 100644 --- a/docs/runtime_suite_templates/data-catalog/20_configuration.md +++ b/docs/runtime_suite_templates/data-catalog/20_configuration.md @@ -186,49 +186,6 @@ The configuration has the following main sections: } ``` -### Secret support - -In k8s environments secrets can be injected in a running workload as an environment variable, -as a standalone file or a INI key in a standalone file. Such secrets may be base64 encoded. - -_Data Catalog Agent_ configuration supports referencing such secrets inline in selected fields of the -JSON configuration file. When the field supports secrets you may write a plain string or objects. - -In case of a string the secret is considered `plain` and written in the config file. -In case of an object with `env` guard like: - -```json -{ - "type": "env", - "key": "MY_SECRET_ENV_VAR" -} -``` - -the agent will use the content of the env var `MY_SECRET_ENV_VAR`. An extra `encoding` field equal -to `base64` can be used to specify a pre-read decoded to use. - -In case of an object with `file` guard like: - -```json -{ - "type": "file", - "path": "/path/to/secret" -} -``` - -it will use the content of the file on such `path`. If the file is formatted as an `ini` file a `key` may -be specified - -```json -{ - "type": "file", - "path": "/path/to/secret", - "key": "CONNECTION_STRING" -} -``` - -An extra `encoding` field equal to `base64` can be used to specify a pre-read decoded to use. - Secretable fields are marked in the following sections. 
## Connections @@ -305,7 +262,7 @@ Other keys are `host` and `port` which for a **PostgreSQL** connection are defau #### Secretable fields -`uid`, `pwd` or `params` support secrets +`uid`, `pwd` and `params` support [secrets resolution](/fast_data/configuration/secrets_resolution.md). ### Oracle @@ -461,7 +418,7 @@ Also the environment variable must be set: #### Secretable fields -`uid`, `pwd` or `params` support secrets +`uid`, `pwd` and `params` support [secrets resolution](/fast_data/configuration/secrets_resolution.md). ### MS SQL server @@ -505,7 +462,7 @@ Other keys are `host` and `port` which for a **PostgreSQL** connection are defau #### Secretable fields -`uid`, `pwd` or `params` support secrets +`uid`, `pwd` and `params` support [secrets resolution](/fast_data/configuration/secrets_resolution.md). ### MySQL @@ -549,7 +506,7 @@ Other keys are `host` and `port` which for a **PostgreSQL** connection are defau #### Secretable fields -`uid`, `pwd` or `params` support secrets +`uid`, `pwd` and `params` support [secrets resolution](/fast_data/configuration/secrets_resolution.md). ### Mia CRUD Service @@ -714,22 +671,27 @@ Now you should have everything you need to fill out the configuration parameters #### Secretable fields -`clientId`, `username`, `clientSecret`, `password`, `securityToken` or `privateKey` support secrets +`clientId`, `username`, `clientSecret`, `password`, `securityToken` and `privateKey` support [secrets resolution](/fast_data/configuration/secrets_resolution.md). ## Targets There are 4 targets available: -1. [**default**] stdout -2. file -3. mia-console +1. [**default**] `stdout` +2. `mongodb` +3. `file` +4. `mia-console` For each listed connection, after metadata is retrieved, `agent` **sequentially** sends data to the target as: -- `json` for `stdout` and `file` -- [`ndjson`](https://github.com/ndjson/ndjson-spec) for `mia-console` +- `json` for `stdout` and `file`; +- [`ndjson`](https://github.com/ndjson/ndjson-spec) for `mia-console`;
+- [`BSON`](https://bsonspec.org/) for `mongodb`. -The final content is an `array` of models. Model spec is given in the form of a JSON schema. +The final content is an `array` of models, where the format of its records changes according to the target: + +- `stdout`, `file` and `mia-console`: the models are written in the native agent format, which is defined in the following JSON schema; +- `mongodb`: the models are written in a format that is supported by the [Data Catalog](/data_catalog/overview.mdx) application, as defined in the following JSON schema. ### Standard Output @@ -744,6 +706,29 @@ To explicitly configure the `stdout` target use: } ``` +### MongoDB + +The MongoDB target enables the Data Catalog Agent to feed data from external sources to the [Data Catalog](/data_catalog/overview.mdx) application. + +To configure the `mongodb` target use: + +```js +{ + // ... + "target": { + "type": "mongodb", + "url": "mongodb://test:27017/?replicaSet=rs", // 👈 mongodb connection string: the database must be a replica set + "database": "test_database", // 👈 if defined, it will be used as default database to store the models + } +} +``` + +The target will write the content of the connections to a MongoDB replica set database, in a collection named `open-lineage-datasets`. + +:::tip +To enforce document validation on that collection, be sure to run [Data Catalog Configuration Scripts](/data_catalog/database_setup.mdx) before executing the agent. +::: + ### File To configure the `file` target use: @@ -771,6 +756,10 @@ which will save output files in the folder `./output`. To override this use: ### MIA Console +:::caution +This target has been **deprecated** in favour of [`mongodb`](#mongodb) to support the [Data Catalog](/data_catalog/overview.mdx) solution.
+::: + To configure the `mia-console` target use: ```json diff --git a/docs/runtime_suite_templates/data-catalog/changelog.md b/docs/runtime_suite_templates/data-catalog/changelog.md index 6a83a01975..68387bce7f 100644 --- a/docs/runtime_suite_templates/data-catalog/changelog.md +++ b/docs/runtime_suite_templates/data-catalog/changelog.md @@ -15,6 +15,34 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.3.2] - 2024-09-20 + +### Added + +#### Targets + +- `mongodb` target. Models will be stored in a dedicated collection with the following target configuration: + ```js + { + // ... + "target": { + "type": "mongodb", + "url": "mongodb://test:27017/?replicaSet=rs", // 👈 mongodb connection string: the database must be a replica set + "database": "test_database", // 👈 if defined, it will be used as default database to store the models + } + } + ``` + + The records will be stored in a collection named `open-lineage-datasets`. + + > **NOTE:** + > + > To use MongoDB as a target, the database must be configured as a replica set.
+ +### Updated + +- _Data Catalog Agent_ bumped to version `0.6.4` + ## [1.3.1] - 2024-07-31 ### Updated diff --git a/static/docs_files_to_download/data-catalog/model.schema.json b/static/docs_files_to_download/data-catalog/agent.model.schema.json similarity index 100% rename from static/docs_files_to_download/data-catalog/model.schema.json rename to static/docs_files_to_download/data-catalog/agent.model.schema.json diff --git a/static/docs_files_to_download/data-catalog/catalog.model.schema.json b/static/docs_files_to_download/data-catalog/catalog.model.schema.json new file mode 100644 index 0000000000..8576fbdaa6 --- /dev/null +++ b/static/docs_files_to_download/data-catalog/catalog.model.schema.json @@ -0,0 +1,367 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "OpenLineage", + "examples": [ + { + "producer": "my_producer", + "schemaURL": "https://openlineage.io/spec/2-0-2/OpenLineage.json", + "eventTime": "2024-06-27 10:43:30.889698872 UTC", + "dataset": { + "datasetId": "mock-id", + "name": "books", + "namespace": "test", + "schema": "my-schema", + "catalogName": "test", + "facets": { + "schema": { + "_producer": "my_producer", + "_schemaURL": "customSchemaDatasetFacet.json", + "jsonSchema": { + "type": "object", + "additionalProperties": false, + "properties": { + "FIRST_NAME": { + "type": "string" + }, + "ID": { + "type": "number" + }, + "INSERTED_ON": {}, + "LAST_NAME": { + "type": "string" + } + } + }, + "fields": [ + { + "name": "FIRST_NAME", + "type": "VARCHAR2", + "required": false, + "unique": false, + "primaryKey": false, + "nullable": false, + "sourceDescription": "size=255, precision=0" + }, + { + "name": "ID", + "type": "NUMBER", + "required": true, + "unique": true, + "primaryKey": true, + "nullable": false, + "sourceDescription": "size=0, precision=0" + }, + { + "name": "INSERTED_ON", + "type": "TIMESTAMP(6)", + "required": true, + "unique": true, + "primaryKey": true, + "nullable": false, + "sourceDescription": "size=0, 
precision=0" + }, + { + "name": "LAST_NAME", + "type": "VARCHAR2", + "required": false, + "unique": false, + "primaryKey": false, + "nullable": false, + "sourceDescription": "size=255, precision=0" + } + ] + }, + "version": { + "_producer": "my_producer", + "_schemaURL": "https://openlineage.io/spec/facets/1-0-1/DatasetVersionDatasetFacet.json", + "datasetVersion": "1" + }, + "storage": { + "_producer": "my_producer", + "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/StorageDatasetFacet.json", + "storageLayer": "oracle19c_test", + "version": "Oracle Database 19c Enterprise Edition Release 19.0.0.0.0 - Production", + "vendor": "oracle" + } + } + } + } + ], + "type": "object", + "required": [ + "dataset", + "eventTime", + "producer" + ], + "properties": { + "dataset": { + "$ref": "#/definitions/Dataset" + }, + "eventTime": { + "type": "string" + }, + "producer": { + "type": "string" + }, + "schemaURL": { + "default": "https://openlineage.io/spec/2-0-2/OpenLineage.json", + "type": "string" + } + }, + "definitions": { + "Dataset": { + "type": "object", + "required": [ + "datasetId", + "facets", + "name", + "namespace" + ], + "properties": { + "catalogName": { + "type": [ + "string", + "null" + ] + }, + "datasetId": { + "type": "string" + }, + "facets": { + "$ref": "#/definitions/Facets" + }, + "name": { + "type": "string" + }, + "namespace": { + "type": "string" + }, + "schema": { + "type": [ + "string", + "null" + ] + } + } + }, + "FacetSchema": { + "type": "object", + "required": [ + "_producer", + "_schemaURL", + "fields" + ], + "properties": { + "_producer": { + "type": "string" + }, + "_schemaURL": { + "type": "string" + }, + "fields": { + "type": "array", + "items": { + "$ref": "#/definitions/Field" + } + }, + "jsonSchema": { + "default": { + "type": "object", + "additionalProperties": false, + "properties": {} + }, + "allOf": [ + { + "$ref": "#/definitions/json-schema" + } + ] + } + } + }, + "FacetStorage": { + "type": "object", + "required": [ +
"_producer", + "_schemaURL", + "storageLayer", + "vendor", + "version" + ], + "properties": { + "_producer": { + "type": "string" + }, + "_schemaURL": { + "type": [ + "string", + "null" + ] + }, + "storageLayer": { + "type": "string" + }, + "vendor": { + "type": "string" + }, + "version": { + "type": "string" + } + } + }, + "FacetVersion": { + "type": "object", + "required": [ + "_producer", + "_schemaURL", + "datasetVersion" + ], + "properties": { + "_producer": { + "type": "string" + }, + "_schemaURL": { + "type": "string" + }, + "datasetVersion": { + "type": "string" + } + } + }, + "Facets": { + "type": "object", + "required": [ + "schema", + "storage", + "version" + ], + "properties": { + "schema": { + "$ref": "#/definitions/FacetSchema" + }, + "storage": { + "$ref": "#/definitions/FacetStorage" + }, + "version": { + "$ref": "#/definitions/FacetVersion" + } + } + }, + "Field": { + "type": "object", + "required": [ + "name", + "nullable", + "primaryKey", + "required", + "sourceDescription", + "type", + "unique" + ], + "properties": { + "default": { + "type": [ + "string", + "null" + ] + }, + "name": { + "type": "string" + }, + "nullable": { + "type": "boolean" + }, + "primaryKey": { + "type": "boolean" + }, + "required": { + "type": "boolean" + }, + "sourceDescription": { + "description": "field description extracted from the source system", + "type": "string" + }, + "type": { + "type": "string" + }, + "unique": { + "type": "boolean" + } + } + }, + "json-primitive-type": { + "type": "string", + "enum": [ + "null", + "boolean", + "number", + "integer", + "string", + "array", + "object" + ] + }, + "json-property": { + "type": "object", + "properties": { + "default": true, + "nullable": { + "writeOnly": true, + "type": [ + "boolean", + "null" + ] + }, + "type": { + "anyOf": [ + { + "$ref": "#/definitions/json-type" + }, + { + "type": "null" + } + ] + } + } + }, + "json-schema": { + "type": "object", + "required": [ + "properties", + "type" + ], + "properties": { + 
"additionalProperties": { + "type": "boolean" + }, + "properties": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/json-property" + } + }, + "required": { + "type": "array", + "items": { + "type": "string" + } + }, + "type": { + "$ref": "#/definitions/json-primitive-type" + } + } + }, + "json-type": { + "anyOf": [ + { + "$ref": "#/definitions/json-primitive-type" + }, + { + "type": "array", + "items": { + "$ref": "#/definitions/json-primitive-type" + } + } + ] + } + } +} \ No newline at end of file