From f57a1b8ace68bba843184dc1a8414b418db061ec Mon Sep 17 00:00:00 2001 From: Flook Peter Date: Fri, 25 Oct 2024 13:45:42 +0800 Subject: [PATCH] Don't allow for additional properties for top level ODCS v2, add in end-to-end test for ODCS v3 --- .../OpenDataContractStandardModels.scala | 8 +- .../OpenDataContractStandardV3Models.scala | 33 ++- .../metadata/odcs/full-example-v3.odcs.yaml | 233 ++++++++++++++++++ ...ntractStandardDataSourceMetadataTest.scala | 38 +-- gradle.properties | 2 +- 5 files changed, 289 insertions(+), 25 deletions(-) create mode 100644 app/src/test/resources/sample/metadata/odcs/full-example-v3.odcs.yaml diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/model/OpenDataContractStandardModels.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/model/OpenDataContractStandardModels.scala index 6199501..25a96f0 100644 --- a/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/model/OpenDataContractStandardModels.scala +++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/model/OpenDataContractStandardModels.scala @@ -2,7 +2,6 @@ package io.github.datacatering.datacaterer.core.generator.metadata.datasource.op import com.fasterxml.jackson.annotation.JsonIgnoreProperties -@JsonIgnoreProperties(ignoreUnknown = true) case class OpenDataContractStandard( dataset: Array[OpenDataContractStandardDataset], datasetName: String, @@ -12,9 +11,12 @@ case class OpenDataContractStandard( uuid: String, version: String, apiVersion: Option[String] = None, + contractCreatedTs: Option[String] = None, + customProperties: Option[Array[OpenDataContractStandardCustomProperty]] = None, database: Option[String] = None, datasetDomain: Option[String] = None, datasetKind: Option[String] = None, + datasetProject: Option[String] = None, description: Option[OpenDataContractStandardDescription] = None, driver: Option[String] = None, driverVersion: Option[String] = None, @@ -28,12 +30,14 @@ case class OpenDataContractStandard( tags: Option[Array[String]] = None, tenant: Option[String] = None, `type`: Option[String] = None, + schedulerAppName: Option[String] = None, server: Option[String] = None, slaDefaultColumn: Option[String] = None, slaProperties: Option[Array[OpenDataContractStandardServiceLevelAgreementProperty]] = None, sourceSystem: Option[String] = None, sourcePlatform: Option[String] = None, stakeholders: Option[Array[OpenDataContractStandardStakeholder]] = None, + systemInstance: Option[String] = None, username: Option[String] = None, userConsumptionMode: Option[String] = None, ) @@ -143,5 +147,5 @@ case class OpenDataContractStandardAuthoritativeDefinition( @JsonIgnoreProperties(ignoreUnknown = true) case class OpenDataContractStandardCustomProperty( property: String, - value: String, + value: Any, ) \ No newline at end of file diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/model/OpenDataContractStandardV3Models.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/model/OpenDataContractStandardV3Models.scala index 22c33e1..5034c3b 100644 --- a/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/model/OpenDataContractStandardV3Models.scala +++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/model/OpenDataContractStandardV3Models.scala @@ -1,34 +1,51 @@ package io.github.datacatering.datacaterer.core.generator.metadata.datasource.opendatacontractstandard.model import com.fasterxml.jackson.annotation.JsonIgnoreProperties +import com.fasterxml.jackson.core.`type`.TypeReference +import com.fasterxml.jackson.module.scala.JsonScalaEnumeration +import io.github.datacatering.datacaterer.core.generator.metadata.datasource.opendatacontractstandard.model object KindEnum extends Enumeration { + type KindEnum = Value val DataContract = Value } - +class KindEnumCls extends TypeReference[KindEnum.type] object ApiVersionEnum extends Enumeration { - val `v3.0.0`, `v2.2.2`, `v2.2.1`, `v2.2.0` = Value + type ApiVersionEnum = Value + val `v3.0.0`: model.ApiVersionEnum.Value = Value(0, "v3.0.0") + val `v2.2.2`: model.ApiVersionEnum.Value = Value(1, "v2.2.2") + val `v2.2.1`: model.ApiVersionEnum.Value = Value(2, "v2.2.1") + val `v2.2.0`: model.ApiVersionEnum.Value = Value(3, "v2.2.0") } +class ApiVersionEnumCls extends TypeReference[ApiVersionEnum.type] object ServerTypeEnum extends Enumeration { - val api, athena, azure, bigquery, clickhouse, databricks, denodo, dremio, duckdb, glue, cloudsql, db2, informix, kafka, kinesis, local, mysql, oracle, postgresql, postgres, presto, pubsub, redshift, s3, sftp, snowflake, sqlserver, synapse, trino, vertica, custom = Value + type ServerTypeEnum = Value + val api, athena, azure, bigquery, clickhouse, databricks, denodo, dremio, duckdb, glue, cloudsql, db2, informix, + kafka, kinesis, local, mysql, oracle, postgresql, postgres, presto, pubsub, redshift, s3, sftp, snowflake, + sqlserver, synapse, trino, vertica, custom = Value } +class ServerTypeEnumCls extends TypeReference[ServerTypeEnum.type] object LogicalTypeEnum extends Enumeration { + type LogicalTypeEnum = Value val string, date, number, integer, `object`, array, boolean = Value } +class LogicalTypeEnumCls extends TypeReference[LogicalTypeEnum.type] object DataQualityTypeEnum extends Enumeration { + type DataQualityTypeEnum = Value val text, library, sql, custom = Value } +class DataQualityTypeEnumCls extends TypeReference[DataQualityTypeEnum.type] @JsonIgnoreProperties(ignoreUnknown = true) case class OpenDataContractStandardV3( - apiVersion: ApiVersionEnum.Value, + @JsonScalaEnumeration(classOf[ApiVersionEnumCls]) apiVersion: ApiVersionEnum.ApiVersionEnum, id: String, - kind: KindEnum.Value, + @JsonScalaEnumeration(classOf[KindEnumCls]) kind: KindEnum.KindEnum, status: String, version: String, contractCreatedTs: Option[String] = None, @@ -63,7 +80,7 @@ case class OpenDataContractStandardV3( @JsonIgnoreProperties(ignoreUnknown = true) case class OpenDataContractStandardServerV3( server: String, - `type`: ServerTypeEnum.Value, + @JsonScalaEnumeration(classOf[ServerTypeEnumCls]) `type`: ServerTypeEnum.ServerTypeEnum, description: Option[String], environment: Option[String], roles: Option[Array[OpenDataContractStandardRole]], @@ -90,7 +107,7 @@ case class OpenDataContractStandardSchemaV3( @JsonIgnoreProperties(ignoreUnknown = true) case class OpenDataContractStandardElementV3( name: String, - logicalType: LogicalTypeEnum.Value, + @JsonScalaEnumeration(classOf[LogicalTypeEnumCls]) logicalType: LogicalTypeEnum.LogicalTypeEnum, physicalType: String, authoritativeDefinitions: Option[Array[OpenDataContractStandardAuthoritativeDefinition]] = None, businessName: Option[String] = None, @@ -136,7 +153,7 @@ case class OpenDataContractStandardLogicalTypeOptionsV3( @JsonIgnoreProperties(ignoreUnknown = true) case class OpenDataContractStandardDataQualityV3( - `type`: DataQualityTypeEnum.Value, + @JsonScalaEnumeration(classOf[DataQualityTypeEnumCls]) `type`: DataQualityTypeEnum.DataQualityTypeEnum, authoritativeDefinitions: Option[Array[OpenDataContractStandardAuthoritativeDefinition]] = None, businessImpact: Option[String] = None, code: Option[String] = None, diff --git a/app/src/test/resources/sample/metadata/odcs/full-example-v3.odcs.yaml b/app/src/test/resources/sample/metadata/odcs/full-example-v3.odcs.yaml new file mode 100644 index 0000000..3dfffe2 --- /dev/null +++ b/app/src/test/resources/sample/metadata/odcs/full-example-v3.odcs.yaml @@ -0,0 +1,233 @@ +# What's this data contract about? +domain: seller # Domain +dataProduct: my quantum # Data product name +version: 1.1.0 # Version (follows semantic versioning) +status: current +id: 53581432-6c55-4ba2-a65f-72344a91553a + +# Lots of information +description: + purpose: Views built on top of the seller tables. + limitations: Data based on seller perspective, no buyer information + usage: Predict sales over time +tenant: ClimateQuantumInc + +kind: DataContract +apiVersion: v3.0.0 # Standard version (follows semantic versioning) + +# Infrastructure & servers +servers: + - server: my-postgres + type: postgres + host: localhost + port: 5432 + database: pypl-edw + schema: pp_access_views + +# Dataset, schema and quality +schema: + - name: tbl + physicalName: tbl_1 + physicalType: table + description: Provides core payment metrics + authoritativeDefinitions: + - url: https://catalog.data.gov/dataset/air-quality + type: businessDefinition + - url: https://youtu.be/jbY1BKFj9ec + type: videoTutorial + tags: [ ] + dataGranularityDescription: Aggregation on columns txn_ref_dt, pmt_txn_id + properties: + - name: txn_ref_dt + primaryKey: false + primaryKeyPosition: -1 + businessName: transaction reference date + logicalType: date + physicalType: date + required: false + description: Reference date for transaction + partitioned: true + partitionKeyPosition: 1 + criticalDataElement: false + tags: [ ] + classification: public + transformSourceObjects: + - table_name_1 + - table_name_2 + - table_name_3 + transformLogic: sel t1.txn_dt as txn_ref_dt from table_name_1 as t1, table_name_2 as t2, table_name_3 as t3 where t1.txn_dt=date-3 + transformDescription: defines the logic in business terms; logic for dummies + examples: + - "2022-10-03" + - "2020-01-28" + customProperties: + - property: anonymizationStrategy + value: none + - name: rcvr_id + primaryKey: true + primaryKeyPosition: 1 + businessName: receiver id + logicalType: string + physicalType: varchar(18) + required: false + description: A description for column rcvr_id. + partitioned: false + partitionKeyPosition: -1 + criticalDataElement: false + tags: [ ] + classification: restricted + - name: rcvr_cntry_code + primaryKey: false + primaryKeyPosition: -1 + businessName: receiver country code + logicalType: string + physicalType: varchar(2) + required: false + description: Country code + partitioned: false + partitionKeyPosition: -1 + criticalDataElement: false + tags: [ ] + classification: public + authoritativeDefinitions: + - url: https://collibra.com/asset/742b358f-71a5-4ab1-bda4-dcdba9418c25 + type: businessDefinition + - url: https://github.com/myorg/myrepo + type: transformationImplementation + - url: jdbc:postgresql://localhost:5432/adventureworks/tbl_1/rcvr_cntry_code + type: implementation + encryptedName: rcvr_cntry_code_encrypted + quality: + - rule: nullCheck + description: column should not contain null values + dimension: completeness # dropdown 7 values + type: library + severity: error + businessImpact: operational + schedule: 0 20 * * * + scheduler: cron + customProperties: + - property: FIELD_NAME + value: + - property: COMPARE_TO + value: + - property: COMPARISON_TYPE + value: Greater than + quality: + - rule: countCheck + type: library + description: Ensure row count is within expected volume range + dimension: completeness + method: reconciliation + severity: error + businessImpact: operational + schedule: 0 20 * * * + scheduler: cron + customProperties: + - property: business-key + value: + - txn_ref_dt + - rcvr_id + + +# Pricing +price: + priceAmount: 9.95 + priceCurrency: USD + priceUnit: megabyte + + +# Team +team: + - username: ceastwood + role: Data Scientist + dateIn: "2022-08-02" + dateOut: "2022-10-01" + replacedByUsername: mhopper + - username: mhopper + role: Data Scientist + dateIn: "2022-10-01" + - username: daustin + role: Owner + comment: Keeper of the grail + dateIn: "2022-10-01" + + +# Roles +roles: + - role: microstrategy_user_opr + access: read + firstLevelApprovers: Reporting Manager + secondLevelApprovers: 'mandolorian' + - role: bq_queryman_user_opr + access: read + firstLevelApprovers: Reporting Manager + secondLevelApprovers: na + - role: risk_data_access_opr + access: read + firstLevelApprovers: Reporting Manager + secondLevelApprovers: 'dathvador' + - role: bq_unica_user_opr + access: write + firstLevelApprovers: Reporting Manager + secondLevelApprovers: 'mickey' + +# SLA +slaDefaultElement: tab1.txn_ref_dt +slaProperties: + - property: latency # Property, see list of values in DP QoS + value: 4 + unit: d # d, day, days for days; y, yr, years for years + element: tab1.txn_ref_dt # This would not be needed as it is the same table.column as the default one + - property: generalAvailability + value: "2022-05-12T09:30:10-08:00" + - property: endOfSupport + value: "2032-05-12T09:30:10-08:00" + - property: endOfLife + value: "2042-05-12T09:30:10-08:00" + - property: retention + value: 3 + unit: y + element: tab1.txn_ref_dt + - property: frequency + value: 1 + valueExt: 1 + unit: d + element: tab1.txn_ref_dt + - property: timeOfAvailability + value: 09:00-08:00 + element: tab1.txn_ref_dt + driver: regulatory # Describes the importance of the SLA: [regulatory|analytics|operational|...] + - property: timeOfAvailability + value: 08:00-08:00 + element: tab1.txn_ref_dt + driver: analytics + + +# Support +support: + - channel: '#product-help' # Simple Slack communication channel + tool: slack + url: https://aidaug.slack.com/archives/C05UZRSBKLY + - channel: datacontract-ann # Simple distribution list + tool: email + url: mailto:datacontract-ann@bitol.io + - channel: Feedback # Product Feedback + description: General Product Feedback (Public) + url: https://product-feedback.com + +# Tags +tags: + - transactions + + +# Custom properties +customProperties: + - property: refRulesetName + value: gcsc.ruleset.name + - property: somePropertyName + value: property.value + - property: dataprocClusterName # Used for specific applications like Elevate + value: [ cluster name ] + +contractCreatedTs: "2022-11-15T02:59:43+00:00" diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/OpenDataContractStandardDataSourceMetadataTest.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/OpenDataContractStandardDataSourceMetadataTest.scala index d0c9f4e..404a432 100644 --- a/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/OpenDataContractStandardDataSourceMetadataTest.scala +++ b/app/src/test/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/OpenDataContractStandardDataSourceMetadataTest.scala @@ -1,6 +1,7 @@ package io.github.datacatering.datacaterer.core.generator.metadata.datasource.opendatacontractstandard -import io.github.datacatering.datacaterer.api.model.Constants.{CLUSTERING_POSITION, DATA_CONTRACT_FILE, ENABLED_NULL, FIELD_DATA_TYPE, FORMAT, IS_NULLABLE, IS_PRIMARY_KEY, IS_UNIQUE, METADATA_IDENTIFIER, PASSWORD, PRIMARY_KEY_POSITION, URL, USERNAME} +import io.github.datacatering.datacaterer.api.model.Constants.{CLUSTERING_POSITION, DATA_CONTRACT_FILE, ENABLED_NULL, FIELD_DATA_TYPE, IS_NULLABLE, IS_PRIMARY_KEY, IS_UNIQUE, METADATA_IDENTIFIER, PRIMARY_KEY_POSITION} +import io.github.datacatering.datacaterer.core.generator.metadata.datasource.SubDataSourceMetadata import io.github.datacatering.datacaterer.core.util.SparkSuite import org.junit.runner.RunWith import org.scalatestplus.junit.JUnitRunner @@ -13,13 +14,23 @@ class OpenDataContractStandardDataSourceMetadataTest extends SparkSuite { val odcsMetadata = OpenDataContractStandardDataSourceMetadata("odcs", "parquet", connectionConfig) val result = odcsMetadata.getSubDataSourcesMetadata + validateResult(connectionConfig, result) + } + + test("Can convert ODCS v3.0.0 file to column metadata") { + val connectionConfig = Map(DATA_CONTRACT_FILE -> "src/test/resources/sample/metadata/odcs/full-example-v3.odcs.yaml") + val odcsMetadata = OpenDataContractStandardDataSourceMetadata("odcs", "parquet", connectionConfig) + val result = odcsMetadata.getSubDataSourcesMetadata + + validateResult(connectionConfig, result, false) + } + + private def validateResult( + connectionConfig: Map[String, String], + result: Array[SubDataSourceMetadata], + isVersion2: Boolean = true + ) = { assertResult(1)(result.length) - val expectedReadOptions = Map( - URL -> "localhost:5432", - USERNAME -> "${env.username}", - PASSWORD -> "${env.password}", - FORMAT -> "csv", - ) connectionConfig.foreach(kv => assert(result.head.readOptions(kv._1) == kv._2)) assertResult(true)(result.head.readOptions.contains(METADATA_IDENTIFIER)) assertResult(true)(result.head.optFieldMetadata.isDefined) @@ -28,42 +39,41 @@ class OpenDataContractStandardDataSourceMetadataTest extends SparkSuite { assertResult(true)(resultCols.exists(_.field == "txn_ref_dt")) val txnDateCol = resultCols.filter(_.field == "txn_ref_dt").head + val txnCluster = if (isVersion2) Map(CLUSTERING_POSITION -> "-1") else Map() val expectedTxnDateMetadata = Map( IS_PRIMARY_KEY -> "false", IS_NULLABLE -> "false", ENABLED_NULL -> "false", IS_UNIQUE -> "false", PRIMARY_KEY_POSITION -> "-1", - CLUSTERING_POSITION -> "-1", FIELD_DATA_TYPE -> "date" - ) + ) ++ txnCluster assertResult(expectedTxnDateMetadata)(txnDateCol.metadata) assertResult(true)(resultCols.exists(_.field == "rcvr_id")) val rcvrIdCol = resultCols.filter(_.field == "rcvr_id").head + val rcvrIdCluster = if (isVersion2) Map(CLUSTERING_POSITION -> "1") else Map() val expectedRcvrIdMetadata = Map( IS_PRIMARY_KEY -> "true", IS_NULLABLE -> "false", ENABLED_NULL -> "false", IS_UNIQUE -> "false", PRIMARY_KEY_POSITION -> "1", - CLUSTERING_POSITION -> "1", FIELD_DATA_TYPE -> "string" - ) + ) ++ rcvrIdCluster assertResult(expectedRcvrIdMetadata)(rcvrIdCol.metadata) assertResult(true)(resultCols.exists(_.field == "rcvr_cntry_code")) val countryCodeCol = resultCols.filter(_.field == "rcvr_cntry_code").head + val countryCodeCluster = if (isVersion2) Map(CLUSTERING_POSITION -> "-1") else Map() val expectedCountryCodeMetadata = Map( IS_PRIMARY_KEY -> "false", IS_NULLABLE -> "false", ENABLED_NULL -> "false", IS_UNIQUE -> "false", PRIMARY_KEY_POSITION -> "-1", - CLUSTERING_POSITION -> "-1", FIELD_DATA_TYPE -> "string" - ) + ) ++ countryCodeCluster assertResult(expectedCountryCodeMetadata)(countryCodeCol.metadata) } - } diff --git a/gradle.properties b/gradle.properties index de7e51c..18550c9 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,5 +1,5 @@ groupId=io.github.data-catering -version=0.12.1 +version=0.12.2 scalaVersion=2.12 scalaSpecificVersion=2.12.19