From 8dfd8fbbaf5bdeff787294c7f0823126423de05c Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 24 Feb 2025 11:57:51 -0800 Subject: [PATCH] feat(ingest): migrate Cassandra source to new SDK (#12695) --- metadata-ingestion/scripts/avro_codegen.py | 2 +- .../src/datahub/ingestion/run/pipeline.py | 5 + .../ingestion/source/cassandra/cassandra.py | 385 ++- .../source/cassandra/cassandra_api.py | 15 +- .../src/datahub/sdk/__init__.py | 41 +- metadata-ingestion/src/datahub/sdk/_entity.py | 19 +- .../src/datahub/sdk/container.py | 4 +- metadata-ingestion/src/datahub/sdk/dataset.py | 8 +- .../cassandra/cassandra_mcps_golden.json | 2186 +++++++++-------- .../integration/cassandra/docker-compose.yml | 3 +- .../integration/cassandra/test_cassandra.py | 28 +- 11 files changed, 1435 insertions(+), 1261 deletions(-) diff --git a/metadata-ingestion/scripts/avro_codegen.py b/metadata-ingestion/scripts/avro_codegen.py index 7e75cba9833810..2fe2729349944b 100644 --- a/metadata-ingestion/scripts/avro_codegen.py +++ b/metadata-ingestion/scripts/avro_codegen.py @@ -714,7 +714,7 @@ def from_key_aspect(cls, key_aspect: "{key_aspect_class}") -> "{class_name}": code += f""" @property def {field_name(field)}(self) -> {field_type(field)}: - return self.entity_ids[{i}] + return self._entity_ids[{i}] """ return code diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index ea266f67a9c3d7..e43693f8ac9d73 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -43,6 +43,7 @@ SystemMetadataTransformer, ) from datahub.ingestion.transformer.transform_registry import transform_registry +from datahub.sdk._attribution import KnownAttribution, change_default_attribution from datahub.telemetry import stats from datahub.telemetry.telemetry import telemetry_instance from datahub.utilities._custom_package_loader import model_version_name @@ -410,6 +411,10 @@ def run(self) -> None: ) ) + self.exit_stack.enter_context( + change_default_attribution(KnownAttribution.INGESTION) + ) + self.final_status = PipelineStatus.UNKNOWN self._notify_reporters_on_ingestion_start() callback = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py index 062c64d45767fc..9966d333fdc17d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py +++ b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py @@ -1,19 +1,14 @@ import dataclasses import json import logging -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Dict, Iterable, List, Optional, Union from datahub.emitter.mce_builder import ( - make_data_platform_urn, - make_dataplatform_instance_urn, make_dataset_urn_with_platform_instance, make_schema_field_urn, ) -from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ( ContainerKey, - add_dataset_to_container, - gen_containers, ) from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( @@ -31,6 +26,7 @@ CassandraColumn, CassandraEntities, CassandraKeyspace, + CassandraSharedDatasetFields, CassandraTable, CassandraView, ) @@ -51,24 +47,21 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionSourceBase, ) -from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass from datahub.metadata.com.linkedin.pegasus2avro.schema import ( SchemaField, - SchemaMetadata, ) from datahub.metadata.schema_classes import ( - DataPlatformInstanceClass, DatasetLineageTypeClass, - DatasetPropertiesClass, FineGrainedLineageClass, FineGrainedLineageDownstreamTypeClass, FineGrainedLineageUpstreamTypeClass, - OtherSchemaClass, - SubTypesClass, UpstreamClass, UpstreamLineageClass, ViewPropertiesClass, ) +from datahub.sdk._entity import Entity +from datahub.sdk.container import Container +from datahub.sdk.dataset import Dataset logger = logging.getLogger(__name__) @@ -133,6 +126,13 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: def get_workunits_internal( self, ) -> Iterable[MetadataWorkUnit]: + for metadata in self._get_metadata(): + if isinstance(metadata, MetadataWorkUnit): + yield metadata + else: + yield from metadata.as_workunits() + + def _get_metadata(self) -> Iterable[Union[MetadataWorkUnit, Entity]]: if not self.cassandra_api.authenticate(): return keyspaces: List[CassandraKeyspace] = self.cassandra_api.get_keyspaces() @@ -145,7 +145,7 @@ def get_workunits_internal( self.report.report_dropped(keyspace_name) continue - yield from self._generate_keyspace_container(keyspace) + yield self._generate_keyspace_container(keyspace) try: yield from self._extract_tables_from_keyspace(keyspace_name) @@ -170,21 +170,20 @@ def get_workunits_internal( if self.config.is_profiling_enabled(): yield from self.profiler.get_workunits(self.cassandra_data) - def _generate_keyspace_container( - self, keyspace: CassandraKeyspace - ) -> Iterable[MetadataWorkUnit]: + def _generate_keyspace_container(self, keyspace: CassandraKeyspace) -> Container: keyspace_container_key = self._generate_keyspace_container_key( keyspace.keyspace_name ) - yield from gen_containers( - container_key=keyspace_container_key, - name=keyspace.keyspace_name, + + return Container( + keyspace_container_key, + display_name=keyspace.keyspace_name, qualified_name=keyspace.keyspace_name, + subtype=DatasetContainerSubTypes.KEYSPACE, extra_properties={ "durable_writes": str(keyspace.durable_writes), "replication": json.dumps(keyspace.replication), }, - sub_types=[DatasetContainerSubTypes.KEYSPACE], ) def _generate_keyspace_container_key(self, keyspace_name: str) -> ContainerKey: @@ -196,105 +195,55 @@ def _generate_keyspace_container_key(self, keyspace_name: str) -> ContainerKey: ) # get all tables for a given keyspace, iterate over them to extract column metadata - def _extract_tables_from_keyspace( - self, keyspace_name: str - ) -> Iterable[MetadataWorkUnit]: + def _extract_tables_from_keyspace(self, keyspace_name: str) -> Iterable[Dataset]: self.cassandra_data.keyspaces.append(keyspace_name) tables: List[CassandraTable] = self.cassandra_api.get_tables(keyspace_name) for table in tables: - # define the dataset urn for this table to be used downstream - table_name: str = table.table_name - dataset_name: str = f"{keyspace_name}.{table_name}" - - if not self.config.table_pattern.allowed(dataset_name): - self.report.report_dropped(dataset_name) - continue - - self.cassandra_data.tables.setdefault(keyspace_name, []).append(table_name) - self.report.report_entity_scanned(dataset_name, ent_type="Table") - - dataset_urn = make_dataset_urn_with_platform_instance( - platform=self.platform, - name=dataset_name, - env=self.config.env, - platform_instance=self.config.platform_instance, + dataset = self._generate_table(keyspace_name, table) + if dataset: + yield dataset + + def _generate_table( + self, keyspace_name: str, table: CassandraTable + ) -> Optional[Dataset]: + table_name: str = table.table_name + dataset_name: str = f"{keyspace_name}.{table_name}" + + self.report.report_entity_scanned(dataset_name, ent_type="Table") + if not self.config.table_pattern.allowed(dataset_name): + self.report.report_dropped(dataset_name) + return None + + self.cassandra_data.tables.setdefault(keyspace_name, []).append(table_name) + + schema_fields = None + try: + schema_fields = self._extract_columns_from_table(keyspace_name, table_name) + except Exception as e: + self.report.failure( + message="Failed to extract columns from table", + context=dataset_name, + exc=e, ) - # 1. Extract columns from table, then construct and emit the schemaMetadata aspect. - try: - yield from self._extract_columns_from_table( - keyspace_name, table_name, dataset_urn - ) - except Exception as e: - self.report.failure( - message="Failed to extract columns from table", - context=table_name, - exc=e, - ) - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=StatusClass(removed=False), - ).as_workunit() - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=SubTypesClass( - typeNames=[ - DatasetSubTypes.TABLE, - ] - ), - ).as_workunit() - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=DatasetPropertiesClass( - name=table_name, - qualifiedName=f"{keyspace_name}.{table_name}", - description=table.comment, - customProperties={ - "bloom_filter_fp_chance": str(table.bloom_filter_fp_chance), - "caching": json.dumps(table.caching), - "compaction": json.dumps(table.compaction), - "compression": json.dumps(table.compression), - "crc_check_chance": str(table.crc_check_chance), - "dclocal_read_repair_chance": str( - table.dclocal_read_repair_chance - ), - "default_time_to_live": str(table.default_time_to_live), - "extensions": json.dumps(table.extensions), - "gc_grace_seconds": str(table.gc_grace_seconds), - "max_index_interval": str(table.max_index_interval), - "min_index_interval": str(table.min_index_interval), - "memtable_flush_period_in_ms": str( - table.memtable_flush_period_in_ms - ), - "read_repair_chance": str(table.read_repair_chance), - "speculative_retry": str(table.speculative_retry), - }, - ), - ).as_workunit() - - yield from add_dataset_to_container( - container_key=self._generate_keyspace_container_key(keyspace_name), - dataset_urn=dataset_urn, - ) - - if self.config.platform_instance: - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=DataPlatformInstanceClass( - platform=make_data_platform_urn(self.platform), - instance=make_dataplatform_instance_urn( - self.platform, self.config.platform_instance - ), - ), - ).as_workunit() + return Dataset( + platform=self.platform, + name=dataset_name, + env=self.config.env, + platform_instance=self.config.platform_instance, + subtype=DatasetSubTypes.TABLE, + parent_container=self._generate_keyspace_container_key(keyspace_name), + schema=schema_fields, + display_name=table_name, + qualified_name=dataset_name, + description=table.comment, + custom_properties=self._get_dataset_custom_props(table), + ) # get all columns for a given table, iterate over them to extract column metadata def _extract_columns_from_table( - self, keyspace_name: str, table_name: str, dataset_urn: str - ) -> Iterable[MetadataWorkUnit]: + self, keyspace_name: str, table_name: str + ) -> Optional[List[SchemaField]]: column_infos: List[CassandraColumn] = self.cassandra_api.get_columns( keyspace_name, table_name ) @@ -305,147 +254,117 @@ def _extract_columns_from_table( self.report.report_warning( message="Table has no columns, skipping", context=table_name ) - return + return None + # Tricky: we also save the column info to a global store. jsonable_column_infos: List[Dict[str, Any]] = [] for column in column_infos: self.cassandra_data.columns.setdefault(table_name, []).append(column) jsonable_column_infos.append(dataclasses.asdict(column)) - schema_metadata: SchemaMetadata = SchemaMetadata( - schemaName=table_name, - platform=make_data_platform_urn(self.platform), - version=0, - hash="", - platformSchema=OtherSchemaClass( - rawSchema=json.dumps(jsonable_column_infos) - ), - fields=schema_fields, - ) - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=schema_metadata, - ).as_workunit() + return schema_fields - def _extract_views_from_keyspace( - self, keyspace_name: str - ) -> Iterable[MetadataWorkUnit]: + def _extract_views_from_keyspace(self, keyspace_name: str) -> Iterable[Dataset]: views: List[CassandraView] = self.cassandra_api.get_views(keyspace_name) for view in views: - view_name: str = view.view_name - dataset_name: str = f"{keyspace_name}.{view_name}" - self.report.report_entity_scanned(dataset_name) - dataset_urn: str = make_dataset_urn_with_platform_instance( - platform=self.platform, - name=dataset_name, - env=self.config.env, - platform_instance=self.config.platform_instance, + dataset = self._generate_view(keyspace_name, view) + if dataset: + yield dataset + + def _generate_view( + self, keyspace_name: str, view: CassandraView + ) -> Optional[Dataset]: + view_name: str = view.view_name + dataset_name: str = f"{keyspace_name}.{view_name}" + + self.report.report_entity_scanned(dataset_name, ent_type="View") + if not self.config.table_pattern.allowed(dataset_name): + # TODO: Maybe add a view_pattern instead of reusing table_pattern? + self.report.report_dropped(dataset_name) + return None + + schema_fields = None + try: + schema_fields = self._extract_columns_from_table(keyspace_name, view_name) + except Exception as e: + self.report.failure( + message="Failed to extract columns from views", + context=view_name, + exc=e, ) - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=StatusClass(removed=False), - ).as_workunit() - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=SubTypesClass( - typeNames=[ - DatasetSubTypes.VIEW, - ] - ), - ).as_workunit() - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=ViewPropertiesClass( + dataset = Dataset( + platform=self.platform, + name=dataset_name, + env=self.config.env, + platform_instance=self.config.platform_instance, + subtype=DatasetSubTypes.VIEW, + parent_container=self._generate_keyspace_container_key(keyspace_name), + schema=schema_fields, + display_name=view_name, + qualified_name=dataset_name, + description=view.comment, + custom_properties=self._get_dataset_custom_props(view), + extra_aspects=[ + ViewPropertiesClass( materialized=True, viewLogic=view.where_clause, # Use the WHERE clause as view logic viewLanguage="CQL", # Use "CQL" as the language ), - ).as_workunit() - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=DatasetPropertiesClass( - name=view_name, - qualifiedName=f"{keyspace_name}.{view_name}", - description=view.comment, - customProperties={ - "bloom_filter_fp_chance": str(view.bloom_filter_fp_chance), - "caching": json.dumps(view.caching), - "compaction": json.dumps(view.compaction), - "compression": json.dumps(view.compression), - "crc_check_chance": str(view.crc_check_chance), - "include_all_columns": str(view.include_all_columns), - "dclocal_read_repair_chance": str( - view.dclocal_read_repair_chance - ), - "default_time_to_live": str(view.default_time_to_live), - "extensions": json.dumps(view.extensions), - "gc_grace_seconds": str(view.gc_grace_seconds), - "max_index_interval": str(view.max_index_interval), - "min_index_interval": str(view.min_index_interval), - "memtable_flush_period_in_ms": str( - view.memtable_flush_period_in_ms - ), - "read_repair_chance": str(view.read_repair_chance), - "speculative_retry": str(view.speculative_retry), - }, - ), - ).as_workunit() + ], + ) - try: - yield from self._extract_columns_from_table( - keyspace_name, view_name, dataset_urn - ) - except Exception as e: - self.report.failure( - message="Failed to extract columns from views", - context=view_name, - exc=e, + # Construct and emit lineage off of 'base_table_name' + # NOTE: we don't need to use 'base_table_id' since table is always in same keyspace, see https://docs.datastax.com/en/cql-oss/3.3/cql/cql_reference/cqlCreateMaterializedView.html#cqlCreateMaterializedView__keyspace-name + upstream_urn: str = make_dataset_urn_with_platform_instance( + platform=self.platform, + name=f"{keyspace_name}.{view.base_table_name}", + env=self.config.env, + platform_instance=self.config.platform_instance, + ) + fineGrainedLineages = self.get_upstream_fields_of_field_in_datasource( + view_name, str(dataset.urn), upstream_urn + ) + upstream_lineage = UpstreamLineageClass( + upstreams=[ + UpstreamClass( + dataset=upstream_urn, + type=DatasetLineageTypeClass.VIEW, ) + ], + fineGrainedLineages=fineGrainedLineages, + ) - # Construct and emit lineage off of 'base_table_name' - # NOTE: we don't need to use 'base_table_id' since table is always in same keyspace, see https://docs.datastax.com/en/cql-oss/3.3/cql/cql_reference/cqlCreateMaterializedView.html#cqlCreateMaterializedView__keyspace-name - upstream_urn: str = make_dataset_urn_with_platform_instance( - platform=self.platform, - name=f"{keyspace_name}.{view.table_name}", - env=self.config.env, - platform_instance=self.config.platform_instance, - ) - fineGrainedLineages = self.get_upstream_fields_of_field_in_datasource( - view_name, dataset_urn, upstream_urn - ) - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=UpstreamLineageClass( - upstreams=[ - UpstreamClass( - dataset=upstream_urn, - type=DatasetLineageTypeClass.VIEW, - ) - ], - fineGrainedLineages=fineGrainedLineages, - ), - ).as_workunit() - - yield from add_dataset_to_container( - container_key=self._generate_keyspace_container_key(keyspace_name), - dataset_urn=dataset_urn, + dataset.set_upstreams(upstream_lineage) + + return dataset + + def _get_dataset_custom_props( + self, dataset: CassandraSharedDatasetFields + ) -> Dict[str, str]: + props = { + "bloom_filter_fp_chance": str(dataset.bloom_filter_fp_chance), + "caching": json.dumps(dataset.caching), + "compaction": json.dumps(dataset.compaction), + "compression": json.dumps(dataset.compression), + "crc_check_chance": str(dataset.crc_check_chance), + "dclocal_read_repair_chance": str(dataset.dclocal_read_repair_chance), + "default_time_to_live": str(dataset.default_time_to_live), + "extensions": json.dumps(dataset.extensions), + "gc_grace_seconds": str(dataset.gc_grace_seconds), + "max_index_interval": str(dataset.max_index_interval), + "min_index_interval": str(dataset.min_index_interval), + "memtable_flush_period_in_ms": str(dataset.memtable_flush_period_in_ms), + "read_repair_chance": str(dataset.read_repair_chance), + "speculative_retry": str(dataset.speculative_retry), + } + if isinstance(dataset, CassandraView): + props.update( + { + "include_all_columns": str(dataset.include_all_columns), + } ) - - if self.config.platform_instance: - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=DataPlatformInstanceClass( - platform=make_data_platform_urn(self.platform), - instance=make_dataplatform_instance_urn( - self.platform, self.config.platform_instance - ), - ), - ).as_workunit() + return props def get_upstream_fields_of_field_in_datasource( self, table_name: str, dataset_urn: str, upstream_urn: str diff --git a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_api.py b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_api.py index 4cf0613762aab8..c1a813eb6ee349 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_api.py @@ -23,9 +23,9 @@ class CassandraKeyspace: @dataclass -class CassandraTable: +class CassandraSharedDatasetFields: keyspace_name: str - table_name: str + bloom_filter_fp_chance: Optional[float] caching: Optional[Dict[str, str]] comment: Optional[str] @@ -43,6 +43,11 @@ class CassandraTable: speculative_retry: Optional[str] +@dataclass +class CassandraTable(CassandraSharedDatasetFields): + table_name: str + + @dataclass class CassandraColumn: keyspace_name: str @@ -55,8 +60,10 @@ class CassandraColumn: @dataclass -class CassandraView(CassandraTable): +class CassandraView(CassandraSharedDatasetFields): view_name: str + + base_table_name: str include_all_columns: Optional[bool] where_clause: str = "" @@ -261,7 +268,7 @@ def get_views(self, keyspace_name: str) -> List[CassandraView]: views = self.get(CassandraQueries.GET_VIEWS_QUERY, [keyspace_name]) view_list = [ CassandraView( - table_name=row.base_table_name, + base_table_name=row.base_table_name, keyspace_name=row.keyspace_name, view_name=row.view_name, bloom_filter_fp_chance=row.bloom_filter_fp_chance, diff --git a/metadata-ingestion/src/datahub/sdk/__init__.py b/metadata-ingestion/src/datahub/sdk/__init__.py index 54bd18c3230476..ec7ecf4ce06880 100644 --- a/metadata-ingestion/src/datahub/sdk/__init__.py +++ b/metadata-ingestion/src/datahub/sdk/__init__.py @@ -1,7 +1,7 @@ -import warnings +import types import datahub.metadata.schema_classes as models -from datahub.errors import ExperimentalWarning, SdkUsageError +from datahub.errors import SdkUsageError from datahub.ingestion.graph.config import DatahubClientConfig from datahub.metadata.urns import ( ChartUrn, @@ -21,13 +21,30 @@ from datahub.sdk.dataset import Dataset from datahub.sdk.main_client import DataHubClient -warnings.warn( - "The new datahub SDK (e.g. datahub.sdk.*) is experimental. " - "Our typical backwards-compatibility and stability guarantees do not apply to this code. " - "When it's promoted to stable, the import path will change " - "from `from datahub.sdk import ...` to `from datahub import ...`.", - ExperimentalWarning, - stacklevel=2, -) -del warnings -del ExperimentalWarning +# We want to print out the warning if people do `from datahub.sdk import X`. +# But we don't want to print out warnings if they're doing a more direct +# import like `from datahub.sdk.container import Container`, since that's +# what our internal code does. +_vars = {} +for _name, _value in list(locals().items()): + if not _name.startswith("_") and ( + _name == "models" or not isinstance(_value, types.ModuleType) + ): + _vars[_name] = _value + del locals()[_name] + + +def __getattr__(name): + import warnings + + from datahub.errors import ExperimentalWarning + + warnings.warn( + "The new datahub SDK (e.g. datahub.sdk.*) is experimental. " + "Our typical backwards-compatibility and stability guarantees do not apply to this code. " + "When it's promoted to stable, the import path will change " + "from `from datahub.sdk import ...` to `from datahub import ...`.", + ExperimentalWarning, + stacklevel=2, + ) + return _vars[name] diff --git a/metadata-ingestion/src/datahub/sdk/_entity.py b/metadata-ingestion/src/datahub/sdk/_entity.py index 071affc27eff26..f5887e4e0fb803 100644 --- a/metadata-ingestion/src/datahub/sdk/_entity.py +++ b/metadata-ingestion/src/datahub/sdk/_entity.py @@ -1,5 +1,7 @@ +from __future__ import annotations + import abc -from typing import List, Optional, Type, Union +from typing import TYPE_CHECKING, List, Optional, Type, Union from typing_extensions import Self @@ -10,6 +12,12 @@ from datahub.metadata.urns import Urn from datahub.utilities.urns._urn_base import _SpecificUrn +if TYPE_CHECKING: + from datahub.ingestion.api.workunit import MetadataWorkUnit + + +ExtraAspectsType = Union[None, List[AspectTypeVar]] + class Entity: __slots__ = ("_urn", "_prev_aspects", "_aspects") @@ -87,5 +95,14 @@ def _as_mcps( ) return mcps + def as_workunits(self) -> List[MetadataWorkUnit]: + return [mcp.as_workunit() for mcp in self._as_mcps()] + + def _set_extra_aspects(self, extra_aspects: ExtraAspectsType) -> None: + # TODO: Add validation to ensure that an "extra aspect" does not conflict + # with / get overridden by a standard aspect. + for aspect in extra_aspects or []: + self._set_aspect(aspect) + def __repr__(self) -> str: return f"{self.__class__.__name__}('{self.urn}')" diff --git a/metadata-ingestion/src/datahub/sdk/container.py b/metadata-ingestion/src/datahub/sdk/container.py index e9ecc1989e995e..ec4d6521c60887 100644 --- a/metadata-ingestion/src/datahub/sdk/container.py +++ b/metadata-ingestion/src/datahub/sdk/container.py @@ -16,7 +16,7 @@ ContainerUrn, Urn, ) -from datahub.sdk._entity import Entity +from datahub.sdk._entity import Entity, ExtraAspectsType from datahub.sdk._shared import ( DomainInputType, HasContainer, @@ -74,6 +74,7 @@ def __init__( tags: Optional[TagsInputType] = None, terms: Optional[TermsInputType] = None, domain: Optional[DomainInputType] = None, + extra_aspects: ExtraAspectsType = None, ): # Hack: while the type annotations say container_key is always a ContainerKey, # we allow ContainerUrn to make the graph-based constructor work. @@ -82,6 +83,7 @@ def __init__( else: urn = ContainerUrn.from_string(container_key.as_urn()) super().__init__(urn) + self._set_extra_aspects(extra_aspects) # This needs to come first to ensure that the display name is registered. self._ensure_container_props(name=display_name) diff --git a/metadata-ingestion/src/datahub/sdk/dataset.py b/metadata-ingestion/src/datahub/sdk/dataset.py index bb7306a1acc1c5..6d241627e58d19 100644 --- a/metadata-ingestion/src/datahub/sdk/dataset.py +++ b/metadata-ingestion/src/datahub/sdk/dataset.py @@ -2,7 +2,7 @@ import warnings from datetime import datetime -from typing import Dict, List, Optional, Tuple, Type, Union +from typing import Dict, List, Optional, Sequence, Tuple, Type, Union from typing_extensions import Self, TypeAlias, assert_never @@ -18,7 +18,7 @@ from datahub.ingestion.source.sql.sql_types import resolve_sql_type from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn, Urn from datahub.sdk._attribution import is_ingestion_attribution -from datahub.sdk._entity import Entity +from datahub.sdk._entity import Entity, ExtraAspectsType from datahub.sdk._shared import ( DatasetUrnOrStr, DomainInputType, @@ -47,7 +47,7 @@ models.SchemaFieldClass, ] SchemaFieldsInputType: TypeAlias = Union[ - List[SchemaFieldInputType], + Sequence[SchemaFieldInputType], models.SchemaMetadataClass, ] @@ -457,6 +457,7 @@ def __init__( terms: Optional[TermsInputType] = None, # TODO structured_properties domain: Optional[DomainInputType] = None, + extra_aspects: ExtraAspectsType = None, # Dataset-specific aspects. schema: Optional[SchemaFieldsInputType] = None, upstreams: Optional[models.UpstreamLineageClass] = None, @@ -468,6 +469,7 @@ def __init__( env=env, ) super().__init__(urn) + self._set_extra_aspects(extra_aspects) self._set_platform_instance(urn.platform, platform_instance) diff --git a/metadata-ingestion/tests/integration/cassandra/cassandra_mcps_golden.json b/metadata-ingestion/tests/integration/cassandra/cassandra_mcps_golden.json index 1823a218ada2e0..fb0bca406b9256 100644 --- a/metadata-ingestion/tests/integration/cassandra/cassandra_mcps_golden.json +++ b/metadata-ingestion/tests/integration/cassandra/cassandra_mcps_golden.json @@ -1,64 +1,66 @@ [ { "entityType": "container", - "entityUrn": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05", + "entityUrn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "cassandra", + "instance": "dev_instance", "env": "PROD", - "keyspace": "cass_test_1", + "keyspace": "example_keyspace", "durable_writes": "True", "replication": "{\"class\": \"org.apache.cassandra.locator.SimpleStrategy\", \"replication_factor\": \"1\"}" }, - "name": "cass_test_1", - "qualifiedName": "cass_test_1", + "name": "example_keyspace", + "qualifiedName": "example_keyspace", "env": "PROD" } }, "systemMetadata": { - "lastObserved": 1731579516869, + "lastObserved": 1739924675276, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05", + "entityUrn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "status", "aspect": { "json": { - "platform": "urn:li:dataPlatform:cassandra" + "removed": false } }, "systemMetadata": { - "lastObserved": 1731309924399, + "lastObserved": 1739924675277, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05", + "entityUrn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "removed": false + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" } }, "systemMetadata": { - "lastObserved": 1731309924399, + "lastObserved": 1739924675278, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05", + "entityUrn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -69,174 +71,40 @@ } }, "systemMetadata": { - "lastObserved": 1731309924400, + "lastObserved": 1739924675280, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05", + "entityUrn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { - "path": [] - } - }, - "systemMetadata": { - "lastObserved": 1731309924400, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.information,PROD)", - "changeType": "UPSERT", - "aspectName": "schemaMetadata", - "aspect": { - "json": { - "schemaName": "information", - "platform": "urn:li:dataPlatform:cassandra", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"cass_test_1\", \"table_name\": \"information\", \"column_name\": \"details\", \"type\": \"text\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"cass_test_1\", \"table_name\": \"information\", \"column_name\": \"last_updated\", \"type\": \"timestamp\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"cass_test_1\", \"table_name\": \"information\", \"column_name\": \"person_id\", \"type\": \"int\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}]" - } - }, - "fields": [ - { - "fieldPath": "details", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "text", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "last_updated", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.TimeType": {} - } - }, - "nativeDataType": "timestamp", - "recursive": false, - "isPartOfKey": false - }, + "path": [ { - "fieldPath": "person_id", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.NumberType": {} - } - }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" } ] } }, "systemMetadata": { - "lastObserved": 1731591019538, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.information,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1731309924405, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.information,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Table" - ] - } - }, - "systemMetadata": { - "lastObserved": 1731309924405, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.information,PROD)", - "changeType": "UPSERT", - "aspectName": "datasetProperties", - "aspect": { - "json": { - "customProperties": { - "bloom_filter_fp_chance": "0.01", - "caching": "{\"keys\": \"ALL\", \"rows_per_partition\": \"NONE\"}", - "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", - "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", - "crc_check_chance": "1.0", - "dclocal_read_repair_chance": "0.0", - "default_time_to_live": "0", - "extensions": "{}", - "gc_grace_seconds": "864000", - "max_index_interval": "2048", - "min_index_interval": "128", - "memtable_flush_period_in_ms": "0", - "read_repair_chance": "0.0", - "speculative_retry": "99p" - }, - "name": "information", - "qualifiedName": "cass_test_1.information", - "description": "", - "tags": [] - } - }, - "systemMetadata": { - "lastObserved": 1731591019540, + "lastObserved": 1739924675281, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "all_data_types", + "schemaName": "", "platform": "urn:li:dataPlatform:cassandra", "version": 0, "created": { @@ -249,9 +117,7 @@ }, "hash": "", "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"ascii_column\", \"type\": \"ascii\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"bigint_column\", \"type\": \"bigint\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"blob_column\", \"type\": \"blob\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"boolean_column\", \"type\": \"boolean\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"date_column\", \"type\": \"date\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"decimal_column\", \"type\": \"decimal\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"double_column\", \"type\": \"double\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"float_column\", \"type\": \"float\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"frozen_list_column\", \"type\": \"frozen>\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"frozen_map_column\", \"type\": \"frozen>\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"frozen_set_column\", \"type\": \"frozen>\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"id\", \"type\": \"uuid\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"inet_column\", \"type\": \"inet\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"int_column\", \"type\": \"int\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"list_column\", \"type\": \"list\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"map_column\", \"type\": \"map\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"set_column\", \"type\": \"set\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"smallint_column\", \"type\": \"smallint\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"text_column\", \"type\": \"text\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"time_column\", \"type\": \"time\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"timestamp_column\", \"type\": \"timestamp\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"timeuuid_column\", \"type\": \"timeuuid\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"tinyint_column\", \"type\": \"tinyint\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"tuple_column\", \"type\": \"frozen>\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"uuid_column\", \"type\": \"uuid\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"varchar_column\", \"type\": \"text\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"varint_column\", \"type\": \"varint\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}]" - } + "com.linkedin.schema.Schemaless": {} }, "fields": [ { @@ -582,100 +448,14 @@ } }, "systemMetadata": { - "lastObserved": 1731591019435, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1731310097192, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.people,PROD)", - "changeType": "UPSERT", - "aspectName": "schemaMetadata", - "aspect": { - "json": { - "schemaName": "people", - "platform": "urn:li:dataPlatform:cassandra", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"cass_test_1\", \"table_name\": \"people\", \"column_name\": \"email\", \"type\": \"text\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"cass_test_1\", \"table_name\": \"people\", \"column_name\": \"name\", \"type\": \"text\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"cass_test_1\", \"table_name\": \"people\", \"column_name\": \"person_id\", \"type\": \"int\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}]" - } - }, - "fields": [ - { - "fieldPath": "email", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "text", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "name", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "text", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "person_id", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.NumberType": {} - } - }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1731591019563, + "lastObserved": 1739997601555, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.people,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -684,14 +464,14 @@ } }, "systemMetadata": { - "lastObserved": 1731309924412, + "lastObserved": 1739924675311, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.people,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -702,14 +482,14 @@ } }, "systemMetadata": { - "lastObserved": 1731309924412, + "lastObserved": 1739924675312, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.people,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { @@ -730,104 +510,135 @@ "read_repair_chance": "0.0", "speculative_retry": "99p" }, - "name": "people", - "qualifiedName": "cass_test_1.people", - "description": "", + "name": "all_data_types", + "qualifiedName": "example_keyspace.all_data_types", + "description": "Table containing all supported Cassandra data types, excluding counters", "tags": [] } }, "systemMetadata": { - "lastObserved": 1731591019564, + "lastObserved": 1739924675313, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.information,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05", - "urn": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05" - } - ] + "container": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" } }, "systemMetadata": { - "lastObserved": 1731309924406, + "lastObserved": 1739924675314, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.information,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "container": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05" + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" } }, "systemMetadata": { - "lastObserved": 1731309924406, + "lastObserved": 1739924675315, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:0381892d0717b54887d087eaafd95d2b", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", "changeType": "UPSERT", - "aspectName": "containerProperties", + "aspectName": "browsePathsV2", "aspect": { "json": { - "customProperties": { - "platform": "cassandra", - "env": "PROD", - "keyspace": "cass_test_2", - "durable_writes": "True", - "replication": "{\"class\": \"org.apache.cassandra.locator.SimpleStrategy\", \"replication_factor\": \"1\"}" - }, - "name": "cass_test_2", - "qualifiedName": "cass_test_2", - "env": "PROD" + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", + "urn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" + } + ] } }, "systemMetadata": { - "lastObserved": 1731579516849, + "lastObserved": 1739924675316, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.people,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.counter_table,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "schemaMetadata", "aspect": { "json": { - "path": [ + "schemaName": "", + "platform": "urn:li:dataPlatform:cassandra", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.Schemaless": {} + }, + "fields": [ + { + "fieldPath": "counter_column", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "counter", + "recursive": false, + "isPartOfKey": false + }, { - "id": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05", - "urn": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05" + "fieldPath": "id", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "uuid", + "recursive": false, + "isPartOfKey": false } ] } }, "systemMetadata": { - "lastObserved": 1731309924413, + "lastObserved": 1739997601577, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:0381892d0717b54887d087eaafd95d2b", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.counter_table,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -836,51 +647,130 @@ } }, "systemMetadata": { - "lastObserved": 1731309924420, + "lastObserved": 1739924675326, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.counter_table,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1739924675327, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.counter_table,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "bloom_filter_fp_chance": "0.01", + "caching": "{\"keys\": \"ALL\", \"rows_per_partition\": \"NONE\"}", + "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", + "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", + "crc_check_chance": "1.0", + "dclocal_read_repair_chance": "0.0", + "default_time_to_live": "0", + "extensions": "{}", + "gc_grace_seconds": "864000", + "max_index_interval": "2048", + "min_index_interval": "128", + "memtable_flush_period_in_ms": "0", + "read_repair_chance": "0.0", + "speculative_retry": "99p" + }, + "name": "counter_table", + "qualifiedName": "example_keyspace.counter_table", + "description": "Separate table containing only counter column", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1739924675328, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.people,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.counter_table,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05" + "container": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" } }, "systemMetadata": { - "lastObserved": 1731309924413, + "lastObserved": 1739924675330, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:0381892d0717b54887d087eaafd95d2b", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.counter_table,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + } + }, + "systemMetadata": { + "lastObserved": 1739924675331, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.counter_table,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { - "path": [] + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", + "urn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" + } + ] } }, "systemMetadata": { - "lastObserved": 1731309924421, + "lastObserved": 1739924675332, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.shopping_cart,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "tasks", + "schemaName": "", "platform": "urn:li:dataPlatform:cassandra", "version": 0, "created": { @@ -893,25 +783,23 @@ }, "hash": "", "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"cass_test_2\", \"table_name\": \"tasks\", \"column_name\": \"details\", \"type\": \"text\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"cass_test_2\", \"table_name\": \"tasks\", \"column_name\": \"last_updated\", \"type\": \"timestamp\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"cass_test_2\", \"table_name\": \"tasks\", \"column_name\": \"status\", \"type\": \"text\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"cass_test_2\", \"table_name\": \"tasks\", \"column_name\": \"task_id\", \"type\": \"int\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}]" - } + "com.linkedin.schema.Schemaless": {} }, "fields": [ { - "fieldPath": "details", + "fieldPath": "item_count", "nullable": true, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "text", + "nativeDataType": "int", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "last_updated", + "fieldPath": "last_update_timestamp", "nullable": true, "type": { "type": { @@ -923,7 +811,7 @@ "isPartOfKey": false }, { - "fieldPath": "status", + "fieldPath": "userid", "nullable": true, "type": { "type": { @@ -933,31 +821,19 @@ "nativeDataType": "text", "recursive": false, "isPartOfKey": false - }, - { - "fieldPath": "task_id", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.NumberType": {} - } - }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false } ] } }, "systemMetadata": { - "lastObserved": 1731591019516, + "lastObserved": 1739997601596, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.shopping_cart,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -966,14 +842,14 @@ } }, "systemMetadata": { - "lastObserved": 1731309924426, + "lastObserved": 1739924675342, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.shopping_cart,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -984,14 +860,14 @@ } }, "systemMetadata": { - "lastObserved": 1731309924426, + "lastObserved": 1739924675343, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.shopping_cart,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { @@ -1012,131 +888,172 @@ "read_repair_chance": "0.0", "speculative_retry": "99p" }, - "name": "tasks", - "qualifiedName": "cass_test_2.tasks", + "name": "shopping_cart", + "qualifiedName": "example_keyspace.shopping_cart", "description": "", "tags": [] } }, "systemMetadata": { - "lastObserved": 1731591019518, + "lastObserved": 1739924675345, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:0381892d0717b54887d087eaafd95d2b", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.shopping_cart,PROD)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "container", "aspect": { "json": { - "typeNames": [ - "Keyspace" - ] + "container": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" } }, "systemMetadata": { - "lastObserved": 1731309924421, + "lastObserved": 1739924675346, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:0381892d0717b54887d087eaafd95d2b", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.shopping_cart,PROD)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:cassandra" + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" } }, "systemMetadata": { - "lastObserved": 1731309924420, + "lastObserved": 1739924675347, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.shopping_cart,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:0381892d0717b54887d087eaafd95d2b", - "urn": "urn:li:container:0381892d0717b54887d087eaafd95d2b" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", + "urn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" } ] } }, "systemMetadata": { - "lastObserved": 1731309924427, + "lastObserved": 1739924675348, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "status", "aspect": { "json": { - "container": "urn:li:container:0381892d0717b54887d087eaafd95d2b" + "removed": false } }, "systemMetadata": { - "lastObserved": 1731309924427, + "lastObserved": 1739924675355, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "subTypes", "aspect": { "json": { - "container": "urn:li:container:305f73c676989511c67d97ace119138c" + "typeNames": [ + "View" + ] } }, "systemMetadata": { - "lastObserved": 1731310097193, + "lastObserved": 1739924675356, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "viewProperties", "aspect": { "json": { - "typeNames": [ - "Table" - ] + "materialized": true, + "viewLogic": "id IS NOT NULL AND ascii_column IS NOT NULL", + "viewLanguage": "CQL" + } + }, + "systemMetadata": { + "lastObserved": 1739924675357, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "bloom_filter_fp_chance": "0.01", + "caching": "{\"keys\": \"ALL\", \"rows_per_partition\": \"NONE\"}", + "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", + "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", + "crc_check_chance": "1.0", + "include_all_columns": "False", + "dclocal_read_repair_chance": "0.0", + "default_time_to_live": "0", + "extensions": "{}", + "gc_grace_seconds": "864000", + "max_index_interval": "2048", + "min_index_interval": "128", + "memtable_flush_period_in_ms": "0", + "read_repair_chance": "0.0", + "speculative_retry": "99p" + }, + "name": "example_view_1", + "qualifiedName": "example_keyspace.example_view_1", + "description": "Example view definition with id and ascii_column", + "tags": [] } }, "systemMetadata": { - "lastObserved": 1731310097193, + "lastObserved": 1739924675358, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "task_status", + "schemaName": "", "platform": "urn:li:dataPlatform:cassandra", "version": 0, "created": { @@ -1149,32 +1066,42 @@ }, "hash": "", "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"cass_test_2\", \"table_name\": \"task_status\", \"column_name\": \"status\", \"type\": \"text\", \"clustering_order\": \"asc\", \"kind\": \"clustering\", \"position\": 0}, {\"keyspace_name\": \"cass_test_2\", \"table_name\": \"task_status\", \"column_name\": \"task_id\", \"type\": \"int\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}]" - } + "com.linkedin.schema.Schemaless": {} }, "fields": [ { - "fieldPath": "status", + "fieldPath": "ascii_column", "nullable": true, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "text", + "nativeDataType": "ascii", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "task_id", + "fieldPath": "bigint_column", "nullable": true, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "int", + "nativeDataType": "bigint", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "id", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "uuid", "recursive": false, "isPartOfKey": false } @@ -1182,70 +1109,432 @@ } }, "systemMetadata": { - "lastObserved": 1731591019525, + "lastObserved": 1739997601613, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "upstreamLineage", "aspect": { "json": { - "path": [ + "upstreams": [ { - "id": "urn:li:container:305f73c676989511c67d97ace119138c", - "urn": "urn:li:container:305f73c676989511c67d97ace119138c" + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", + "type": "VIEW" } - ] - } - }, - "systemMetadata": { - "lastObserved": 1731310097193, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", - "changeType": "UPSERT", - "aspectName": "datasetProperties", - "aspect": { - "json": { - "customProperties": { - "bloom_filter_fp_chance": "0.01", - "caching": "{\"keys\": \"ALL\", \"rows_per_partition\": \"NONE\"}", - "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", - "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", - "crc_check_chance": "1.0", - "dclocal_read_repair_chance": "0.0", - "default_time_to_live": "0", - "extensions": "{}", - "gc_grace_seconds": "864000", - "max_index_interval": "2048", - "min_index_interval": "128", - "memtable_flush_period_in_ms": "0", + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD),ascii_column)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD),ascii_column)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD),bigint_column)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD),bigint_column)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD),id)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1739924675364, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" + } + }, + "systemMetadata": { + "lastObserved": 1739924675367, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + } + }, + "systemMetadata": { + "lastObserved": 1739924675367, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", + "urn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1739924675368, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1739924675376, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1739924675377, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": true, + "viewLogic": "id IS NOT NULL AND ascii_column IS NOT NULL", + "viewLanguage": "CQL" + } + }, + "systemMetadata": { + "lastObserved": 1739924675378, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "bloom_filter_fp_chance": "0.01", + "caching": "{\"keys\": \"ALL\", \"rows_per_partition\": \"NONE\"}", + "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", + "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", + "crc_check_chance": "1.0", + "include_all_columns": "False", + "dclocal_read_repair_chance": "0.0", + "default_time_to_live": "0", + "extensions": "{}", + "gc_grace_seconds": "864000", + "max_index_interval": "2048", + "min_index_interval": "128", + "memtable_flush_period_in_ms": "0", "read_repair_chance": "0.0", "speculative_retry": "99p" }, - "name": "all_data_types", - "qualifiedName": "example_keyspace.all_data_types", - "description": "Table containing all supported Cassandra data types, excluding counters", + "name": "example_view_2", + "qualifiedName": "example_keyspace.example_view_2", + "description": "Example view definition with id and ascii_column", "tags": [] } }, "systemMetadata": { - "lastObserved": 1731591019440, + "lastObserved": 1739924675380, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "", + "platform": "urn:li:dataPlatform:cassandra", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.Schemaless": {} + }, + "fields": [ + { + "fieldPath": "ascii_column", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "ascii", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "float_column", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "float", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "id", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "uuid", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1739997601626, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD),ascii_column)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD),ascii_column)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD),float_column)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD),float_column)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD),id)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1739924675385, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" + } + }, + "systemMetadata": { + "lastObserved": 1739924675387, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + } + }, + "systemMetadata": { + "lastObserved": 1739924675388, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", + "urn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1739924675389, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "cassandra", + "instance": "dev_instance", + "env": "PROD", + "keyspace": "cass_test_2", + "durable_writes": "True", + "replication": "{\"class\": \"org.apache.cassandra.locator.SimpleStrategy\", \"replication_factor\": \"1\"}" + }, + "name": "cass_test_2", + "qualifiedName": "cass_test_2", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1739924675400, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1254,37 +1543,75 @@ } }, "systemMetadata": { - "lastObserved": 1731310097158, + "lastObserved": 1739924675402, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + } + }, + "systemMetadata": { + "lastObserved": 1739924675402, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Keyspace" + ] + } + }, + "systemMetadata": { + "lastObserved": 1739924675403, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "browsePathsV2", "aspect": { "json": { - "typeNames": [ - "View" + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + } ] } }, "systemMetadata": { - "lastObserved": 1731310097161, + "lastObserved": 1739924675404, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.counter_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "counter_table", + "schemaName": "", "platform": "urn:li:dataPlatform:cassandra", "version": 0, "created": { @@ -1297,32 +1624,54 @@ }, "hash": "", "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"example_keyspace\", \"table_name\": \"counter_table\", \"column_name\": \"counter_column\", \"type\": \"counter\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"counter_table\", \"column_name\": \"id\", \"type\": \"uuid\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}]" - } + "com.linkedin.schema.Schemaless": {} }, "fields": [ { - "fieldPath": "counter_column", + "fieldPath": "details", "nullable": true, "type": { "type": { - "com.linkedin.schema.NumberType": {} + "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "counter", + "nativeDataType": "text", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "id", + "fieldPath": "last_updated", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.TimeType": {} + } + }, + "nativeDataType": "timestamp", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "status", "nullable": true, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "uuid", + "nativeDataType": "text", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "task_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "int", "recursive": false, "isPartOfKey": false } @@ -1330,14 +1679,48 @@ } }, "systemMetadata": { - "lastObserved": 1731591019446, + "lastObserved": 1739997601663, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1739924675415, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1739924675416, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { @@ -1348,7 +1731,6 @@ "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", "crc_check_chance": "1.0", - "include_all_columns": "False", "dclocal_read_repair_chance": "0.0", "default_time_to_live": "0", "extensions": "{}", @@ -1359,126 +1741,223 @@ "read_repair_chance": "0.0", "speculative_retry": "99p" }, - "name": "task_status", - "qualifiedName": "cass_test_2.task_status", + "name": "tasks", + "qualifiedName": "cass_test_2.tasks", "description": "", "tags": [] } }, "systemMetadata": { - "lastObserved": 1731591019524, + "lastObserved": 1739924675417, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.counter_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "container", "aspect": { "json": { - "removed": false + "container": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5" } }, "systemMetadata": { - "lastObserved": 1731310097198, + "lastObserved": 1739924675419, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", "changeType": "UPSERT", - "aspectName": "viewProperties", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "materialized": true, - "viewLogic": "status IS NOT NULL AND task_id IS NOT NULL", - "viewLanguage": "CQL" + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" } }, "systemMetadata": { - "lastObserved": 1731310097161, + "lastObserved": 1739924675420, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:0381892d0717b54887d087eaafd95d2b" + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5", + "urn": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5" + } + ] } }, "systemMetadata": { - "lastObserved": 1731310097163, + "lastObserved": 1739924675421, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "status", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:0381892d0717b54887d087eaafd95d2b", - "urn": "urn:li:container:0381892d0717b54887d087eaafd95d2b" - } + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1739924675428, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" ] } }, "systemMetadata": { - "lastObserved": 1731310097163, + "lastObserved": 1739924675429, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.counter_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "viewProperties", "aspect": { "json": { - "container": "urn:li:container:305f73c676989511c67d97ace119138c" + "materialized": true, + "viewLogic": "status IS NOT NULL AND task_id IS NOT NULL", + "viewLanguage": "CQL" } }, "systemMetadata": { - "lastObserved": 1731310097199, + "lastObserved": 1739924675430, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.counter_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "datasetProperties", "aspect": { "json": { - "typeNames": [ - "Table" + "customProperties": { + "bloom_filter_fp_chance": "0.01", + "caching": "{\"keys\": \"ALL\", \"rows_per_partition\": \"NONE\"}", + "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", + "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", + "crc_check_chance": "1.0", + "include_all_columns": "False", + "dclocal_read_repair_chance": "0.0", + "default_time_to_live": "0", + "extensions": "{}", + "gc_grace_seconds": "864000", + "max_index_interval": "2048", + "min_index_interval": "128", + "memtable_flush_period_in_ms": "0", + "read_repair_chance": "0.0", + "speculative_retry": "99p" + }, + "name": "task_status", + "qualifiedName": "cass_test_2.task_status", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1739924675431, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "", + "platform": "urn:li:dataPlatform:cassandra", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.Schemaless": {} + }, + "fields": [ + { + "fieldPath": "status", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "text", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "task_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + } ] } }, "systemMetadata": { - "lastObserved": 1731310097198, + "lastObserved": 1739997601674, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -1489,7 +1968,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", "type": "VIEW" } ], @@ -1497,22 +1976,22 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD),status)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD),status)" ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD),status)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD),status)" ], "confidenceScore": 1.0 }, { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD),task_id)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD),task_id)" ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD),task_id)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD),task_id)" ], "confidenceScore": 1.0 } @@ -1520,111 +1999,98 @@ } }, "systemMetadata": { - "lastObserved": 1731447296444, + "lastObserved": 1739924675436, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.counter_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:305f73c676989511c67d97ace119138c", - "urn": "urn:li:container:305f73c676989511c67d97ace119138c" - } - ] + "container": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5" } }, "systemMetadata": { - "lastObserved": 1731310097199, + "lastObserved": 1739924675440, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.counter_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", "changeType": "UPSERT", - "aspectName": "datasetProperties", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "customProperties": { - "bloom_filter_fp_chance": "0.01", - "caching": "{\"keys\": \"ALL\", \"rows_per_partition\": \"NONE\"}", - "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", - "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", - "crc_check_chance": "1.0", - "dclocal_read_repair_chance": "0.0", - "default_time_to_live": "0", - "extensions": "{}", - "gc_grace_seconds": "864000", - "max_index_interval": "2048", - "min_index_interval": "128", - "memtable_flush_period_in_ms": "0", - "read_repair_chance": "0.0", - "speculative_retry": "99p" - }, - "name": "counter_table", - "qualifiedName": "example_keyspace.counter_table", - "description": "Separate table containing only counter column", - "tags": [] + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" } }, "systemMetadata": { - "lastObserved": 1731591019447, + "lastObserved": 1739924675441, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:305f73c676989511c67d97ace119138c", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "browsePathsV2", "aspect": { "json": { - "platform": "urn:li:dataPlatform:cassandra" + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5", + "urn": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5" + } + ] } }, "systemMetadata": { - "lastObserved": 1731310097186, + "lastObserved": 1739924675442, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:305f73c676989511c67d97ace119138c", + "entityUrn": "urn:li:container:b89ce3e714c980422ca601f9be0f54af", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "cassandra", + "instance": "dev_instance", "env": "PROD", - "keyspace": "example_keyspace", + "keyspace": "cass_test_1", "durable_writes": "True", "replication": "{\"class\": \"org.apache.cassandra.locator.SimpleStrategy\", \"replication_factor\": \"1\"}" }, - "name": "example_keyspace", - "qualifiedName": "example_keyspace", + "name": "cass_test_1", + "qualifiedName": "cass_test_1", "env": "PROD" } }, "systemMetadata": { - "lastObserved": 1731579516801, + "lastObserved": 1739924675452, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:305f73c676989511c67d97ace119138c", + "entityUrn": "urn:li:container:b89ce3e714c980422ca601f9be0f54af", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1633,14 +2099,31 @@ } }, "systemMetadata": { - "lastObserved": 1731310097185, + "lastObserved": 1739924675453, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:b89ce3e714c980422ca601f9be0f54af", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + } + }, + "systemMetadata": { + "lastObserved": 1739924675454, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:305f73c676989511c67d97ace119138c", + "entityUrn": "urn:li:container:b89ce3e714c980422ca601f9be0f54af", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1651,35 +2134,40 @@ } }, "systemMetadata": { - "lastObserved": 1731310097186, + "lastObserved": 1739924675455, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:305f73c676989511c67d97ace119138c", + "entityUrn": "urn:li:container:b89ce3e714c980422ca601f9be0f54af", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { - "path": [] + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + } + ] } }, "systemMetadata": { - "lastObserved": 1731310097186, + "lastObserved": 1739924675456, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.shopping_cart,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.information,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "shopping_cart", + "schemaName": "", "platform": "urn:li:dataPlatform:cassandra", "version": 0, "created": { @@ -1692,25 +2180,23 @@ }, "hash": "", "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"example_keyspace\", \"table_name\": \"shopping_cart\", \"column_name\": \"item_count\", \"type\": \"int\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"shopping_cart\", \"column_name\": \"last_update_timestamp\", \"type\": \"timestamp\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"shopping_cart\", \"column_name\": \"userid\", \"type\": \"text\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}]" - } + "com.linkedin.schema.Schemaless": {} }, "fields": [ { - "fieldPath": "item_count", + "fieldPath": "details", "nullable": true, "type": { "type": { - "com.linkedin.schema.NumberType": {} + "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "int", + "nativeDataType": "text", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "last_update_timestamp", + "fieldPath": "last_updated", "nullable": true, "type": { "type": { @@ -1722,14 +2208,14 @@ "isPartOfKey": false }, { - "fieldPath": "userid", + "fieldPath": "person_id", "nullable": true, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "text", + "nativeDataType": "int", "recursive": false, "isPartOfKey": false } @@ -1737,74 +2223,30 @@ } }, "systemMetadata": { - "lastObserved": 1731591019453, + "lastObserved": 1739997601705, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.information,PROD)", "changeType": "UPSERT", - "aspectName": "upstreamLineage", + "aspectName": "status", "aspect": { "json": { - "upstreams": [ - { - "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD),ascii_column)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD),ascii_column)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD),bigint_column)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD),bigint_column)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD),id)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD),id)" - ], - "confidenceScore": 1.0 - } - ] + "removed": false } }, "systemMetadata": { - "lastObserved": 1731447296557, + "lastObserved": 1739924675467, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.shopping_cart,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.information,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1815,48 +2257,14 @@ } }, "systemMetadata": { - "lastObserved": 1731410842611, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.shopping_cart,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1731410842610, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": true, - "viewLogic": "id IS NOT NULL AND ascii_column IS NOT NULL", - "viewLanguage": "CQL" - } - }, - "systemMetadata": { - "lastObserved": 1731310103458, + "lastObserved": 1739924675468, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.shopping_cart,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.information,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { @@ -1877,97 +2285,84 @@ "read_repair_chance": "0.0", "speculative_retry": "99p" }, - "name": "shopping_cart", - "qualifiedName": "example_keyspace.shopping_cart", + "name": "information", + "qualifiedName": "cass_test_1.information", "description": "", "tags": [] } }, "systemMetadata": { - "lastObserved": 1731591019455, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1731310103456, + "lastObserved": 1739924675469, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.shopping_cart,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.information,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:305f73c676989511c67d97ace119138c", - "urn": "urn:li:container:305f73c676989511c67d97ace119138c" - } - ] + "container": "urn:li:container:b89ce3e714c980422ca601f9be0f54af" } }, "systemMetadata": { - "lastObserved": 1731410842612, + "lastObserved": 1739924675470, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.shopping_cart,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.information,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "container": "urn:li:container:305f73c676989511c67d97ace119138c" + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" } }, "systemMetadata": { - "lastObserved": 1731410842611, + "lastObserved": 1739924675471, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.information,PROD)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "browsePathsV2", "aspect": { "json": { - "typeNames": [ - "View" + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:b89ce3e714c980422ca601f9be0f54af", + "urn": "urn:li:container:b89ce3e714c980422ca601f9be0f54af" + } ] } }, "systemMetadata": { - "lastObserved": 1731310103457, + "lastObserved": 1739924675472, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.people,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "example_view_1", + "schemaName": "", "platform": "urn:li:dataPlatform:cassandra", "version": 0, "created": { @@ -1980,44 +2375,42 @@ }, "hash": "", "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"example_keyspace\", \"table_name\": \"example_view_1\", \"column_name\": \"ascii_column\", \"type\": \"ascii\", \"clustering_order\": \"asc\", \"kind\": \"clustering\", \"position\": 0}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"example_view_1\", \"column_name\": \"bigint_column\", \"type\": \"bigint\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"example_view_1\", \"column_name\": \"id\", \"type\": \"uuid\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}]" - } + "com.linkedin.schema.Schemaless": {} }, "fields": [ { - "fieldPath": "ascii_column", + "fieldPath": "email", "nullable": true, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "ascii", + "nativeDataType": "text", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "bigint_column", + "fieldPath": "name", "nullable": true, "type": { "type": { - "com.linkedin.schema.NumberType": {} + "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "bigint", + "nativeDataType": "text", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "id", + "fieldPath": "person_id", "nullable": true, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "uuid", + "nativeDataType": "int", "recursive": false, "isPartOfKey": false } @@ -2025,51 +2418,48 @@ } }, "systemMetadata": { - "lastObserved": 1731591019464, + "lastObserved": 1739997601745, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.people,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "status", "aspect": { "json": { - "container": "urn:li:container:305f73c676989511c67d97ace119138c" + "removed": false } }, "systemMetadata": { - "lastObserved": 1731310103460, + "lastObserved": 1739924675511, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.people,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "subTypes", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:305f73c676989511c67d97ace119138c", - "urn": "urn:li:container:305f73c676989511c67d97ace119138c" - } + "typeNames": [ + "Table" ] } }, "systemMetadata": { - "lastObserved": 1731310103461, + "lastObserved": 1739924675512, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.people,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { @@ -2080,7 +2470,6 @@ "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", "crc_check_chance": "1.0", - "include_all_columns": "False", "dclocal_read_repair_chance": "0.0", "default_time_to_live": "0", "extensions": "{}", @@ -2091,281 +2480,185 @@ "read_repair_chance": "0.0", "speculative_retry": "99p" }, - "name": "example_view_1", - "qualifiedName": "example_keyspace.example_view_1", - "description": "Example view definition with id and ascii_column", + "name": "people", + "qualifiedName": "cass_test_1.people", + "description": "", "tags": [] } }, "systemMetadata": { - "lastObserved": 1731591019464, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:305f73c676989511c67d97ace119138c", - "urn": "urn:li:container:305f73c676989511c67d97ace119138c" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1731310942175, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1731310942171, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1731310942172, + "lastObserved": 1739924675513, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.people,PROD)", "changeType": "UPSERT", - "aspectName": "viewProperties", + "aspectName": "container", "aspect": { "json": { - "materialized": true, - "viewLogic": "id IS NOT NULL AND ascii_column IS NOT NULL", - "viewLanguage": "CQL" + "container": "urn:li:container:b89ce3e714c980422ca601f9be0f54af" } }, "systemMetadata": { - "lastObserved": 1731310942172, + "lastObserved": 1739924675515, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD)", - "changeType": "UPSERT", - "aspectName": "upstreamLineage", - "aspect": { - "json": { - "upstreams": [ - { - "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD),ascii_column)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD),ascii_column)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD),float_column)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD),float_column)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD),id)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD),id)" - ], - "confidenceScore": 1.0 - } - ] +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.people,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" } }, "systemMetadata": { - "lastObserved": 1731447296594, + "lastObserved": 1739924675516, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.people,PROD)", "changeType": "UPSERT", - "aspectName": "datasetProperties", + "aspectName": "browsePathsV2", "aspect": { "json": { - "customProperties": { - "bloom_filter_fp_chance": "0.01", - "caching": "{\"keys\": \"ALL\", \"rows_per_partition\": \"NONE\"}", - "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", - "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", - "crc_check_chance": "1.0", - "include_all_columns": "False", - "dclocal_read_repair_chance": "0.0", - "default_time_to_live": "0", - "extensions": "{}", - "gc_grace_seconds": "864000", - "max_index_interval": "2048", - "min_index_interval": "128", - "memtable_flush_period_in_ms": "0", - "read_repair_chance": "0.0", - "speculative_retry": "99p" - }, - "name": "example_view_2", - "qualifiedName": "example_keyspace.example_view_2", - "description": "Example view definition with id and ascii_column", - "tags": [] + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:b89ce3e714c980422ca601f9be0f54af", + "urn": "urn:li:container:b89ce3e714c980422ca601f9be0f54af" + } + ] } }, "systemMetadata": { - "lastObserved": 1731591019474, + "lastObserved": 1739924675517, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.counter_table,PROD)", "changeType": "UPSERT", - "aspectName": "schemaMetadata", + "aspectName": "datasetProfile", "aspect": { "json": { - "schemaName": "example_view_2", - "platform": "urn:li:dataPlatform:cassandra", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"example_keyspace\", \"table_name\": \"example_view_2\", \"column_name\": \"ascii_column\", \"type\": \"ascii\", \"clustering_order\": \"asc\", \"kind\": \"clustering\", \"position\": 0}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"example_view_2\", \"column_name\": \"float_column\", \"type\": \"float\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"example_view_2\", \"column_name\": \"id\", \"type\": \"uuid\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}]" - } + "timestampMillis": 1739924675506, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, - "fields": [ - { - "fieldPath": "ascii_column", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "ascii", - "recursive": false, - "isPartOfKey": false - }, + "rowCount": 0, + "columnCount": 2, + "fieldProfiles": [ { - "fieldPath": "float_column", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.NumberType": {} - } - }, - "nativeDataType": "float", - "recursive": false, - "isPartOfKey": false + "fieldPath": "counter_column", + "nullCount": 0 }, { "fieldPath": "id", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "uuid", - "recursive": false, - "isPartOfKey": false + "nullCount": 0 } ] } }, "systemMetadata": { - "lastObserved": 1731591019474, + "lastObserved": 1739924675535, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.shopping_cart,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "datasetProfile", "aspect": { "json": { - "container": "urn:li:container:305f73c676989511c67d97ace119138c" + "timestampMillis": 1739924675535, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "rowCount": 9, + "columnCount": 3, + "fieldProfiles": [ + { + "fieldPath": "item_count", + "uniqueCount": 5, + "nullCount": 4, + "min": "2", + "max": "100", + "mean": "46.4", + "median": "50.0", + "stdev": "38.44", + "sampleValues": [ + "5", + "100", + "75", + "2", + "50" + ] + }, + { + "fieldPath": "last_update_timestamp", + "uniqueCount": 9, + "nullCount": 0, + "min": "2024-11-01 00:00:00", + "max": "2024-11-09 00:00:00", + "sampleValues": [ + "2024-11-08 00:00:00", + "2024-11-06 00:00:00", + "2024-11-02 00:00:00", + "2024-11-03 00:00:00", + "2024-11-05 00:00:00" + ] + }, + { + "fieldPath": "userid", + "uniqueCount": 9, + "nullCount": 0, + "min": "1234", + "max": "9876", + "sampleValues": [ + "1240", + "1238", + "1234", + "1235", + "1237" + ] + } + ] } }, "systemMetadata": { - "lastObserved": 1731310942175, + "lastObserved": 1739924675551, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", "changeType": "UPSERT", "aspectName": "datasetProfile", "aspect": { "json": { - "timestampMillis": 1731579516915, + "timestampMillis": 1739924675549, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -2485,87 +2778,19 @@ } }, "systemMetadata": { - "lastObserved": 1731579516925, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.people,PROD)", - "changeType": "UPSERT", - "aspectName": "datasetProfile", - "aspect": { - "json": { - "timestampMillis": 1731579516959, - "partitionSpec": { - "partition": "FULL_TABLE_SNAPSHOT", - "type": "FULL_TABLE" - }, - "rowCount": 0, - "columnCount": 3, - "fieldProfiles": [ - { - "fieldPath": "email", - "nullCount": 0 - }, - { - "fieldPath": "name", - "nullCount": 0 - }, - { - "fieldPath": "person_id", - "nullCount": 0 - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1731579516960, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.counter_table,PROD)", - "changeType": "UPSERT", - "aspectName": "datasetProfile", - "aspect": { - "json": { - "timestampMillis": 1731579516904, - "partitionSpec": { - "partition": "FULL_TABLE_SNAPSHOT", - "type": "FULL_TABLE" - }, - "rowCount": 0, - "columnCount": 2, - "fieldProfiles": [ - { - "fieldPath": "counter_column", - "nullCount": 0 - }, - { - "fieldPath": "id", - "nullCount": 0 - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1731579516915, + "lastObserved": 1739924675568, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", "changeType": "UPSERT", "aspectName": "datasetProfile", "aspect": { "json": { - "timestampMillis": 1731579516939, + "timestampMillis": 1739924675564, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -2593,19 +2818,19 @@ } }, "systemMetadata": { - "lastObserved": 1731579516950, + "lastObserved": 1739924675587, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.information,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.information,PROD)", "changeType": "UPSERT", "aspectName": "datasetProfile", "aspect": { "json": { - "timestampMillis": 1731579516950, + "timestampMillis": 1739924675586, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -2629,76 +2854,43 @@ } }, "systemMetadata": { - "lastObserved": 1731579516959, + "lastObserved": 1739924675599, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.shopping_cart,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.people,PROD)", "changeType": "UPSERT", "aspectName": "datasetProfile", "aspect": { "json": { - "timestampMillis": 1731579516925, + "timestampMillis": 1739924675598, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" }, - "rowCount": 9, + "rowCount": 0, "columnCount": 3, "fieldProfiles": [ { - "fieldPath": "item_count", - "uniqueCount": 5, - "nullCount": 4, - "min": "2", - "max": "100", - "mean": "46.4", - "median": "50.0", - "stdev": "38.44", - "sampleValues": [ - "5", - "100", - "75", - "2", - "50" - ] + "fieldPath": "email", + "nullCount": 0 }, { - "fieldPath": "last_update_timestamp", - "uniqueCount": 9, - "nullCount": 0, - "min": "2024-11-01 00:00:00", - "max": "2024-11-09 00:00:00", - "sampleValues": [ - "2024-11-08 00:00:00", - "2024-11-06 00:00:00", - "2024-11-02 00:00:00", - "2024-11-03 00:00:00", - "2024-11-05 00:00:00" - ] + "fieldPath": "name", + "nullCount": 0 }, { - "fieldPath": "userid", - "uniqueCount": 9, - "nullCount": 0, - "min": "1234", - "max": "9876", - "sampleValues": [ - "1240", - "1238", - "1234", - "1235", - "1237" - ] + "fieldPath": "person_id", + "nullCount": 0 } ] } }, "systemMetadata": { - "lastObserved": 1731579516939, + "lastObserved": 1739924675602, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } diff --git a/metadata-ingestion/tests/integration/cassandra/docker-compose.yml b/metadata-ingestion/tests/integration/cassandra/docker-compose.yml index a1a2a3b97d134b..f7509155597c77 100644 --- a/metadata-ingestion/tests/integration/cassandra/docker-compose.yml +++ b/metadata-ingestion/tests/integration/cassandra/docker-compose.yml @@ -1,4 +1,3 @@ -version: "1" services: test-cassandra: image: cassandra:latest @@ -6,7 +5,7 @@ services: ports: - 9042:9042 volumes: - - ./setup/cassandra.yaml:/etc/cassandra/cassandra.yaml + - ${CASSANDRA_CONFIG_DIR:-./setup}/cassandra.yaml:/etc/cassandra/cassandra.yaml - ./setup/init_keyspaces.cql:/docker-entrypoint-initdb.d/init_keyspaces.cql networks: - testnet diff --git a/metadata-ingestion/tests/integration/cassandra/test_cassandra.py b/metadata-ingestion/tests/integration/cassandra/test_cassandra.py index d561308aaad20e..822099903cabc8 100644 --- a/metadata-ingestion/tests/integration/cassandra/test_cassandra.py +++ b/metadata-ingestion/tests/integration/cassandra/test_cassandra.py @@ -1,4 +1,6 @@ import logging +import pathlib +import shutil import time import pytest @@ -9,25 +11,37 @@ logger = logging.getLogger(__name__) +_resources_dir = pathlib.Path(__file__).parent + @pytest.mark.integration -def test_cassandra_ingest(docker_compose_runner, pytestconfig, tmp_path): - test_resources_dir = pytestconfig.rootpath / "tests/integration/cassandra" +def test_cassandra_ingest(docker_compose_runner, pytestconfig, tmp_path, monkeypatch): + # Tricky: The cassandra container makes modifications directly to the cassandra.yaml + # config file. + # See https://github.com/docker-library/cassandra/issues/165 + # To avoid spurious diffs, we copy the config file to a temporary location + # and depend on that instead. The docker-compose file has the corresponding + # env variable usage to pick up the config file. + cassandra_config_file = _resources_dir / "setup/cassandra.yaml" + shutil.copy(cassandra_config_file, tmp_path / "cassandra.yaml") + monkeypatch.setenv("CASSANDRA_CONFIG_DIR", str(tmp_path)) with docker_compose_runner( - test_resources_dir / "docker-compose.yml", "cassandra" + _resources_dir / "docker-compose.yml", "cassandra" ) as docker_services: wait_for_port(docker_services, "test-cassandra", 9042) time.sleep(5) + # Run the metadata ingestion pipeline. logger.info("Starting the ingestion test...") - pipeline_default_platform_instance = Pipeline.create( + pipeline = Pipeline.create( { "run_id": "cassandra-test", "source": { "type": "cassandra", "config": { + "platform_instance": "dev_instance", "contact_point": "localhost", "port": 9042, "profiling": {"enabled": True}, @@ -41,13 +55,13 @@ def test_cassandra_ingest(docker_compose_runner, pytestconfig, tmp_path): }, } ) - pipeline_default_platform_instance.run() - pipeline_default_platform_instance.raise_from_status() + pipeline.run() + pipeline.raise_from_status() # Verify the output. logger.info("Verifying output.") mce_helpers.check_golden_file( pytestconfig, output_path=f"{tmp_path}/cassandra_mcps.json", - golden_path=test_resources_dir / "cassandra_mcps_golden.json", + golden_path=_resources_dir / "cassandra_mcps_golden.json", )