diff --git a/src/datachain/data_storage/metastore.py b/src/datachain/data_storage/metastore.py
index 941505f3e..f4c1d40e9 100644
--- a/src/datachain/data_storage/metastore.py
+++ b/src/datachain/data_storage/metastore.py
@@ -15,7 +15,6 @@ from sqlalchemy import (
     JSON,
     BigInteger,
-    Boolean,
     Column,
     DateTime,
     ForeignKey,
@@ -228,7 +227,7 @@ def create_dataset_version(  # noqa: PLR0913
         self,
         dataset: DatasetRecord,
         version: int,
-        status: int = DatasetStatus.CREATED,
+        status: int,
         sources: str = "",
         feature_schema: Optional[dict] = None,
         query_script: str = "",
@@ -448,7 +447,6 @@ def _datasets_columns(cls) -> list["SchemaItem"]:
             Column("name", Text, nullable=False),
             Column("description", Text),
             Column("labels", JSON, nullable=True),
-            Column("shadow", Boolean, nullable=False),
             Column("status", Integer, nullable=False),
             Column("feature_schema", JSON, nullable=True),
             Column("created_at", DateTime(timezone=True)),
@@ -481,8 +479,11 @@ def _datasets_versions_columns(cls) -> list["SchemaItem"]:
                 nullable=False,
             ),
             Column("version", Integer, nullable=False),
-            # adding default for now until we fully remove shadow datasets
-            Column("status", Integer, nullable=False, default=DatasetStatus.COMPLETE),
+            Column(
+                "status",
+                Integer,
+                nullable=False,
+            ),
             Column("feature_schema", JSON, nullable=True),
             Column("created_at", DateTime(timezone=True)),
             Column("finished_at", DateTime(timezone=True)),
@@ -969,7 +970,6 @@ def create_dataset(
         # TODO abstract this method and add registered = True based on kwargs
         query = self._datasets_insert().values(
             name=name,
-            shadow=False,
             status=status,
             feature_schema=json.dumps(feature_schema or {}),
             created_at=datetime.now(timezone.utc),
@@ -992,7 +992,7 @@ def create_dataset_version(  # noqa: PLR0913
         self,
         dataset: DatasetRecord,
         version: int,
-        status: int = DatasetStatus.CREATED,
+        status: int,
         sources: str = "",
         feature_schema: Optional[dict] = None,
         query_script: str = "",
@@ -1018,7 +1018,7 @@ def create_dataset_version(  # noqa: PLR0913
         query = self._datasets_versions_insert().values(
             dataset_id=dataset.id,
             version=version,
-            status=status,  # for now until we remove shadow datasets
+            status=status,
             feature_schema=json.dumps(feature_schema or {}),
             created_at=created_at or datetime.now(timezone.utc),
             finished_at=finished_at,
diff --git a/src/datachain/data_storage/warehouse.py b/src/datachain/data_storage/warehouse.py
index 795099a3a..6b5b753d4 100644
--- a/src/datachain/data_storage/warehouse.py
+++ b/src/datachain/data_storage/warehouse.py
@@ -919,9 +919,7 @@ def create_pre_udf_table(self, query: "Select") -> "Table":
     def is_temp_table_name(self, name: str) -> bool:
         """Returns if the given table name refers to a temporary
         or no longer needed table."""
-        return name.startswith(
-            (self.TMP_TABLE_NAME_PREFIX, self.UDF_TABLE_NAME_PREFIX, "ds_shadow_")
-        ) or name.endswith("_shadow")
+        return name.startswith((self.TMP_TABLE_NAME_PREFIX, self.UDF_TABLE_NAME_PREFIX))
 
     def get_temp_table_names(self) -> list[str]:
         return [
diff --git a/src/datachain/dataset.py b/src/datachain/dataset.py
index c6fd04eed..ecef8359a 100644
--- a/src/datachain/dataset.py
+++ b/src/datachain/dataset.py
@@ -267,7 +267,6 @@ class DatasetRecord:
     name: str
     description: Optional[str]
     labels: list[str]
-    shadow: bool
     schema: dict[str, Union[SQLType, type[SQLType]]]
     feature_schema: dict
    versions: list[DatasetVersion]
@@ -296,7 +295,6 @@ def parse(  # noqa: PLR0913
         name: str,
         description: Optional[str],
         labels: str,
-        shadow: int,
         status: int,
         feature_schema: Optional[str],
         created_at: datetime,
@@ -356,7 +354,6 @@ def parse(  # noqa: PLR0913
             name,
             description,
             labels_lst,
-            bool(shadow),
             cls.parse_schema(schema_dct),  # type: ignore[arg-type]
             json.loads(feature_schema) if feature_schema else {},
             [dataset_version],
diff --git a/tests/conftest.py b/tests/conftest.py
index 060d96fe4..08fc8f11a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -609,7 +609,6 @@ def dataset_record():
         description="",
         labels=[],
         versions=[],
-        shadow=False,
         status=1,
         schema={},
         feature_schema={},
diff --git a/tests/func/test_catalog.py b/tests/func/test_catalog.py
index 25eacf581..8671c388d 100644
--- a/tests/func/test_catalog.py
+++ b/tests/func/test_catalog.py
@@ -834,14 +834,17 @@ def test_enlist_source_handles_file(cloud_test_catalog):
 def test_garbage_collect(cloud_test_catalog, from_cli, capsys):
     catalog = cloud_test_catalog.catalog
     assert catalog.get_temp_table_names() == []
-    temp_tables = ["tmp_vc12F", "udf_jh653", "ds_shadow_12345", "old_ds_shadow"]
+    temp_tables = [
+        "tmp_vc12F",
+        "udf_jh653",
+    ]
     for t in temp_tables:
         catalog.warehouse.create_udf_table(name=t)
     assert set(catalog.get_temp_table_names()) == set(temp_tables)
     if from_cli:
         garbage_collect(catalog)
         captured = capsys.readouterr()
-        assert captured.out == "Garbage collecting 4 tables.\n"
+        assert captured.out == "Garbage collecting 2 tables.\n"
     else:
         catalog.cleanup_tables(temp_tables)
     assert catalog.get_temp_table_names() == []
diff --git a/tests/func/test_pull.py b/tests/func/test_pull.py
index da62f3a40..4b87d8275 100644
--- a/tests/func/test_pull.py
+++ b/tests/func/test_pull.py
@@ -113,7 +113,6 @@ def remote_dataset(remote_dataset_version, schema):
         "name": "remote",
         "description": "",
         "labels": [],
-        "shadow": False,
         "schema": schema,
         "status": 4,
         "feature_schema": {},
diff --git a/tests/unit/test_warehouse.py b/tests/unit/test_warehouse.py
index 441ba8a6a..2951364dc 100644
--- a/tests/unit/test_warehouse.py
+++ b/tests/unit/test_warehouse.py
@@ -46,8 +46,6 @@ def test_serialize(sqlite_db):
 def test_is_temp_table_name(warehouse):
     assert warehouse.is_temp_table_name("tmp_vc12F") is True
     assert warehouse.is_temp_table_name("udf_jh653") is True
-    assert warehouse.is_temp_table_name("ds_shadow_12345") is True
-    assert warehouse.is_temp_table_name("old_ds_shadow") is True
     assert warehouse.is_temp_table_name("ds_my_dataset") is False
     assert warehouse.is_temp_table_name("src_my_bucket") is False
     assert warehouse.is_temp_table_name("ds_ds_my_query_script_1_1") is False
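
Note (illustrative, not part of the patch): since the DatasetStatus.CREATED default is removed from create_dataset_version, every call site must now pass status explicitly. A minimal sketch of an updated caller, assuming metastore is a concrete metastore instance and dataset an existing DatasetRecord already in scope; the import path for DatasetStatus is taken from src/datachain/dataset.py:

from datachain.dataset import DatasetStatus

# status is now a required positional/keyword argument; there is no
# implicit CREATED default to fall back on after this change.
metastore.create_dataset_version(
    dataset,
    version=1,
    status=DatasetStatus.CREATED,
)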