Skip to content

Commit

Permalink
remove legacy shadow attribute (#478)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattseddon authored Sep 26, 2024
1 parent c53fc09 commit 85dfda2
Show file tree
Hide file tree
Showing 7 changed files with 14 additions and 20 deletions.
16 changes: 8 additions & 8 deletions src/datachain/data_storage/metastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from sqlalchemy import (
JSON,
BigInteger,
Boolean,
Column,
DateTime,
ForeignKey,
Expand Down Expand Up @@ -228,7 +227,7 @@ def create_dataset_version( # noqa: PLR0913
self,
dataset: DatasetRecord,
version: int,
status: int = DatasetStatus.CREATED,
status: int,
sources: str = "",
feature_schema: Optional[dict] = None,
query_script: str = "",
Expand Down Expand Up @@ -448,7 +447,6 @@ def _datasets_columns(cls) -> list["SchemaItem"]:
Column("name", Text, nullable=False),
Column("description", Text),
Column("labels", JSON, nullable=True),
Column("shadow", Boolean, nullable=False),
Column("status", Integer, nullable=False),
Column("feature_schema", JSON, nullable=True),
Column("created_at", DateTime(timezone=True)),
Expand Down Expand Up @@ -481,8 +479,11 @@ def _datasets_versions_columns(cls) -> list["SchemaItem"]:
nullable=False,
),
Column("version", Integer, nullable=False),
# adding default for now until we fully remove shadow datasets
Column("status", Integer, nullable=False, default=DatasetStatus.COMPLETE),
Column(
"status",
Integer,
nullable=False,
),
Column("feature_schema", JSON, nullable=True),
Column("created_at", DateTime(timezone=True)),
Column("finished_at", DateTime(timezone=True)),
Expand Down Expand Up @@ -969,7 +970,6 @@ def create_dataset(
# TODO abstract this method and add registered = True based on kwargs
query = self._datasets_insert().values(
name=name,
shadow=False,
status=status,
feature_schema=json.dumps(feature_schema or {}),
created_at=datetime.now(timezone.utc),
Expand All @@ -992,7 +992,7 @@ def create_dataset_version( # noqa: PLR0913
self,
dataset: DatasetRecord,
version: int,
status: int = DatasetStatus.CREATED,
status: int,
sources: str = "",
feature_schema: Optional[dict] = None,
query_script: str = "",
Expand All @@ -1018,7 +1018,7 @@ def create_dataset_version( # noqa: PLR0913
query = self._datasets_versions_insert().values(
dataset_id=dataset.id,
version=version,
status=status, # for now until we remove shadow datasets
status=status,
feature_schema=json.dumps(feature_schema or {}),
created_at=created_at or datetime.now(timezone.utc),
finished_at=finished_at,
Expand Down
4 changes: 1 addition & 3 deletions src/datachain/data_storage/warehouse.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,9 +919,7 @@ def create_pre_udf_table(self, query: "Select") -> "Table":
def is_temp_table_name(self, name: str) -> bool:
"""Returns if the given table name refers to a temporary
or no longer needed table."""
return name.startswith(
(self.TMP_TABLE_NAME_PREFIX, self.UDF_TABLE_NAME_PREFIX, "ds_shadow_")
) or name.endswith("_shadow")
return name.startswith((self.TMP_TABLE_NAME_PREFIX, self.UDF_TABLE_NAME_PREFIX))

def get_temp_table_names(self) -> list[str]:
return [
Expand Down
3 changes: 0 additions & 3 deletions src/datachain/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,6 @@ class DatasetRecord:
name: str
description: Optional[str]
labels: list[str]
shadow: bool
schema: dict[str, Union[SQLType, type[SQLType]]]
feature_schema: dict
versions: list[DatasetVersion]
Expand Down Expand Up @@ -296,7 +295,6 @@ def parse( # noqa: PLR0913
name: str,
description: Optional[str],
labels: str,
shadow: int,
status: int,
feature_schema: Optional[str],
created_at: datetime,
Expand Down Expand Up @@ -356,7 +354,6 @@ def parse( # noqa: PLR0913
name,
description,
labels_lst,
bool(shadow),
cls.parse_schema(schema_dct), # type: ignore[arg-type]
json.loads(feature_schema) if feature_schema else {},
[dataset_version],
Expand Down
1 change: 0 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -609,7 +609,6 @@ def dataset_record():
description="",
labels=[],
versions=[],
shadow=False,
status=1,
schema={},
feature_schema={},
Expand Down
7 changes: 5 additions & 2 deletions tests/func/test_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -834,14 +834,17 @@ def test_enlist_source_handles_file(cloud_test_catalog):
def test_garbage_collect(cloud_test_catalog, from_cli, capsys):
catalog = cloud_test_catalog.catalog
assert catalog.get_temp_table_names() == []
temp_tables = ["tmp_vc12F", "udf_jh653", "ds_shadow_12345", "old_ds_shadow"]
temp_tables = [
"tmp_vc12F",
"udf_jh653",
]
for t in temp_tables:
catalog.warehouse.create_udf_table(name=t)
assert set(catalog.get_temp_table_names()) == set(temp_tables)
if from_cli:
garbage_collect(catalog)
captured = capsys.readouterr()
assert captured.out == "Garbage collecting 4 tables.\n"
assert captured.out == "Garbage collecting 2 tables.\n"
else:
catalog.cleanup_tables(temp_tables)
assert catalog.get_temp_table_names() == []
Expand Down
1 change: 0 additions & 1 deletion tests/func/test_pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ def remote_dataset(remote_dataset_version, schema):
"name": "remote",
"description": "",
"labels": [],
"shadow": False,
"schema": schema,
"status": 4,
"feature_schema": {},
Expand Down
2 changes: 0 additions & 2 deletions tests/unit/test_warehouse.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ def test_serialize(sqlite_db):
def test_is_temp_table_name(warehouse):
assert warehouse.is_temp_table_name("tmp_vc12F") is True
assert warehouse.is_temp_table_name("udf_jh653") is True
assert warehouse.is_temp_table_name("ds_shadow_12345") is True
assert warehouse.is_temp_table_name("old_ds_shadow") is True
assert warehouse.is_temp_table_name("ds_my_dataset") is False
assert warehouse.is_temp_table_name("src_my_bucket") is False
assert warehouse.is_temp_table_name("ds_ds_my_query_script_1_1") is False
Expand Down

0 comments on commit 85dfda2

Please sign in to comment.