From fc7c27f0e1b5c40c7a03f0909391399d3e7342fb Mon Sep 17 00:00:00 2001 From: Matt Seddon Date: Wed, 2 Oct 2024 10:31:36 +1000 Subject: [PATCH] remove ordering by sys__id --- src/datachain/catalog/catalog.py | 2 -- src/datachain/data_storage/warehouse.py | 4 ---- src/datachain/query/batch.py | 1 - src/datachain/query/dataset.py | 16 ---------------- 4 files changed, 23 deletions(-) diff --git a/src/datachain/catalog/catalog.py b/src/datachain/catalog/catalog.py index 0902daf24..dbae1aa9f 100644 --- a/src/datachain/catalog/catalog.py +++ b/src/datachain/catalog/catalog.py @@ -1325,8 +1325,6 @@ def ls_dataset_rows( if offset: q = q.offset(offset) - q = q.order_by("sys__id") - return q.to_db_records() def signed_url(self, source: str, path: str, client_config=None) -> str: diff --git a/src/datachain/data_storage/warehouse.py b/src/datachain/data_storage/warehouse.py index 6b5b753d4..74b25045d 100644 --- a/src/datachain/data_storage/warehouse.py +++ b/src/datachain/data_storage/warehouse.py @@ -215,10 +215,6 @@ def dataset_select_paginated( limit = query._limit paginated_query = query.limit(page_size) - if not paginated_query._order_by_clauses: - # default order by is order by `sys__id` - paginated_query = paginated_query.order_by(query.selected_columns.sys__id) - results = None offset = 0 num_yielded = 0 diff --git a/src/datachain/query/batch.py b/src/datachain/query/batch.py index 2e63d91f6..525120018 100644 --- a/src/datachain/query/batch.py +++ b/src/datachain/query/batch.py @@ -107,7 +107,6 @@ def __call__( ordered_query = query.order_by(None).order_by( PARTITION_COLUMN_ID, - "sys__id", *query._order_by_clauses, ) diff --git a/src/datachain/query/dataset.py b/src/datachain/query/dataset.py index 2efb03e32..1f7ef3a1c 100644 --- a/src/datachain/query/dataset.py +++ b/src/datachain/query/dataset.py @@ -591,10 +591,6 @@ def process_input_query(self, query: Select) -> tuple[Select, list["Table"]]: return query, [] table = self.catalog.warehouse.create_pre_udf_table(query) q: Select = sqlalchemy.select(*table.c) - if query._order_by_clauses: - # we are adding ordering only if it's explicitly added by user in - # query part before adding signals - q = q.order_by(table.c.sys__id) return q, [table] def create_result_query( @@ -630,11 +626,6 @@ def q(*columns): else: res = sqlalchemy.select(*cols1).select_from(subq) - if query._order_by_clauses: - # if ordering is used in query part before adding signals, we - # will have it as order by id from select from pre-created udf table - res = res.order_by(subq.c.sys__id) - if self.partition_by is not None: subquery = res.subquery() res = sqlalchemy.select(*subquery.c).select_from(subquery) @@ -666,13 +657,6 @@ def create_udf_table(self, query: Select) -> "Table": def create_result_query( self, udf_table, query: Select ) -> tuple[QueryGeneratorFunc, list["sqlalchemy.Column"]]: - if not query._order_by_clauses: - # if we are not selecting all rows in UDF, we need to ensure that - # we get the same rows as we got as inputs of UDF since selecting - # without ordering can be non deterministic in some databases - c = query.selected_columns - query = query.order_by(c.sys__id) - udf_table_query = udf_table.select().subquery() udf_table_cols: list[sqlalchemy.Label[Any]] = [ label(c.name, c) for c in udf_table_query.columns