From e4d94e3c100a243c82091d1465a9d22448e1e448 Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Mon, 20 Jan 2025 11:56:46 +0000
Subject: [PATCH 01/36] Minimal class and methods defined

---
 cpp/arcticdb/processing/clause.cpp          | 24 ++++++++++++++
 cpp/arcticdb/processing/clause.hpp          | 27 +++++++++++++++
 cpp/arcticdb/version/python_bindings.cpp    |  4 +++
 python/arcticdb/version_store/processing.py | 12 +++++++
 .../test_symbol_concatenation.py            | 33 +++++++++++++++++++
 5 files changed, 100 insertions(+)
 create mode 100644 python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py

diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp
index a69d9a671a..c1c3f708a2 100644
--- a/cpp/arcticdb/processing/clause.cpp
+++ b/cpp/arcticdb/processing/clause.cpp
@@ -1217,4 +1217,28 @@ std::string DateRangeClause::to_string() const {
     return fmt::format("DATE RANGE {} - {}", start_, end_);
 }
 
+ConcatClause::ConcatClause() {
+
+}
+
+std::vector<std::vector<size_t>> ConcatClause::structure_for_processing(ARCTICDB_UNUSED std::vector<RangesAndKey>& ranges_and_keys) {
+    return {};
+}
+
+std::vector<std::vector<EntityId>> ConcatClause::structure_for_processing(ARCTICDB_UNUSED std::vector<std::vector<EntityId>>&& entity_ids_vec) {
+    return {};
+}
+
+std::vector<EntityId> ConcatClause::process(ARCTICDB_UNUSED std::vector<EntityId>&& entity_ids) const {
+    return {};
+}
+
+void ConcatClause::set_processing_config(ARCTICDB_UNUSED const ProcessingConfig& processing_config) {
+
+}
+
+std::string ConcatClause::to_string() const {
+    return "CONCAT";
+}
+
 }

diff --git a/cpp/arcticdb/processing/clause.hpp b/cpp/arcticdb/processing/clause.hpp
index 134780d8db..3a91557700 100644
--- a/cpp/arcticdb/processing/clause.hpp
+++ b/cpp/arcticdb/processing/clause.hpp
@@ -669,4 +669,31 @@ struct DateRangeClause {
     [[nodiscard]] std::string to_string() const;
 };
 
+struct ConcatClause {
+    ClauseInfo clause_info_;
+    std::shared_ptr<ComponentManager> component_manager_;
+
+    ConcatClause();
+
+    ARCTICDB_MOVE_COPY_DEFAULT(ConcatClause)
+
+    [[nodiscard]] std::vector<std::vector<size_t>> structure_for_processing(std::vector<RangesAndKey>& ranges_and_keys);
+
+    [[nodiscard]] std::vector<std::vector<EntityId>> structure_for_processing(std::vector<std::vector<EntityId>>&& entity_ids_vec);
+
+    [[nodiscard]] std::vector<EntityId> process(std::vector<EntityId>&& entity_ids) const;
+
+    [[nodiscard]] const ClauseInfo& clause_info() const {
+        return clause_info_;
+    }
+
+    void set_processing_config(const ProcessingConfig& processing_config);
+
+    void set_component_manager(std::shared_ptr<ComponentManager> component_manager) {
+        component_manager_ = component_manager;
+    }
+
+    [[nodiscard]] std::string to_string() const;
+};
+
 }//namespace arcticdb

diff --git a/cpp/arcticdb/version/python_bindings.cpp b/cpp/arcticdb/version/python_bindings.cpp
index b839159843..9a4451a5e8 100644
--- a/cpp/arcticdb/version/python_bindings.cpp
+++ b/cpp/arcticdb/version/python_bindings.cpp
@@ -402,6 +402,10 @@ void register_bindings(py::module &version, py::exception<arcticdb::ArcticException>& base_exception) {
 
+    py::class_<ConcatClause, std::shared_ptr<ConcatClause>>(version, "ConcatClause")
+        .def(py::init<>())
+        .def("__str__", &ConcatClause::to_string);
+
     py::class_<ReadQuery, std::shared_ptr<ReadQuery>>(version, "PythonVersionStoreReadQuery")
         .def(py::init())
         .def_readwrite("columns",&ReadQuery::columns)

diff --git a/python/arcticdb/version_store/processing.py b/python/arcticdb/version_store/processing.py
index f58944376f..6fc6527af1 100644
--- a/python/arcticdb/version_store/processing.py
+++ b/python/arcticdb/version_store/processing.py
@@ -33,6 +33,7 @@
 from arcticdb_ext.version_store import ResampleBoundary as _ResampleBoundary
 from arcticdb_ext.version_store import RowRangeClause as _RowRangeClause
 from arcticdb_ext.version_store import DateRangeClause as _DateRangeClause
+from arcticdb_ext.version_store import ConcatClause as _ConcatClause
 from arcticdb_ext.version_store import RowRangeType as _RowRangeType
 from arcticdb_ext.version_store import ExpressionName as _ExpressionName
 from arcticdb_ext.version_store import ColumnName as _ColumnName
@@ -311,6 +312,7 @@ class PythonRowRangeClause(NamedTuple):
     start: int = None
     end: int = None
 
+
 # Would be cleaner if all Python*Clause classes were dataclasses, but this is used for pickling, so hard to change now
 @dataclass
 class PythonResampleClause:
@@ -323,6 +325,11 @@ class PythonResampleClause:
     origin: Union[str, pd.Timestamp] = "epoch"
 
 
+@dataclass
+class PythonConcatClause:
+    pass
+
+
 class QueryBuilder:
     """
     Build a query to process read results with. Syntax is designed to be similar to Pandas:
@@ -904,6 +911,11 @@ def date_range(self, date_range: DateRangeInput):
         self._python_clauses = self._python_clauses + [PythonDateRangeClause(start.value, end.value)]
         return self
 
+    def concat(self):
+        self.clauses = self.clauses + [_ConcatClause()]
+        self._python_clauses = self._python_clauses + [PythonConcatClause()]
+        return self
+
     def __eq__(self, right):
         if not isinstance(right, QueryBuilder):
             return False

diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
new file mode 100644
index 0000000000..0273b2b215
--- /dev/null
+++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
@@ -0,0 +1,33 @@
+"""
+Copyright 2025 Man Group Operations Limited
+
+Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
+
+As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
+"""
+import numpy as np
+import pandas as pd
+import pytest
+
+from arcticdb import col, LazyDataFrame, LazyDataFrameCollection, QueryBuilder, ReadRequest
+from arcticdb.util.test import assert_frame_equal
+
+
+pytestmark = pytest.mark.pipeline
+
+
+def test_symbol_concat_basic(lmdb_library):
+    lib = lmdb_library
+    df_1 = pd.DataFrame({"col1": np.arange(2, dtype=np.int64)}, index=pd.date_range("2025-01-01", periods=2))
+    df_2 = pd.DataFrame({"col1": np.arange(3, dtype=np.int64)}, index=pd.date_range("2025-02-01", periods=3))
+    df_3 = pd.DataFrame({"col1": np.arange(4, dtype=np.int64)}, index=pd.date_range("2025-03-01", periods=4))
+    lib.write("sym1", df_1)
+    lib.write("sym2", df_2)
+    lib.write("sym3", df_3)
+
+    q = QueryBuilder()
+    q = q.concat()
+    received = lib.read_batch(["sym2", "sym3", "sym1"], query_builder=q).data
+    expected = pd.concat([df_1, df_2, df_3])
+    assert_frame_equal(expected, received.data)
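The new test pins down the API shape being trialled in this first commit: concat() as a QueryBuilder clause applied across a batch read. A condensed sketch of that intended usage — illustrative only, since every C++ method above still returns an empty result, and the lmdb URI here is hypothetical:

import numpy as np
import pandas as pd
from arcticdb import Arctic, QueryBuilder

ac = Arctic("lmdb://tmp/concat_demo")  # hypothetical local library for illustration
lib = ac.get_library("demo", create_if_missing=True)
lib.write("sym1", pd.DataFrame({"col1": np.arange(2)}, index=pd.date_range("2025-01-01", periods=2)))
lib.write("sym2", pd.DataFrame({"col1": np.arange(3)}, index=pd.date_range("2025-02-01", periods=3)))

q = QueryBuilder().concat()  # appends a ConcatClause spanning the whole batch
results = lib.read_batch(["sym1", "sym2"], query_builder=q)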
+""" +import numpy as np +import pandas as pd +import pytest + +from arcticdb import col, LazyDataFrame, LazyDataFrameCollection, QueryBuilder, ReadRequest +from arcticdb.util.test import assert_frame_equal + + +pytestmark = pytest.mark.pipeline + + +def test_symbol_concat_basic(lmdb_library): + lib = lmdb_library + df_1 = pd.DataFrame({"col1": np.arange(2, dtype=np.int64)}, index=pd.date_range("2025-01-01", periods=2)) + df_2 = pd.DataFrame({"col1": np.arange(3, dtype=np.int64)}, index=pd.date_range("2025-02-01", periods=3)) + df_3 = pd.DataFrame({"col1": np.arange(4, dtype=np.int64)}, index=pd.date_range("2025-03-01", periods=4)) + lib.write("sym1", df_1) + lib.write("sym2", df_2) + lib.write("sym3", df_3) + + q = QueryBuilder() + q = q.concat() + received = lib.read_batch(["sym2", "sym3", "sym1"], query_builder=q).data + expected = pd.concat([df_1, df_2, df_3]) + assert_frame_equal(expected, received.data) + From cc896941f994e1ef99b510352a0f783a41875fce Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 20 Jan 2025 15:19:53 +0000 Subject: [PATCH 02/36] Remove C++ changes --- cpp/arcticdb/processing/clause.cpp | 24 --------------------- cpp/arcticdb/processing/clause.hpp | 27 ------------------------ cpp/arcticdb/version/python_bindings.cpp | 4 ---- 3 files changed, 55 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index c1c3f708a2..a69d9a671a 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -1217,28 +1217,4 @@ std::string DateRangeClause::to_string() const { return fmt::format("DATE RANGE {} - {}", start_, end_); } -ConcatClause::ConcatClause() { - -} - -std::vector> ConcatClause::structure_for_processing(ARCTICDB_UNUSED std::vector& ranges_and_keys) { - return {}; -} - -std::vector> ConcatClause::structure_for_processing(ARCTICDB_UNUSED std::vector>&& entity_ids_vec) { - return {}; -} - -std::vector ConcatClause::process(ARCTICDB_UNUSED std::vector&& entity_ids) const { - return {}; -} - -void ConcatClause::set_processing_config(ARCTICDB_UNUSED const ProcessingConfig& processing_config) { - -} - -std::string ConcatClause::to_string() const { - return "CONCAT"; -} - } diff --git a/cpp/arcticdb/processing/clause.hpp b/cpp/arcticdb/processing/clause.hpp index 3a91557700..134780d8db 100644 --- a/cpp/arcticdb/processing/clause.hpp +++ b/cpp/arcticdb/processing/clause.hpp @@ -669,31 +669,4 @@ struct DateRangeClause { [[nodiscard]] std::string to_string() const; }; -struct ConcatClause { - ClauseInfo clause_info_; - std::shared_ptr component_manager_; - - ConcatClause(); - - ARCTICDB_MOVE_COPY_DEFAULT(ConcatClause) - - [[nodiscard]] std::vector> structure_for_processing(std::vector& ranges_and_keys); - - [[nodiscard]] std::vector> structure_for_processing(std::vector>&& entity_ids_vec); - - [[nodiscard]] std::vector process(std::vector&& entity_ids) const; - - [[nodiscard]] const ClauseInfo& clause_info() const { - return clause_info_; - } - - void set_processing_config(const ProcessingConfig& processing_config); - - void set_component_manager(std::shared_ptr component_manager) { - component_manager_ = component_manager; - } - - [[nodiscard]] std::string to_string() const; -}; - }//namespace arcticdb diff --git a/cpp/arcticdb/version/python_bindings.cpp b/cpp/arcticdb/version/python_bindings.cpp index 9a4451a5e8..b839159843 100644 --- a/cpp/arcticdb/version/python_bindings.cpp +++ b/cpp/arcticdb/version/python_bindings.cpp @@ -402,10 +402,6 @@ void register_bindings(py::module &version, 
py::exception>(version, "ConcatClause") - .def(py::init<>()) - .def("__str__", &ConcatClause::to_string); - py::class_>(version, "PythonVersionStoreReadQuery") .def(py::init()) .def_readwrite("columns",&ReadQuery::columns) From b5336227cddec7b863889b9c32f75c19fdc06fc0 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 20 Jan 2025 15:24:23 +0000 Subject: [PATCH 03/36] Reworked Python layer --- python/arcticdb/__init__.py | 1 + python/arcticdb/version_store/library.py | 25 +++++++++++++++++++ python/arcticdb/version_store/processing.py | 11 -------- .../test_symbol_concatenation.py | 25 ++++++++++++------- 4 files changed, 42 insertions(+), 20 deletions(-) diff --git a/python/arcticdb/__init__.py b/python/arcticdb/__init__.py index 2a1f78fc78..da7472be27 100644 --- a/python/arcticdb/__init__.py +++ b/python/arcticdb/__init__.py @@ -17,6 +17,7 @@ col, LazyDataFrame, LazyDataFrameCollection, + concat, StagedDataFinalizeMethod, WriteMetadataPayload ) diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py index ddb070f09b..8d12db7187 100644 --- a/python/arcticdb/version_store/library.py +++ b/python/arcticdb/version_store/library.py @@ -478,6 +478,31 @@ def __repr__(self) -> str: return self.__str__() +class LazyDataFrameAfterJoin(QueryBuilder): + def __init__( + self, + lazy_dataframes: LazyDataFrameCollection, + ): + super().__init__() + self._lazy_dataframes = lazy_dataframes + + def collect(self) -> VersionedItem: + pass + + def __str__(self) -> str: + query_builder_repr = super().__str__() + return f"LazyDataFrameAfterJoin(Concat({self._lazy_dataframes._lazy_dataframes}){' | ' if len(query_builder_repr) else ''}{query_builder_repr})" + + def __repr__(self) -> str: + return self.__str__() + + +def concat(lazy_dataframes: Union[List[LazyDataFrame], LazyDataFrameCollection]) -> LazyDataFrameAfterJoin: + if not isinstance(lazy_dataframes, LazyDataFrameCollection): + lazy_dataframes = LazyDataFrameCollection(lazy_dataframes) + return LazyDataFrameAfterJoin(lazy_dataframes) + + def col(name: str) -> ExpressionNode: """ Placeholder for referencing columns by name in lazy dataframe operations before the underlying object has been diff --git a/python/arcticdb/version_store/processing.py b/python/arcticdb/version_store/processing.py index 6fc6527af1..789d4f7378 100644 --- a/python/arcticdb/version_store/processing.py +++ b/python/arcticdb/version_store/processing.py @@ -33,7 +33,6 @@ from arcticdb_ext.version_store import ResampleBoundary as _ResampleBoundary from arcticdb_ext.version_store import RowRangeClause as _RowRangeClause from arcticdb_ext.version_store import DateRangeClause as _DateRangeClause -from arcticdb_ext.version_store import ConcatClause as _ConcatClause from arcticdb_ext.version_store import RowRangeType as _RowRangeType from arcticdb_ext.version_store import ExpressionName as _ExpressionName from arcticdb_ext.version_store import ColumnName as _ColumnName @@ -325,11 +324,6 @@ class PythonResampleClause: origin: Union[str, pd.Timestamp] = "epoch" -@dataclass -class PythonConcatClause: - pass - - class QueryBuilder: """ Build a query to process read results with. 
From 28eda0882f46d8f9f23b2b3645574cf3147fc698 Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Mon, 20 Jan 2025 15:25:16 +0000
Subject: [PATCH 04/36] Revert whitespace change

---
 python/arcticdb/version_store/processing.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/arcticdb/version_store/processing.py b/python/arcticdb/version_store/processing.py
index 789d4f7378..f58944376f 100644
--- a/python/arcticdb/version_store/processing.py
+++ b/python/arcticdb/version_store/processing.py
@@ -311,7 +311,6 @@ class PythonRowRangeClause(NamedTuple):
     start: int = None
     end: int = None
 
-
 # Would be cleaner if all Python*Clause classes were dataclasses, but this is used for pickling, so hard to change now
 @dataclass
 class PythonResampleClause:

From e71b5eb4ef76fc6bb74c2898ead28a1afb89b4a7 Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Mon, 20 Jan 2025 16:09:20 +0000
Subject: [PATCH 05/36] Sketch out API for reading multiple symbols with a
 join

---
 python/arcticdb/version_store/_store.py  | 13 ++++++++
 python/arcticdb/version_store/library.py | 42 ++++++++++++++++++++++--
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py
index 4a45c24fb7..32a32360a5 100644
--- a/python/arcticdb/version_store/_store.py
+++ b/python/arcticdb/version_store/_store.py
@@ -1065,6 +1065,19 @@ def _batch_read_to_versioned_items(
             versioned_items.append(vitem)
         return versioned_items
 
+    def _batch_read_with_join(
+            self, symbols, as_ofs, date_ranges, row_ranges, columns, per_symbol_query_builders, join, query_builder
+    ):
+        implement_read_index = True
+        if columns:
+            columns = [self._resolve_empty_columns(c, implement_read_index) for c in columns]
+        version_queries = self._get_version_queries(len(symbols), as_ofs, iterate_snapshots_if_tombstoned=False)
+        # Take a copy as _get_read_queries can modify the input argument, which makes reusing the input counter-intuitive
+        per_symbol_query_builders = copy.deepcopy(per_symbol_query_builders)
+        read_queries = self._get_read_queries(len(symbols), date_ranges, row_ranges, columns, per_symbol_query_builders)
+        read_options = self._get_read_options(iterate_snapshots_if_tombstoned=False)
+        return self._adapt_read_res(ReadResult(*self.version_store.batch_read_with_join(symbols, version_queries, read_queries, read_options, join, query_builder.clauses)))
+
     def batch_read_metadata(
         self, symbols: List[str], as_ofs: Optional[List[VersionQueryInput]] = None, **kwargs
     ) -> Dict[str, VersionedItem]:

diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py
index 8d12db7187..a2e79ccf7f 100644
--- a/python/arcticdb/version_store/library.py
+++ b/python/arcticdb/version_store/library.py
@@ -482,12 +482,18 @@ class LazyDataFrameAfterJoin(QueryBuilder):
     def __init__(
             self,
             lazy_dataframes: LazyDataFrameCollection,
+            join: Any,
     ):
         super().__init__()
         self._lazy_dataframes = lazy_dataframes
+        self._join = join
 
     def collect(self) -> VersionedItem:
-        pass
+        if not len(self._lazy_dataframes._lazy_dataframes):
+            return []
+        else:
+            lib = self._lazy_dataframes._lib
+            return lib._read_batch_with_join(self._lazy_dataframes._read_requests(), self._join, self)
 
     def __str__(self) -> str:
         query_builder_repr = super().__str__()
@@ -500,7 +506,7 @@ def __repr__(self) -> str:
 def concat(lazy_dataframes: Union[List[LazyDataFrame], LazyDataFrameCollection]) -> LazyDataFrameAfterJoin:
     if not isinstance(lazy_dataframes, LazyDataFrameCollection):
         lazy_dataframes = LazyDataFrameCollection(lazy_dataframes)
-    return LazyDataFrameAfterJoin(lazy_dataframes)
+    return LazyDataFrameAfterJoin(lazy_dataframes, "concat")
@@ -1668,6 +1674,38 @@ def handle_symbol(s_):
             iterate_snapshots_if_tombstoned=False,
         )
 
+    def _read_batch_with_join(
+        self,
+        read_requests: List[ReadRequest],
+        join: Any,
+        query_builder: Optional[QueryBuilder] = None,
+    ) -> VersionedItem:
+        symbol_strings = []
+        as_ofs = []
+        date_ranges = []
+        row_ranges = []
+        columns = []
+        per_symbol_query_builders = []
+
+        for r in read_requests:
+            symbol_strings.append(r.symbol)
+            as_ofs.append(r.as_of)
+            date_ranges.append(r.date_range)
+            row_ranges.append(r.row_range)
+            columns.append(r.columns)
+            per_symbol_query_builders.append(r.query_builder)
+
+        return self._nvs._batch_read_with_join(
+            symbol_strings,
+            as_ofs,
+            date_ranges,
+            row_ranges,
+            columns,
+            per_symbol_query_builders,
+            join,
+            query_builder,
+        )
+
     def read_metadata(self, symbol: str, as_of: Optional[AsOf] = None) -> VersionedItem:
         """
         Return the metadata saved for a symbol. This method is faster than read as it only loads the metadata, not the
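The Library-level helper does nothing more than restructure its inputs: a list of per-symbol read requests is unzipped into the parallel per-field lists that _batch_read_with_join expects, one entry per symbol in each list. A toy restatement of that unzipping, where Req is a stand-in for ReadRequest:

from collections import namedtuple

Req = namedtuple("Req", "symbol as_of date_range row_range columns query_builder")

read_requests = [
    Req("sym1", None, None, None, None, None),
    Req("sym2", 3, None, None, ["col"], None),
]
symbols, as_ofs, date_ranges, row_ranges, columns, qbs = (list(t) for t in zip(*read_requests))
assert symbols == ["sym1", "sym2"]
assert columns == [None, ["col"]]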
From 08126c3829b701ece24edd2eb263a4d8f0fa852b Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Tue, 21 Jan 2025 16:37:08 +0000
Subject: [PATCH 06/36] Wiring down to local_versioned_engine.cpp

---
 cpp/arcticdb/processing/query_planner.hpp       |  1 +
 cpp/arcticdb/version/local_versioned_engine.cpp | 11 +++++++++++
 cpp/arcticdb/version/local_versioned_engine.hpp | 10 ++++++++++
 cpp/arcticdb/version/python_bindings.cpp        | 13 +++++++++++++
 cpp/arcticdb/version/version_core.hpp           | 14 ++++++++++++++
 cpp/arcticdb/version/version_store_api.cpp      | 13 +++++++++++++
 cpp/arcticdb/version/version_store_api.hpp      |  9 +++++++++
 7 files changed, 71 insertions(+)

diff --git a/cpp/arcticdb/processing/query_planner.hpp b/cpp/arcticdb/processing/query_planner.hpp
index 99106654e1..925ce42133 100644
--- a/cpp/arcticdb/processing/query_planner.hpp
+++ b/cpp/arcticdb/processing/query_planner.hpp
@@ -14,6 +14,7 @@
 
 namespace arcticdb {
 
+// TODO: Move these somewhere else, they are generally useful
 using GroupByClause = PartitionClause;
 using ClauseVariant = std::variant<std::shared_ptr<FilterClause>,
                                    std::shared_ptr<ProjectClause>,

diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp
index 99c1a245af..6ef2121c36 100644
--- a/cpp/arcticdb/version/local_versioned_engine.cpp
+++ b/cpp/arcticdb/version/local_versioned_engine.cpp
@@ -1151,6 +1151,17 @@ std::vector<std::variant<ReadVersionOutput, DataError>> LocalVersionedEngine::batch_read_internal(
     return read_versions_or_errors;
 }
 
+MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
+    ARCTICDB_UNUSED const std::vector<StreamId>& stream_ids,
+    ARCTICDB_UNUSED const std::vector<VersionQuery>& version_queries,
+    ARCTICDB_UNUSED std::vector<std::shared_ptr<ReadQuery>>& read_queries,
+    ARCTICDB_UNUSED const ReadOptions& read_options,
+    ARCTICDB_UNUSED const std::string& join, // TODO: Make a Clause or MultiSymbolClause
+    ARCTICDB_UNUSED const std::vector<ClauseVariant>& post_join_clauses,
+    ARCTICDB_UNUSED std::any& handler_data) {
+    return {};
+}
+
 void LocalVersionedEngine::write_version_and_prune_previous(
     bool prune_previous_versions,
     const AtomKey& new_version,

diff --git a/cpp/arcticdb/version/local_versioned_engine.hpp b/cpp/arcticdb/version/local_versioned_engine.hpp
index da140ceec4..7dd892ee41 100644
--- a/cpp/arcticdb/version/local_versioned_engine.hpp
+++ b/cpp/arcticdb/version/local_versioned_engine.hpp
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -286,6 +287,15 @@ class LocalVersionedEngine : public VersionedEngine {
         const ReadOptions& read_options,
         std::any& handler_data);
 
+    MultiSymbolReadOutput batch_read_with_join_internal(
+        const std::vector<StreamId>& stream_ids,
+        const std::vector<VersionQuery>& version_queries,
+        std::vector<std::shared_ptr<ReadQuery>>& read_queries,
+        const ReadOptions& read_options,
+        const std::string& join, // TODO: Make a Clause or MultiSymbolClause
+        const std::vector<ClauseVariant>& post_join_clauses,
+        std::any& handler_data);
+
     std::vector<std::variant<DescriptorItem, DataError>> batch_read_descriptor_internal(
         const std::vector<StreamId>& stream_ids,
         const std::vector<VersionQuery>& version_queries,

diff --git a/cpp/arcticdb/version/python_bindings.cpp b/cpp/arcticdb/version/python_bindings.cpp
index b839159843..6d0ba15015 100644
--- a/cpp/arcticdb/version/python_bindings.cpp
+++ b/cpp/arcticdb/version/python_bindings.cpp
@@ -767,6 +767,19 @@ void register_bindings(py::module &version, py::exception<arcticdb::ArcticException>& base_exception) {
              py::call_guard<SingleThreadMutexHolder>(), "Read a dataframe from the store")
+        .def("batch_read_with_join",
+             [&](PythonVersionStore& v,
+                 const std::vector<StreamId> &stream_ids,
+                 const std::vector<VersionQuery>& version_queries,
+                 std::vector<std::shared_ptr<ReadQuery>>& read_queries,
+                 const ReadOptions& read_options,
+                 const std::string& join, // TODO: Make a Clause or MultiSymbolClause
+                 std::vector<ClauseVariant> post_join_clauses
+                 ){
+                 auto handler_data = TypeHandlerRegistry::instance()->get_handler_data();
+                 return adapt_read_df(v.batch_read_with_join(stream_ids, version_queries, read_queries, read_options, join, post_join_clauses, handler_data));
+             },
+             py::call_guard<SingleThreadMutexHolder>(), "Read a dataframe from the store")
         .def("batch_read_keys",
             [&](PythonVersionStore& v, std::vector<AtomKey> atom_keys) {
                 return python_util::adapt_read_dfs(frame_to_read_result(v.batch_read_keys(atom_keys)));

diff --git a/cpp/arcticdb/version/version_core.hpp b/cpp/arcticdb/version/version_core.hpp
index f9e0ae9294..09e8b49c4f 100644
--- a/cpp/arcticdb/version/version_core.hpp
+++ b/cpp/arcticdb/version/version_core.hpp
@@ -53,6 +53,20 @@ struct ReadVersionOutput {
     FrameAndDescriptor frame_and_descriptor_;
 };
 
+struct MultiSymbolReadOutput {
+    // TODO: delete this ctor
+//    MultiSymbolReadOutput() = delete;
+    MultiSymbolReadOutput() = default;
+    MultiSymbolReadOutput(std::vector<VersionedItem>&& versioned_items, FrameAndDescriptor&& frame_and_descriptor):
+        versioned_items_(std::move(versioned_items)),
+        frame_and_descriptor_(std::move(frame_and_descriptor)) {}
+
+    ARCTICDB_MOVE_ONLY_DEFAULT(MultiSymbolReadOutput)
+
+    std::vector<VersionedItem> versioned_items_;
+    FrameAndDescriptor frame_and_descriptor_;
+};
+
 VersionedItem write_dataframe_impl(
     const std::shared_ptr<Store>& store,
     VersionId version_id,

diff --git a/cpp/arcticdb/version/version_store_api.cpp b/cpp/arcticdb/version/version_store_api.cpp
index ab8c369371..93d00ee6ef 100644
--- a/cpp/arcticdb/version/version_store_api.cpp
+++ b/cpp/arcticdb/version/version_store_api.cpp
@@ -787,6 +787,19 @@ std::vector<std::variant<ReadResult, DataError>> PythonVersionStore::batch_read(
     return res;
 }
 
+ReadResult PythonVersionStore::batch_read_with_join(
+    const std::vector<StreamId>& stream_ids,
+    const std::vector<VersionQuery>& version_queries,
+    std::vector<std::shared_ptr<ReadQuery>>& read_queries,
+    const ReadOptions& read_options,
+    const std::string& join, // TODO: Make a Clause or MultiSymbolClause
+    const std::vector<ClauseVariant>& post_join_clauses,
+    std::any& handler_data) {
+    auto versions_and_frame = batch_read_with_join_internal(stream_ids, version_queries, read_queries, read_options, join, post_join_clauses, handler_data);
+    // TODO: Decide what to do with VersionedItem part of read result
+    return create_python_read_result({}, std::move(versions_and_frame.frame_and_descriptor_));
+}
+
 void PythonVersionStore::delete_snapshot(const SnapshotId& snap_name) {
     ARCTICDB_RUNTIME_DEBUG(log::version(), "Command: delete_snapshot");
     auto opt_snapshot = get_snapshot(store(), snap_name);

diff --git a/cpp/arcticdb/version/version_store_api.hpp b/cpp/arcticdb/version/version_store_api.hpp
index e8d3057e34..2f132c18bb 100644
--- a/cpp/arcticdb/version/version_store_api.hpp
+++ b/cpp/arcticdb/version/version_store_api.hpp
@@ -300,6 +300,15 @@ class PythonVersionStore : public LocalVersionedEngine {
         const ReadOptions& read_options,
         std::any& handler_data);
 
+    ReadResult batch_read_with_join(
+        const std::vector<StreamId>& stream_ids,
+        const std::vector<VersionQuery>& version_queries,
+        std::vector<std::shared_ptr<ReadQuery>>& read_queries,
+        const ReadOptions& read_options,
+        const std::string& join, // TODO: Make a Clause or MultiSymbolClause
+        const std::vector<ClauseVariant>& post_join_clauses,
+        std::any& handler_data);
+
     std::vector<std::variant<std::pair<VersionedItem, py::object>, DataError>> batch_read_metadata(
         const std::vector<StreamId>& stream_ids,
         const std::vector<VersionQuery>& version_queries,
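The new MultiSymbolReadOutput mirrors ReadVersionOutput, except that it carries one VersionedItem per input symbol alongside the single joined frame. A Python rendering of that intended shape — the field types here are illustrative stand-ins, and what to surface for the versioned-item half is still an open TODO in batch_read_with_join above:

from dataclasses import dataclass, field
from typing import Any, List

@dataclass
class MultiSymbolReadOutputSketch:
    # One entry per input symbol (what to expose to Python is the open TODO)
    versioned_items: List[Any] = field(default_factory=list)
    # The single joined frame plus its descriptor and buffers
    frame_and_descriptor: Any = None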
From d9107e5722b56f8e6bc51399161f5ae35f065d1d Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Tue, 21 Jan 2025 16:57:09 +0000
Subject: [PATCH 07/36] Use Poly Clause from VersionStoreApi level downwards

---
 cpp/arcticdb/processing/query_planner.hpp       |  1 -
 cpp/arcticdb/version/local_versioned_engine.cpp |  2 +-
 cpp/arcticdb/version/local_versioned_engine.hpp |  3 +--
 cpp/arcticdb/version/python_bindings.cpp        | 11 ++++++++++-
 cpp/arcticdb/version/version_store_api.cpp      |  4 ++--
 cpp/arcticdb/version/version_store_api.hpp      |  2 +-
 6 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/cpp/arcticdb/processing/query_planner.hpp b/cpp/arcticdb/processing/query_planner.hpp
index 925ce42133..99106654e1 100644
--- a/cpp/arcticdb/processing/query_planner.hpp
+++ b/cpp/arcticdb/processing/query_planner.hpp
@@ -14,7 +14,6 @@
 
 namespace arcticdb {
 
-// TODO: Move these somewhere else, they are generally useful
 using GroupByClause = PartitionClause;
 using ClauseVariant = std::variant<std::shared_ptr<FilterClause>,
                                    std::shared_ptr<ProjectClause>,

diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp
index 6ef2121c36..f2d592c6d6 100644
--- a/cpp/arcticdb/version/local_versioned_engine.cpp
+++ b/cpp/arcticdb/version/local_versioned_engine.cpp
@@ -1157,7 +1157,7 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
     ARCTICDB_UNUSED std::vector<std::shared_ptr<ReadQuery>>& read_queries,
     ARCTICDB_UNUSED const ReadOptions& read_options,
     ARCTICDB_UNUSED const std::string& join, // TODO: Make a Clause or MultiSymbolClause
-    ARCTICDB_UNUSED const std::vector<ClauseVariant>& post_join_clauses,
+    ARCTICDB_UNUSED std::vector<std::shared_ptr<Clause>>&& post_join_clauses,
     ARCTICDB_UNUSED std::any& handler_data) {
     return {};
 }

diff --git a/cpp/arcticdb/version/local_versioned_engine.hpp b/cpp/arcticdb/version/local_versioned_engine.hpp
index 7dd892ee41..fbc01ff899 100644
--- a/cpp/arcticdb/version/local_versioned_engine.hpp
+++ b/cpp/arcticdb/version/local_versioned_engine.hpp
@@ -17,7 +17,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -293,7 +292,7 @@ class LocalVersionedEngine : public VersionedEngine {
         std::vector<std::shared_ptr<ReadQuery>>& read_queries,
         const ReadOptions& read_options,
         const std::string& join, // TODO: Make a Clause or MultiSymbolClause
-        const std::vector<ClauseVariant>& post_join_clauses,
+        std::vector<std::shared_ptr<Clause>>&& post_join_clauses,
         std::any& handler_data);
 
     std::vector<std::variant<DescriptorItem, DataError>> batch_read_descriptor_internal(

diff --git a/cpp/arcticdb/version/python_bindings.cpp b/cpp/arcticdb/version/python_bindings.cpp
index 6d0ba15015..4742592718 100644
--- a/cpp/arcticdb/version/python_bindings.cpp
+++ b/cpp/arcticdb/version/python_bindings.cpp
@@ -777,7 +777,16 @@ void register_bindings(py::module &version, py::exception<arcticdb::ArcticException>& base_exception) {
                 std::vector<ClauseVariant> post_join_clauses
                 ){
                 auto handler_data = TypeHandlerRegistry::instance()->get_handler_data();
-                return adapt_read_df(v.batch_read_with_join(stream_ids, version_queries, read_queries, read_options, join, post_join_clauses, handler_data));
+                // TODO: Remove duplication with ReadQuery interface
+                post_join_clauses = plan_query(std::move(post_join_clauses));
+                std::vector<std::shared_ptr<Clause>> _clauses;
+                for (auto&& clause: post_join_clauses) {
+                    util::variant_match(
+                        clause,
+                        [&](auto&& clause) {_clauses.emplace_back(std::make_shared<Clause>(*clause));}
+                    );
+                }
+                return adapt_read_df(v.batch_read_with_join(stream_ids, version_queries, read_queries, read_options, join, std::move(_clauses), handler_data));

diff --git a/cpp/arcticdb/version/version_store_api.cpp b/cpp/arcticdb/version/version_store_api.cpp
index 93d00ee6ef..2be57afaf0 100644
--- a/cpp/arcticdb/version/version_store_api.cpp
+++ b/cpp/arcticdb/version/version_store_api.cpp
@@ -793,9 +793,9 @@ ReadResult PythonVersionStore::batch_read_with_join(
     std::vector<std::shared_ptr<ReadQuery>>& read_queries,
     const ReadOptions& read_options,
     const std::string& join, // TODO: Make a Clause or MultiSymbolClause
-    const std::vector<ClauseVariant>& post_join_clauses,
+    std::vector<std::shared_ptr<Clause>>&& post_join_clauses,
     std::any& handler_data) {
-    auto versions_and_frame = batch_read_with_join_internal(stream_ids, version_queries, read_queries, read_options, join, post_join_clauses, handler_data);
+    auto versions_and_frame = batch_read_with_join_internal(stream_ids, version_queries, read_queries, read_options, join, std::move(post_join_clauses), handler_data);
     // TODO: Decide what to do with VersionedItem part of read result
     return create_python_read_result({}, std::move(versions_and_frame.frame_and_descriptor_));
 }

diff --git a/cpp/arcticdb/version/version_store_api.hpp b/cpp/arcticdb/version/version_store_api.hpp
index 2f132c18bb..2a834f6eda 100644
--- a/cpp/arcticdb/version/version_store_api.hpp
+++ b/cpp/arcticdb/version/version_store_api.hpp
@@ -306,7 +306,7 @@ class PythonVersionStore : public LocalVersionedEngine {
         std::vector<std::shared_ptr<ReadQuery>>& read_queries,
         const ReadOptions& read_options,
         const std::string& join, // TODO: Make a Clause or MultiSymbolClause
-        const std::vector<ClauseVariant>& post_join_clauses,
+        std::vector<std::shared_ptr<Clause>>&& post_join_clauses,
         std::any& handler_data);
 
     std::vector<std::variant<std::pair<VersionedItem, py::object>, DataError>> batch_read_metadata(
From 634247a983de0ca89588c2c9e6db0a9149f224cf Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Tue, 21 Jan 2025 18:23:40 +0000
Subject: [PATCH 08/36] Batch read and get set of entity IDs on a single
 ComponentManager for all individual symbol queries (untested)

---
 cpp/arcticdb/async/task_scheduler.hpp           |  6 +-
 cpp/arcticdb/version/local_versioned_engine.cpp | 30 +++++++
 cpp/arcticdb/version/version_core.cpp           | 85 +++++++++++++++++++
 cpp/arcticdb/version/version_core.hpp           |  8 ++
 4 files changed, 126 insertions(+), 3 deletions(-)

diff --git a/cpp/arcticdb/async/task_scheduler.hpp b/cpp/arcticdb/async/task_scheduler.hpp
index 087bd9975e..38c86d6a66 100644
--- a/cpp/arcticdb/async/task_scheduler.hpp
+++ b/cpp/arcticdb/async/task_scheduler.hpp
@@ -179,10 +179,10 @@ class TaskScheduler {
     using CPUSchedulerType = folly::FutureExecutor<folly::CPUThreadPoolExecutor>;
     using IOSchedulerType = folly::FutureExecutor<folly::IOThreadPoolExecutor>;
 
-    explicit TaskScheduler(const std::optional& cpu_thread_count = std::nullopt, const std::optional& io_thread_count = std::nullopt) :
+    explicit TaskScheduler(ARCTICDB_UNUSED const std::optional& cpu_thread_count = std::nullopt, ARCTICDB_UNUSED const std::optional& io_thread_count = std::nullopt) :
         cgroup_folder_("/sys/fs/cgroup"),
-        cpu_thread_count_(cpu_thread_count ? *cpu_thread_count : ConfigsMap::instance()->get_int("VersionStore.NumCPUThreads", get_default_num_cpus(cgroup_folder_))),
-        io_thread_count_(io_thread_count ? *io_thread_count : ConfigsMap::instance()->get_int("VersionStore.NumIOThreads", (int) (cpu_thread_count_ * 1.5))),
+        cpu_thread_count_(1),
+        io_thread_count_(1),
         cpu_exec_(cpu_thread_count_, std::make_shared("CPUPool")) ,
         io_exec_(io_thread_count_, std::make_shared("IOPool")){
         util::check(cpu_thread_count_ > 0 && io_thread_count_ > 0, "Zero IO or CPU threads: {} {}", io_thread_count_, cpu_thread_count_);

diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp
index f2d592c6d6..8f74214ea9 100644
--- a/cpp/arcticdb/version/local_versioned_engine.cpp
+++ b/cpp/arcticdb/version/local_versioned_engine.cpp
@@ -1159,6 +1159,36 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
     ARCTICDB_UNUSED const std::string& join, // TODO: Make a Clause or MultiSymbolClause
     ARCTICDB_UNUSED std::vector<std::shared_ptr<Clause>>&& post_join_clauses,
     ARCTICDB_UNUSED std::any& handler_data) {
+    py::gil_scoped_release release_gil;
+    auto opt_index_key_futs = batch_get_versions_async(store(), version_map(), stream_ids, version_queries);
+    std::vector<folly::Future<std::vector<EntityId>>> symbol_entities_futs;
+    symbol_entities_futs.reserve(opt_index_key_futs.size());
+    auto component_manager = std::make_shared<ComponentManager>();
+    for (auto idx = 0UL; idx < opt_index_key_futs.size(); ++idx) {
+        symbol_entities_futs.emplace_back(
+            std::move(opt_index_key_futs[idx]).thenValue([store = store(),
+                                                          idx,
+                                                          &stream_ids,
+                                                          &version_queries,
+                                                          read_query = read_queries.empty() ? std::make_shared<ReadQuery>(): read_queries[idx],
+                                                          &read_options,
+                                                          &component_manager](auto&& opt_index_key) {
+                std::variant<VersionedItem, StreamId> version_info;
+                if (opt_index_key.has_value()) {
+                    version_info = VersionedItem(std::move(*opt_index_key));
+                } else {
+                    if (opt_false(read_options.incompletes_)) {
+                        log::version().warn("No index: Key not found for {}, will attempt to use incomplete segments.", stream_ids[idx]);
+                        version_info = stream_ids[idx];
+                    } else {
+                        missing_data::raise(
+                            "batch_read_internal: version matching query '{}' not found for symbol '{}'", version_queries[idx], stream_ids[idx]);
+                    }
+                }
+                return read_entity_ids_for_version(store, version_info, read_query, read_options, component_manager);
+            })
+        );
+    }
     return {};
 }

diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp
index e81118a16b..aa55c2485e 100644
--- a/cpp/arcticdb/version/version_core.cpp
+++ b/cpp/arcticdb/version/version_core.cpp
@@ -922,6 +922,43 @@ folly::Future<std::vector<EntityId>> read_and_process(
     });
 }
 
+folly::Future<std::vector<EntityId>> read_and_process_2(
+    const std::shared_ptr<Store>& store,
+    const std::shared_ptr<PipelineContext>& pipeline_context,
+    const std::shared_ptr<ReadQuery>& read_query,
+    const ReadOptions& read_options,
+    std::shared_ptr<ComponentManager> component_manager
+    ) {
+    ProcessingConfig processing_config{opt_false(read_options.dynamic_schema_), pipeline_context->rows_};
+    for (auto& clause: read_query->clauses_) {
+        clause->set_processing_config(processing_config);
+        clause->set_component_manager(component_manager);
+    }
+
+    auto ranges_and_keys = generate_ranges_and_keys(*pipeline_context);
+
+    // Each element of the vector corresponds to one processing unit containing the list of indexes in ranges_and_keys required for that processing unit
+    // i.e. if the first processing unit needs ranges_and_keys[0] and ranges_and_keys[1], and the second needs ranges_and_keys[2] and ranges_and_keys[3]
+    // then the structure will be {{0, 1}, {2, 3}}
+    std::vector<std::vector<size_t>> processing_unit_indexes;
+    if (!read_query->clauses_.empty()) {
+        processing_unit_indexes = read_query->clauses_[0]->structure_for_processing(
+                ranges_and_keys);
+    } else {
+        processing_unit_indexes = structure_by_row_slice(ranges_and_keys);
+    }
+
+    // Start reading as early as possible
+    auto segment_and_slice_futures = generate_segment_and_slice_futures(store, pipeline_context, processing_config, std::move(ranges_and_keys));
+
+    return schedule_clause_processing(
+        component_manager,
+        std::move(segment_and_slice_futures),
+        std::move(processing_unit_indexes),
+        std::make_shared<std::vector<std::shared_ptr<Clause>>>(read_query->clauses_))
+        .via(&async::cpu_executor());
+}
+
 void add_index_columns_to_query(const ReadQuery& read_query, const TimeseriesDescriptor& desc) {
     if (read_query.columns.has_value()) {
         auto index_columns = stream::get_index_columns_from_descriptor(desc);
@@ -1535,6 +1572,16 @@ folly::Future<SegmentInMemory> do_direct_read_or_process(
     }
 }
 
+folly::Future<std::vector<EntityId>> do_process(
+    const std::shared_ptr<Store>& store,
+    const std::shared_ptr<ReadQuery>& read_query,
+    const ReadOptions& read_options,
+    const std::shared_ptr<PipelineContext>& pipeline_context,
+    std::shared_ptr<ComponentManager> component_manager) {
+    util::check_rte(!pipeline_context->is_pickled(), "Cannot perform multi-symbol join on pickled data");
+    return read_and_process_2(store, pipeline_context, read_query, read_options, component_manager);
+}
+
 VersionedItem collate_and_write(
     const std::shared_ptr<Store>& store,
     const std::shared_ptr<PipelineContext>& pipeline_context,
@@ -2027,6 +2074,44 @@ folly::Future<ReadVersionOutput> read_frame_for_version(
         });
     });
 }
+
+folly::Future<std::vector<EntityId>> read_entity_ids_for_version(
+    const std::shared_ptr<Store>& store,
+    const std::variant<VersionedItem, StreamId>& version_info,
+    const std::shared_ptr<ReadQuery>& read_query,
+    const ReadOptions& read_options,
+    std::shared_ptr<ComponentManager> component_manager) {
+    using namespace arcticdb::pipelines;
+    auto pipeline_context = std::make_shared<PipelineContext>();
+    VersionedItem res_versioned_item;
+
+    if(std::holds_alternative<StreamId>(version_info)) {
+        pipeline_context->stream_id_ = std::get<StreamId>(version_info);
+    } else {
+        pipeline_context->stream_id_ = std::get<VersionedItem>(version_info).key_.id();
+        read_indexed_keys_to_pipeline(store, pipeline_context, std::get<VersionedItem>(version_info), *read_query, read_options);
+    }
+
+    user_input::check(!pipeline_context->multi_key_, "Multi-symbol joins not supported with recursively normalized data");
+
+    if(opt_false(read_options.incompletes_)) {
+        util::check(std::holds_alternative<IndexRange>(read_query->row_filter), "Streaming read requires date range filter");
+        const auto& query_range = std::get<IndexRange>(read_query->row_filter);
+        const auto existing_range = pipeline_context->index_range();
+        if(!existing_range.specified_ || query_range.end_ > existing_range.end_)
+            read_incompletes_to_pipeline(store, pipeline_context, *read_query, read_options, false, false, false, opt_false(read_options.dynamic_schema_));
+    }
+
+    if(std::holds_alternative<StreamId>(version_info) && !pipeline_context->incompletes_after_) {
+        return std::vector<EntityId>{};
+    }
+
+    modify_descriptor(pipeline_context, read_options);
+    generate_filtered_field_descriptors(pipeline_context, read_query->columns);
+    ARCTICDB_DEBUG(log::version(), "Fetching data to frame");
+
+    return version_store::do_process(store, read_query, read_options, pipeline_context, component_manager);
+}
 } //namespace arcticdb::version_store

diff --git a/cpp/arcticdb/version/version_core.hpp b/cpp/arcticdb/version/version_core.hpp
index 09e8b49c4f..6353213448 100644
--- a/cpp/arcticdb/version/version_core.hpp
+++ b/cpp/arcticdb/version/version_core.hpp
@@ -225,6 +225,14 @@ folly::Future<ReadVersionOutput> read_frame_for_version(
     std::any& handler_data
 );
 
+folly::Future<std::vector<EntityId>> read_entity_ids_for_version(
+    const std::shared_ptr<Store>& store,
+    const std::variant<VersionedItem, StreamId>& version_info,
+    const std::shared_ptr<ReadQuery>& read_query,
+    const ReadOptions& read_options,
+    std::shared_ptr<ComponentManager> component_manager
+);
+
 class DeleteIncompleteKeysOnExit {
   public:
     DeleteIncompleteKeysOnExit(

From 4aafe67fa140c310a561d1f57b0b14500805c35c Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Wed, 22 Jan 2025 10:06:41 +0000
Subject: [PATCH 09/36] Fix schedule_clause_processing when there are zero
 clauses

---
 cpp/arcticdb/version/local_versioned_engine.cpp | 1 +
 cpp/arcticdb/version/version_core.cpp           | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp
index 8f74214ea9..78cb56de1b 100644
--- a/cpp/arcticdb/version/local_versioned_engine.cpp
+++ b/cpp/arcticdb/version/local_versioned_engine.cpp
@@ -1189,6 +1189,7 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
             })
         );
     }
+    auto symbol_entities = folly::collect(symbol_entities_futs).get();
     return {};
 }

diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp
index aa55c2485e..f8473ff22b 100644
--- a/cpp/arcticdb/version/version_core.cpp
+++ b/cpp/arcticdb/version/version_core.cpp
@@ -528,6 +528,9 @@ void add_slice_to_component_manager(
 }
 
 size_t num_scheduling_iterations(const std::vector<std::shared_ptr<Clause>>& clauses) {
+    if (clauses.empty()) {
+        return 0;
+    }
     size_t res = 1UL;
     auto it = std::next(clauses.cbegin());
     while (it != clauses.cend()) {
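The zero-clause guard matters because the per-symbol pipelines built in batch_read_with_join_internal may carry no clauses at all (a plain read of each symbol before the join). A Python restatement of the fixed entry condition — the structure-comparison loop that computes the remaining count is elided here:

def num_scheduling_iterations(clauses) -> int:
    # Fixed degenerate case: no clauses means no scheduling iterations;
    # previously the count unconditionally started from 1.
    if not clauses:
        return 0
    iterations = 1
    # ... plus one further iteration whenever a clause needs the entities
    # restructured relative to its predecessor (C++ loop elided here).
    return iterations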
From ec189c7ad108b99bdf0bbee9b401ade9e08fdd76 Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Wed, 22 Jan 2025 13:52:28 +0000
Subject: [PATCH 10/36] Produce one SegmentInMemory from output of multiple
 symbols' pipelines

---
 cpp/arcticdb/version/local_versioned_engine.cpp | 16 +++++++++++++++-
 cpp/arcticdb/version/version_core.cpp           |  5 ++---
 cpp/arcticdb/version/version_core.hpp           |  7 +++++++
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp
index 78cb56de1b..62c1435f7f 100644
--- a/cpp/arcticdb/version/local_versioned_engine.cpp
+++ b/cpp/arcticdb/version/local_versioned_engine.cpp
@@ -1189,7 +1189,21 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
             })
         );
     }
-    auto symbol_entities = folly::collect(symbol_entities_futs).get();
+    ARCTICDB_UNUSED auto segment_in_memory = folly::collect(symbol_entities_futs)
+        .via(&async::cpu_executor())
+        .thenValue([](std::vector<std::vector<EntityId>>&& entity_id_vectors) {
+            return flatten_entities(std::move(entity_id_vectors));
+        }).thenValueInline([component_manager](std::vector<EntityId>&& processed_entity_ids) {
+            auto proc = gather_entities<std::shared_ptr<SegmentInMemory>, std::shared_ptr<RowRange>, std::shared_ptr<ColRange>>(*component_manager, std::move(processed_entity_ids));
+            // TODO: Handle output descriptors
+            return collect_segments(std::move(proc));
+        }).thenValueInline([store=store(), &handler_data](std::vector<SliceAndKey>&& slice_and_keys) {
+            auto pipeline_context = std::make_shared<PipelineContext>();
+            internal::check(!slice_and_keys.empty(), "No slice and keys in batch_read_with_join_internal");
+            pipeline_context->set_descriptor(slice_and_keys[0].segment(store).descriptor());
+            ReadOptions read_options;
+            return prepare_output_frame(std::move(slice_and_keys), pipeline_context, store, read_options, handler_data);
+        }).get();
     return {};
 }

diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp
index f8473ff22b..8d6af65072 100644
--- a/cpp/arcticdb/version/version_core.cpp
+++ b/cpp/arcticdb/version/version_core.cpp
@@ -1304,7 +1304,7 @@ void copy_frame_data_to_buffer(
 }
 
 struct CopyToBufferTask : async::BaseTask {
-    SegmentInMemory&& source_segment_;
+    SegmentInMemory source_segment_;
     SegmentInMemory target_segment_;
     FrameSlice frame_slice_;
     DecodePathData shared_data_;
@@ -1362,11 +1362,10 @@ folly::Future<folly::Unit> copy_segments_to_frame(
     DecodePathData shared_data;
     for (auto context_row : folly::enumerate(*pipeline_context)) {
         auto &slice_and_key = context_row->slice_and_key();
-        auto &segment = slice_and_key.segment(store);
 
         copy_tasks.emplace_back(async::submit_cpu_task(
             CopyToBufferTask{
-                std::move(segment),
+                slice_and_key.release_segment(store),
                 frame,
                 context_row->slice_and_key().slice(),
                 shared_data,

diff --git a/cpp/arcticdb/version/version_core.hpp b/cpp/arcticdb/version/version_core.hpp
index 6353213448..1e5894f556 100644
--- a/cpp/arcticdb/version/version_core.hpp
+++ b/cpp/arcticdb/version/version_core.hpp
@@ -261,6 +261,13 @@ std::optional<DeleteIncompleteKeysOnExit> get_delete_keys_on_failure(
     const std::shared_ptr<Store>& store,
     const CompactIncompleteOptions& options);
 
+folly::Future<SegmentInMemory> prepare_output_frame(
+    std::vector<SliceAndKey>&& items,
+    const std::shared_ptr<PipelineContext>& pipeline_context,
+    const std::shared_ptr<Store>& store,
+    const ReadOptions& read_options,
+    std::any& handler_data);
+
 } //namespace arcticdb::version_store
 
 namespace arcticdb {

From b5e10a0223a2805bfd7bbe2647b11a756ee09b9b Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Wed, 22 Jan 2025 14:49:37 +0000
Subject: [PATCH 11/36] First, most basic, test passing

---
 cpp/arcticdb/version/local_versioned_engine.cpp | 41 +++++++++++--------
 cpp/arcticdb/version/version_core.hpp           |  4 +-
 .../test_symbol_concatenation.py                |  6 +--
 3 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp
index 62c1435f7f..88e127003f 100644
--- a/cpp/arcticdb/version/local_versioned_engine.cpp
+++ b/cpp/arcticdb/version/local_versioned_engine.cpp
@@ -1164,6 +1164,9 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
     std::vector<folly::Future<std::vector<EntityId>>> symbol_entities_futs;
     symbol_entities_futs.reserve(opt_index_key_futs.size());
     auto component_manager = std::make_shared<ComponentManager>();
+    auto pipeline_context = std::make_shared<PipelineContext>();
+    auto norm_meta_mtx = std::make_shared<std::mutex>();
+    DecodePathData shared_data;
     for (auto idx = 0UL; idx < opt_index_key_futs.size(); ++idx) {
         symbol_entities_futs.emplace_back(
             std::move(opt_index_key_futs[idx]).thenValue([store = store(),
@@ -1172,24 +1175,22 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
                                                           idx,
-                                                          &stream_ids,
-                                                          &version_queries,
                                                           read_query = read_queries.empty() ? std::make_shared<ReadQuery>(): read_queries[idx],
                                                           &read_options,
-                                                          &component_manager](auto&& opt_index_key) {
+                                                          &component_manager,
+                                                          pipeline_context,
+                                                          norm_meta_mtx](auto&& opt_index_key) mutable {
                 std::variant<VersionedItem, StreamId> version_info;
-                if (opt_index_key.has_value()) {
-                    version_info = VersionedItem(std::move(*opt_index_key));
-                } else {
-                    if (opt_false(read_options.incompletes_)) {
-                        log::version().warn("No index: Key not found for {}, will attempt to use incomplete segments.", stream_ids[idx]);
-                        version_info = stream_ids[idx];
-                    } else {
-                        missing_data::raise(
-                            "batch_read_internal: version matching query '{}' not found for symbol '{}'", version_queries[idx], stream_ids[idx]);
-                    }
+                internal::check(opt_index_key.has_value(), "batch_read_with_join_internal not supported with non-indexed data");
+                auto index_key_seg = store->read_sync(*opt_index_key).second;
+                {
+                    std::lock_guard lock(*norm_meta_mtx);
+                    pipeline_context->norm_meta_ = std::make_unique(std::move(*index_key_seg.mutable_index_descriptor().mutable_proto().mutable_normalization()));
                 }
+                version_info = VersionedItem(std::move(*opt_index_key));
                 return read_entity_ids_for_version(store, version_info, read_query, read_options, component_manager);
             })
         );
     }
-    ARCTICDB_UNUSED auto segment_in_memory = folly::collect(symbol_entities_futs)
+    return folly::collect(symbol_entities_futs)
         .via(&async::cpu_executor())
         .thenValue([](std::vector<std::vector<EntityId>>&& entity_id_vectors) {
             return flatten_entities(std::move(entity_id_vectors));
         }).thenValueInline([component_manager](std::vector<EntityId>&& processed_entity_ids) {
             auto proc = gather_entities<std::shared_ptr<SegmentInMemory>, std::shared_ptr<RowRange>, std::shared_ptr<ColRange>>(*component_manager, std::move(processed_entity_ids));
             // TODO: Handle output descriptors
             return collect_segments(std::move(proc));
-        }).thenValueInline([store=store(), &handler_data](std::vector<SliceAndKey>&& slice_and_keys) {
-            auto pipeline_context = std::make_shared<PipelineContext>();
+        }).thenValueInline([store=store(), &handler_data, pipeline_context](std::vector<SliceAndKey>&& slice_and_keys) {
             internal::check(!slice_and_keys.empty(), "No slice and keys in batch_read_with_join_internal");
             pipeline_context->set_descriptor(slice_and_keys[0].segment(store).descriptor());
-            ReadOptions read_options;
-            return prepare_output_frame(std::move(slice_and_keys), pipeline_context, store, read_options, handler_data);
+            return prepare_output_frame(std::move(slice_and_keys), pipeline_context, store, ReadOptions{}, handler_data);
+        }).thenValueInline([&handler_data, pipeline_context, shared_data](SegmentInMemory&& frame) mutable {
+            return reduce_and_fix_columns(pipeline_context, frame, ReadOptions{}, handler_data)
+                .thenValue([pipeline_context, frame, shared_data](auto&&) mutable {
+                    return MultiSymbolReadOutput{{},
+                                                 {frame,
+                                                  timeseries_descriptor_from_pipeline_context(pipeline_context, {}, pipeline_context->bucketize_dynamic_),
+                                                  {},
+                                                  shared_data.buffers()}};
+                });
         }).get();
-    return {};
 }

diff --git a/cpp/arcticdb/version/version_core.hpp b/cpp/arcticdb/version/version_core.hpp
index 1e5894f556..8d5aaec183 100644
--- a/cpp/arcticdb/version/version_core.hpp
+++ b/cpp/arcticdb/version/version_core.hpp
@@ -54,9 +54,7 @@ struct ReadVersionOutput {
 };
 
 struct MultiSymbolReadOutput {
-    // TODO: delete this ctor
-//    MultiSymbolReadOutput() = delete;
-    MultiSymbolReadOutput() = default;
+    MultiSymbolReadOutput() = delete;
     MultiSymbolReadOutput(std::vector<VersionedItem>&& versioned_items, FrameAndDescriptor&& frame_and_descriptor):
         versioned_items_(std::move(versioned_items)),
         frame_and_descriptor_(std::move(frame_and_descriptor)) {}

diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
index 4c6de62042..57b541027c 100644
--- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
+++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
@@ -30,11 +30,11 @@ def test_symbol_concat_basic(lmdb_library):
     lazy_df_2 = lazy_df_2.date_range((pd.Timestamp("2025-01-04"), None))
     lazy_df_3 = lib.read("sym3", lazy=True)
 
-    lazy_df = concat([lazy_df_2, lazy_df_3, lazy_df_1])
+    lazy_df = concat([lazy_df_1, lazy_df_2, lazy_df_3])
 
-    lazy_df.resample("2D").agg({"col": "sum"})
+    # lazy_df.resample("2D").agg({"col": "sum"})
 
     received = lazy_df.collect().data
-    expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2D").agg({"col": "sum"})
+    expected = pd.concat([df_1, df_2.iloc[1:], df_3])  # .resample("2D").agg({"col": "sum"})
     assert_frame_equal(expected, received)
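With the basic path now working, the test's expectation can be reproduced in plain pandas: per-symbol clauses (the date_range on sym2) run first, then the frames are concatenated in the order passed to concat():

import numpy as np
import pandas as pd

df_1 = pd.DataFrame({"col": np.arange(3, dtype=np.int64)}, index=pd.date_range("2025-01-01", periods=3))
df_2 = pd.DataFrame({"col": np.arange(4, dtype=np.int64)}, index=pd.date_range("2025-01-03", periods=4))
df_3 = pd.DataFrame({"col": np.arange(5, dtype=np.int64)}, index=pd.date_range("2025-03-07", periods=5))

# date_range((2025-01-04, None)) keeps all but the first row of df_2
expected = pd.concat([df_1, df_2.iloc[1:], df_3])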
From 400facf0ab2bb30711095011aaf32d3330a3e491 Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Wed, 22 Jan 2025 16:22:55 +0000
Subject: [PATCH 12/36] Refactor schedule_remaining_iterations into own method

---
 cpp/arcticdb/version/local_versioned_engine.cpp | 16 +++++-----
 cpp/arcticdb/version/version_core.cpp           | 54 +++++++++---------
 .../test_symbol_concatenation.py                |  4 ++--
 3 files changed, 40 insertions(+), 34 deletions(-)

diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp
index 88e127003f..b9b2612cb1 100644
--- a/cpp/arcticdb/version/local_versioned_engine.cpp
+++ b/cpp/arcticdb/version/local_versioned_engine.cpp
@@ -1170,12 +1170,12 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
     for (auto idx = 0UL; idx < opt_index_key_futs.size(); ++idx) {
         symbol_entities_futs.emplace_back(
             std::move(opt_index_key_futs[idx]).thenValue([store = store(),
-                                                          idx,
-                                                          read_query = read_queries.empty() ? std::make_shared<ReadQuery>(): read_queries[idx],
-                                                          &read_options,
-                                                          &component_manager,
-                                                          pipeline_context,
-                                                          norm_meta_mtx](auto&& opt_index_key) mutable {
+                                                         idx,
+                                                         read_query = read_queries.empty() ? std::make_shared<ReadQuery>(): read_queries[idx],
+                                                         &read_options,
+                                                         &component_manager,
+                                                         pipeline_context,
+                                                         norm_meta_mtx](auto&& opt_index_key) mutable {
@@ -1188,6 +1188,8 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
             })
         );
     }
+//    auto entity_ids_vec_fut = folly::collect(symbol_entities_futs).via(&async::io_executor());
+
     return folly::collect(symbol_entities_futs)
         .via(&async::cpu_executor())
         .thenValue([](std::vector<std::vector<EntityId>>&& entity_id_vectors) {

diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp
index 8d6af65072..203753b02d 100644
--- a/cpp/arcticdb/version/version_core.cpp
+++ b/cpp/arcticdb/version/version_core.cpp
@@ -640,6 +640,34 @@ std::shared_ptr<std::vector<folly::Future<std::vector<EntityId>>>> schedule_first_iteration(
     return futures;
 }
 
+folly::Future<std::vector<EntityId>> schedule_remaining_iterations(
+    folly::Future<std::vector<std::vector<EntityId>>>&& entity_ids_vec_fut,
+    std::shared_ptr<std::vector<std::shared_ptr<Clause>>> clauses
+    ) {
+    const auto scheduling_iterations = num_scheduling_iterations(*clauses);
+    for (auto i = 1UL; i < scheduling_iterations; ++i) {
+        entity_ids_vec_fut = std::move(entity_ids_vec_fut).thenValue([clauses, scheduling_iterations, i] (std::vector<std::vector<EntityId>>&& entity_id_vectors) {
+            ARCTICDB_RUNTIME_DEBUG(log::memory(), "Scheduling iteration {} of {}", i, scheduling_iterations);
+
+            util::check(!clauses->empty(), "Scheduling iteration {} has no clauses to process", scheduling_iterations);
+            remove_processed_clauses(*clauses);
+            auto next_units_of_work = clauses->front()->structure_for_processing(std::move(entity_id_vectors));
+
+            std::vector<folly::Future<std::vector<EntityId>>> work_futures;
+            for(auto&& unit_of_work : next_units_of_work) {
+                ARCTICDB_RUNTIME_DEBUG(log::memory(), "Scheduling work for entity ids: {}", unit_of_work);
+                work_futures.emplace_back(async::submit_cpu_task(async::MemSegmentProcessingTask{*clauses, std::move(unit_of_work)}));
+            }
+
+            return folly::collect(work_futures).via(&async::io_executor());
+        });
+    }
+
+    return std::move(entity_ids_vec_fut).thenValueInline([](std::vector<std::vector<EntityId>>&& entity_id_vectors) {
+        return flatten_entities(std::move(entity_id_vectors));
+    });
+}
+
 folly::Future<std::vector<EntityId>> schedule_clause_processing(
     std::shared_ptr<ComponentManager> component_manager,
     std::vector<folly::Future<SegmentAndSlice>>&& segment_and_slice_futures,
@@ -672,29 +700,7 @@ folly::Future<std::vector<EntityId>> schedule_clause_processing(
         clauses);
 
     auto entity_ids_vec_fut = folly::collect(*futures).via(&async::io_executor());
-
-    const auto scheduling_iterations = num_scheduling_iterations(*clauses);
-    for (auto i = 1UL; i < scheduling_iterations; ++i) {
-        entity_ids_vec_fut = std::move(entity_ids_vec_fut).thenValue([clauses, scheduling_iterations, i] (std::vector<std::vector<EntityId>>&& entity_id_vectors) {
-            ARCTICDB_RUNTIME_DEBUG(log::memory(), "Scheduling iteration {} of {}", i, scheduling_iterations);
-
-            util::check(!clauses->empty(), "Scheduling iteration {} has no clauses to process", scheduling_iterations);
-            remove_processed_clauses(*clauses);
-            auto next_units_of_work = clauses->front()->structure_for_processing(std::move(entity_id_vectors));
-
-            std::vector<folly::Future<std::vector<EntityId>>> work_futures;
-            for(auto&& unit_of_work : next_units_of_work) {
-                ARCTICDB_RUNTIME_DEBUG(log::memory(), "Scheduling work for entity ids: {}", unit_of_work);
-                work_futures.emplace_back(async::submit_cpu_task(async::MemSegmentProcessingTask{*clauses, std::move(unit_of_work)}));
-            }
-
-            return folly::collect(work_futures).via(&async::io_executor());
-        });
-    }
-
-    return std::move(entity_ids_vec_fut).thenValueInline([](std::vector<std::vector<EntityId>>&& entity_id_vectors) {
-        return flatten_entities(std::move(entity_id_vectors));
-    });
+    return schedule_remaining_iterations(std::move(entity_ids_vec_fut), clauses);
 }
 
 void set_output_descriptors(
@@ -2112,7 +2118,7 @@ folly::Future<std::vector<EntityId>> read_entity_ids_for_version(
     generate_filtered_field_descriptors(pipeline_context, read_query->columns);
     ARCTICDB_DEBUG(log::version(), "Fetching data to frame");
 
-    return version_store::do_process(store, read_query, read_options, pipeline_context, component_manager);
+    return do_process(store, read_query, read_options, pipeline_context, component_manager);
 }
 } //namespace arcticdb::version_store

diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
index 57b541027c..d2888e48ca 100644
--- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
+++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
@@ -32,9 +32,9 @@ def test_symbol_concat_basic(lmdb_library):
 
     lazy_df = concat([lazy_df_1, lazy_df_2, lazy_df_3])
 
-    # lazy_df.resample("2D").agg({"col": "sum"})
+    lazy_df.resample("2D").agg({"col": "sum"})
 
     received = lazy_df.collect().data
-    expected = pd.concat([df_1, df_2.iloc[1:], df_3])  # .resample("2D").agg({"col": "sum"})
+    expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2D").agg({"col": "sum"})
     assert_frame_equal(expected, received)
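The extracted schedule_remaining_iterations drains the clause list iteratively: each iteration restructures the per-unit entity IDs via the next clause, processes every resulting unit, and the final vectors are flattened into one entity-ID list. A toy, synchronous Python restatement of that loop — the real code chains folly futures, groups several clauses per scheduling iteration, and runs units as CPU tasks:

class ToyClause:
    def structure_for_processing(self, entity_id_vectors):
        # Merge all incoming units into a single unit of work
        return [[eid for ids in entity_id_vectors for eid in ids]]

    def process(self, entity_ids):
        return entity_ids

def run_remaining_iterations(entity_id_vectors, clauses):
    for clause in clauses:
        units = clause.structure_for_processing(entity_id_vectors)
        entity_id_vectors = [clause.process(unit) for unit in units]
    # Final flatten, as in the thenValueInline at the end of the C++ version
    return [eid for ids in entity_id_vectors for eid in ids]

assert run_remaining_iterations([[0, 1], [2]], [ToyClause()]) == [0, 1, 2]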
first_range = false; + } else { + new_range.first = prev_range_end; + new_range.second = new_range.first + old_range.diff(); + } + prev_range_end = new_range.second; + } + + std::vector> new_row_ranges; + new_row_ranges.reserve(old_row_ranges.size()); + std::vector ranges_and_entities; + ranges_and_entities.reserve(entity_ids.size()); + for (size_t idx=0; idx(row_range_mapping.at(*old_row_ranges[idx])); + ranges_and_entities.emplace_back(entity_ids[idx], new_row_range, col_ranges[idx]); + new_row_ranges.emplace_back(std::move(new_row_range)); + } + + component_manager_->replace_entities>(entity_ids, new_row_ranges); + + auto new_structure_offsets = structure_by_row_slice(ranges_and_entities); + return offsets_to_entity_ids(new_structure_offsets, ranges_and_entities); +} + +std::vector ConcatClause::process(std::vector&& entity_ids) const { + return std::move(entity_ids); +} + +std::string ConcatClause::to_string() const { + return "CONCAT"; +} + } diff --git a/cpp/arcticdb/processing/clause.hpp b/cpp/arcticdb/processing/clause.hpp index 134780d8db..004cbedc0f 100644 --- a/cpp/arcticdb/processing/clause.hpp +++ b/cpp/arcticdb/processing/clause.hpp @@ -669,4 +669,35 @@ struct DateRangeClause { [[nodiscard]] std::string to_string() const; }; +struct ConcatClause { + + ClauseInfo clause_info_; + std::shared_ptr component_manager_; + + ConcatClause(); + + ARCTICDB_MOVE_COPY_DEFAULT(ConcatClause) + + [[nodiscard]] std::vector> structure_for_processing(ARCTICDB_UNUSED std::vector& ranges_and_keys) { + internal::raise("ConcatClause should never be first in the pipeline"); + } + + [[nodiscard]] std::vector> structure_for_processing(std::vector>&& entity_ids_vec); + + [[nodiscard]] std::vector process(std::vector&& entity_ids) const; + + [[nodiscard]] const ClauseInfo& clause_info() const { + return clause_info_; + } + + void set_processing_config(ARCTICDB_UNUSED const ProcessingConfig& processing_config) { + } + + void set_component_manager(std::shared_ptr component_manager) { + component_manager_ = component_manager; + } + + [[nodiscard]] std::string to_string() const; +}; + }//namespace arcticdb diff --git a/cpp/arcticdb/processing/clause_utils.hpp b/cpp/arcticdb/processing/clause_utils.hpp index aa3ef0c1d1..a7c9446b66 100644 --- a/cpp/arcticdb/processing/clause_utils.hpp +++ b/cpp/arcticdb/processing/clause_utils.hpp @@ -28,7 +28,8 @@ enum class ProcessingStructure { ROW_SLICE, TIME_BUCKETED, HASH_BUCKETED, - ALL + ALL, + MULTI_SYMBOL }; struct KeepCurrentIndex{}; diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index b9b2612cb1..8c4a3e9c93 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1152,13 +1152,12 @@ std::vector> LocalVersionedEngine::ba } MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( - ARCTICDB_UNUSED const std::vector& stream_ids, - ARCTICDB_UNUSED const std::vector& version_queries, - ARCTICDB_UNUSED std::vector>& read_queries, - ARCTICDB_UNUSED const ReadOptions& read_options, - ARCTICDB_UNUSED const std::string& join, // TODO: Make a Clause or MultiSymbolClause - ARCTICDB_UNUSED std::vector>&& post_join_clauses, - ARCTICDB_UNUSED std::any& handler_data) { + const std::vector& stream_ids, + const std::vector& version_queries, + std::vector>& read_queries, + const ReadOptions& read_options, + std::vector>&& clauses, + std::any& handler_data) { py::gil_scoped_release release_gil; auto opt_index_key_futs = 
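// Resolve the requested version of every symbol concurrently; each future
// yields the index key identifying that symbol's version.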
batch_get_versions_async(store(), version_map(), stream_ids, version_queries); std::vector>> symbol_entities_futs; @@ -1188,13 +1187,14 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( }) ); } -// auto entity_ids_vec_fut = folly::collect(symbol_entities_futs).via(&async::io_executor()); + auto entity_ids_vec_fut = folly::collect(symbol_entities_futs).via(&async::io_executor()); + for (auto& clause: clauses) { + clause->set_component_manager(component_manager); + } + auto clauses_ptr = std::make_shared>>(std::move(clauses)); - return folly::collect(symbol_entities_futs) - .via(&async::cpu_executor()) - .thenValue([](std::vector>&& entity_id_vectors) { - return flatten_entities(std::move(entity_id_vectors)); - }).thenValueInline([component_manager](std::vector&& processed_entity_ids) { + return schedule_remaining_iterations(std::move(entity_ids_vec_fut), clauses_ptr, true) + .thenValueInline([component_manager](std::vector&& processed_entity_ids) { auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(processed_entity_ids)); // TODO: Handle output descriptors return collect_segments(std::move(proc)); @@ -1212,6 +1212,29 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( shared_data.buffers()}}; }); }).get(); + +// return folly::collect(symbol_entities_futs) +// .via(&async::cpu_executor()) +// .thenValue([](std::vector>&& entity_id_vectors) { +// return flatten_entities(std::move(entity_id_vectors)); +// }).thenValueInline([component_manager](std::vector&& processed_entity_ids) { +// auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(processed_entity_ids)); +// // TODO: Handle output descriptors +// return collect_segments(std::move(proc)); +// }).thenValueInline([store=store(), &handler_data, pipeline_context](std::vector&& slice_and_keys) { +// internal::check(!slice_and_keys.empty(), "No slice and keys in batch_read_with_join_internal"); +// pipeline_context->set_descriptor(slice_and_keys[0].segment(store).descriptor()); +// return prepare_output_frame(std::move(slice_and_keys), pipeline_context, store, ReadOptions{}, handler_data); +// }).thenValueInline([&handler_data, pipeline_context, shared_data](SegmentInMemory&& frame) mutable { +// return reduce_and_fix_columns(pipeline_context, frame, ReadOptions{}, handler_data) +// .thenValue([pipeline_context, frame, shared_data](auto&&) mutable { +// return MultiSymbolReadOutput{{}, +// {frame, +// timeseries_descriptor_from_pipeline_context(pipeline_context, {}, pipeline_context->bucketize_dynamic_), +// {}, +// shared_data.buffers()}}; +// }); +// }).get(); } void LocalVersionedEngine::write_version_and_prune_previous( diff --git a/cpp/arcticdb/version/local_versioned_engine.hpp b/cpp/arcticdb/version/local_versioned_engine.hpp index fbc01ff899..c3aea05425 100644 --- a/cpp/arcticdb/version/local_versioned_engine.hpp +++ b/cpp/arcticdb/version/local_versioned_engine.hpp @@ -291,8 +291,7 @@ class LocalVersionedEngine : public VersionedEngine { const std::vector& version_queries, std::vector>& read_queries, const ReadOptions& read_options, - const std::string& join, // TODO: Make a Clause or MultiSymbolClause - std::vector>&& post_join_clauses, + std::vector>&& clauses, std::any& handler_data); std::vector> batch_read_descriptor_internal( diff --git a/cpp/arcticdb/version/python_bindings.cpp b/cpp/arcticdb/version/python_bindings.cpp index 4742592718..e2d99bea7d 100644 --- 
a/cpp/arcticdb/version/python_bindings.cpp +++ b/cpp/arcticdb/version/python_bindings.cpp @@ -402,6 +402,10 @@ void register_bindings(py::module &version, py::exception>(version, "ConcatClause") + .def(py::init<>()) + .def("__str__", &ConcatClause::to_string); + py::class_>(version, "PythonVersionStoreReadQuery") .def(py::init()) .def_readwrite("columns",&ReadQuery::columns) @@ -773,20 +777,25 @@ void register_bindings(py::module &version, py::exception& version_queries, std::vector>& read_queries, const ReadOptions& read_options, - const std::string& join, // TODO: Make a Clause or MultiSymbolClause + ClauseVariant join, std::vector post_join_clauses ){ auto handler_data = TypeHandlerRegistry::instance()->get_handler_data(); // TODO: Remove duplication with ReadQuery interface post_join_clauses = plan_query(std::move(post_join_clauses)); std::vector> _clauses; + // TODO: Verify at some point that this is a multi-symbol clause + util::variant_match( + join, + [&](auto&& clause) {_clauses.emplace_back(std::make_shared(*clause));} + ); for (auto&& clause: post_join_clauses) { util::variant_match( clause, [&](auto&& clause) {_clauses.emplace_back(std::make_shared(*clause));} ); } - return adapt_read_df(v.batch_read_with_join(stream_ids, version_queries, read_queries, read_options, join, std::move(_clauses), handler_data)); + return adapt_read_df(v.batch_read_with_join(stream_ids, version_queries, read_queries, read_options, std::move(_clauses), handler_data)); }, py::call_guard(), "Read a dataframe from the store") .def("batch_read_keys", diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index 203753b02d..3749f46cff 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -642,15 +642,25 @@ std::shared_ptr>>> schedule_firs folly::Future> schedule_remaining_iterations( folly::Future>>&& entity_ids_vec_fut, - std::shared_ptr>> clauses + std::shared_ptr>> clauses, + const bool from_join ) { - const auto scheduling_iterations = num_scheduling_iterations(*clauses); + auto scheduling_iterations = num_scheduling_iterations(*clauses); + if (from_join) { + ++scheduling_iterations; + } for (auto i = 1UL; i < scheduling_iterations; ++i) { - entity_ids_vec_fut = std::move(entity_ids_vec_fut).thenValue([clauses, scheduling_iterations, i] (std::vector>&& entity_id_vectors) { + entity_ids_vec_fut = std::move(entity_ids_vec_fut).thenValue([clauses, scheduling_iterations, i, from_join] (std::vector>&& entity_id_vectors) { ARCTICDB_RUNTIME_DEBUG(log::memory(), "Scheduling iteration {} of {}", i, scheduling_iterations); util::check(!clauses->empty(), "Scheduling iteration {} has no clauses to process", scheduling_iterations); - remove_processed_clauses(*clauses); + if (i == 1) { + if (!from_join) { + remove_processed_clauses(*clauses); + } + } else { + remove_processed_clauses(*clauses); + } auto next_units_of_work = clauses->front()->structure_for_processing(std::move(entity_id_vectors)); std::vector>> work_futures; diff --git a/cpp/arcticdb/version/version_core.hpp b/cpp/arcticdb/version/version_core.hpp index 8d5aaec183..408d41acb7 100644 --- a/cpp/arcticdb/version/version_core.hpp +++ b/cpp/arcticdb/version/version_core.hpp @@ -144,6 +144,11 @@ folly::Future read_multi_key( const SegmentInMemory& index_key_seg, std::any& handler_data); +folly::Future> schedule_remaining_iterations( + folly::Future>>&& entity_ids_vec_fut, + std::shared_ptr>> clauses, + const bool from_join = false); + folly::Future> schedule_clause_processing( 
std::shared_ptr component_manager, std::vector>&& segment_and_slice_futures, diff --git a/cpp/arcticdb/version/version_store_api.cpp b/cpp/arcticdb/version/version_store_api.cpp index 2be57afaf0..ddb299b81e 100644 --- a/cpp/arcticdb/version/version_store_api.cpp +++ b/cpp/arcticdb/version/version_store_api.cpp @@ -792,10 +792,9 @@ ReadResult PythonVersionStore::batch_read_with_join( const std::vector& version_queries, std::vector>& read_queries, const ReadOptions& read_options, - const std::string& join, // TODO: Make a Clause or MultiSymbolClause - std::vector>&& post_join_clauses, + std::vector>&& clauses, std::any& handler_data) { - auto versions_and_frame = batch_read_with_join_internal(stream_ids, version_queries, read_queries, read_options, join, std::move(post_join_clauses), handler_data); + auto versions_and_frame = batch_read_with_join_internal(stream_ids, version_queries, read_queries, read_options, std::move(clauses), handler_data); // TODO: Decide what to do with VersionedItem part of read result return create_python_read_result({}, std::move(versions_and_frame.frame_and_descriptor_)); } diff --git a/cpp/arcticdb/version/version_store_api.hpp b/cpp/arcticdb/version/version_store_api.hpp index 2a834f6eda..e24f38113a 100644 --- a/cpp/arcticdb/version/version_store_api.hpp +++ b/cpp/arcticdb/version/version_store_api.hpp @@ -305,8 +305,7 @@ class PythonVersionStore : public LocalVersionedEngine { const std::vector& version_queries, std::vector>& read_queries, const ReadOptions& read_options, - const std::string& join, // TODO: Make a Clause or MultiSymbolClause - std::vector>&& post_join_clauses, + std::vector>&& clauses, std::any& handler_data); std::vector, DataError>> batch_read_metadata( diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py index a2e79ccf7f..f8bca21629 100644 --- a/python/arcticdb/version_store/library.py +++ b/python/arcticdb/version_store/library.py @@ -22,7 +22,7 @@ from arcticdb.version_store.processing import ExpressionNode, QueryBuilder from arcticdb.version_store._store import NativeVersionStore, VersionedItem, VersionQueryInput from arcticdb_ext.exceptions import ArcticException -from arcticdb_ext.version_store import DataError +from arcticdb_ext.version_store import DataError, ConcatClause as _ConcatClause import pandas as pd import numpy as np import logging @@ -497,7 +497,7 @@ def collect(self) -> VersionedItem: def __str__(self) -> str: query_builder_repr = super().__str__() - return f"LazyDataFrameAfterJoin(Concat({self._lazy_dataframes._lazy_dataframes}){' | ' if len(query_builder_repr) else ''}{query_builder_repr})" + return f"LazyDataFrameAfterJoin({self._join}({self._lazy_dataframes._lazy_dataframes}){' | ' if len(query_builder_repr) else ''}{query_builder_repr})" def __repr__(self) -> str: return self.__str__() @@ -506,7 +506,7 @@ def __repr__(self) -> str: def concat(lazy_dataframes: Union[List[LazyDataFrame], LazyDataFrameCollection]) -> LazyDataFrameAfterJoin: if not isinstance(lazy_dataframes, LazyDataFrameCollection): lazy_dataframes = LazyDataFrameCollection(lazy_dataframes) - return LazyDataFrameAfterJoin(lazy_dataframes, "concat") + return LazyDataFrameAfterJoin(lazy_dataframes, _ConcatClause()) def col(name: str) -> ExpressionNode: diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index d2888e48ca..e87da49d33 100644 --- 
a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -18,23 +18,23 @@ def test_symbol_concat_basic(lmdb_library): lib = lmdb_library - df_1 = pd.DataFrame({"col": np.arange(3, dtype=np.int64)}, index=pd.date_range("2025-01-01", periods=3)) - df_2 = pd.DataFrame({"col": np.arange(4, dtype=np.int64)}, index=pd.date_range("2025-01-03", periods=4)) - df_3 = pd.DataFrame({"col": np.arange(5, dtype=np.int64)}, index=pd.date_range("2025-03-07", periods=5)) + df_1 = pd.DataFrame({"col": np.arange(3, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3)) + df_2 = pd.DataFrame({"col": np.arange(4, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4)) + df_3 = pd.DataFrame({"col": np.arange(5, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5)) lib.write("sym1", df_1) lib.write("sym2", df_2) lib.write("sym3", df_3) lazy_df_1 = lib.read("sym1", lazy=True) lazy_df_2 = lib.read("sym2", lazy=True) - lazy_df_2 = lazy_df_2.date_range((pd.Timestamp("2025-01-04"), None)) + lazy_df_2 = lazy_df_2.date_range((pd.Timestamp(pd.Timestamp(3000)), None)) lazy_df_3 = lib.read("sym3", lazy=True) lazy_df = concat([lazy_df_1, lazy_df_2, lazy_df_3]) - lazy_df.resample("2D").agg({"col": "sum"}) + lazy_df.resample("2000ns").agg({"col": "sum"}) received = lazy_df.collect().data - expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2D").agg({"col": "sum"}) + expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2000ns").agg({"col": "sum"}) assert_frame_equal(expected, received) From 727cd34b0cd249de63b0078dba3772aa38e303d3 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Fri, 24 Jan 2025 11:13:03 +0000 Subject: [PATCH 14/36] Fix ConcatClause::structure_for_processing --- cpp/arcticdb/processing/clause.cpp | 30 ++++++++++------------- cpp/arcticdb/processing/query_planner.hpp | 3 ++- cpp/arcticdb/version/python_bindings.cpp | 10 +------- cpp/arcticdb/version/version_core.cpp | 3 --- 4 files changed, 16 insertions(+), 30 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index d0a9a282da..ffdf619721 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -1228,35 +1228,31 @@ std::vector> ConcatClause::structure_for_processing(std::v } auto [segments, old_row_ranges, col_ranges] = component_manager_->get_entities, std::shared_ptr, std::shared_ptr>(entity_ids, false); - // Map from old row ranges to new ones - std::map row_range_mapping; - for (const auto& row_range: old_row_ranges) { - // Value is same as key initially - row_range_mapping.insert({*row_range, *row_range}); - } + // Similar logic to RowRangeClause::structure_for_processing but as input row ranges come from multiple symbols it is slightly different bool first_range{true}; size_t prev_range_end{0}; - for (auto& [old_range, new_range]: row_range_mapping) { + std::vector> new_row_ranges; + new_row_ranges.reserve(old_row_ranges.size()); + // TODO: Handle column-slicing + for (const auto& old_range: old_row_ranges) { + auto new_range = std::make_shared(); if (first_range) { // Make the first row-range start from zero - new_range.first = 0; - new_range.second = old_range.diff(); + new_range->first = 0; + new_range->second = old_range->diff(); first_range = false; } else { - new_range.first = prev_range_end; - new_range.second = new_range.first + old_range.diff(); + new_range->first = 
prev_range_end;
+            new_range->second = new_range->first + old_range->diff();
+        }
+        prev_range_end = new_range->second;
+        new_row_ranges.emplace_back(std::move(new_range));
+    }
-    std::vector<std::shared_ptr<RowRange>> new_row_ranges;
-    new_row_ranges.reserve(old_row_ranges.size());
     std::vector<RangesAndEntity> ranges_and_entities;
     ranges_and_entities.reserve(entity_ids.size());
     for (size_t idx=0; idx<entity_ids.size(); ++idx) {
-        auto new_row_range = std::make_shared<RowRange>(row_range_mapping.at(*old_row_ranges[idx]));
-        ranges_and_entities.emplace_back(entity_ids[idx], new_row_range, col_ranges[idx]);
-        new_row_ranges.emplace_back(std::move(new_row_range));
+        ranges_and_entities.emplace_back(entity_ids[idx], new_row_ranges[idx], col_ranges[idx]);
     }
     component_manager_->replace_entities<std::shared_ptr<RowRange>>(entity_ids, new_row_ranges);
diff --git a/cpp/arcticdb/processing/query_planner.hpp b/cpp/arcticdb/processing/query_planner.hpp
index 99106654e1..b16b79fb40 100644
--- a/cpp/arcticdb/processing/query_planner.hpp
+++ b/cpp/arcticdb/processing/query_planner.hpp
@@ -22,7 +22,8 @@ using ClauseVariant = std::variant<std::shared_ptr<FilterClause>,
                                    std::shared_ptr<ResampleClause<ResampleBoundary::LEFT>>,
                                    std::shared_ptr<ResampleClause<ResampleBoundary::RIGHT>>,
                                    std::shared_ptr<RowRangeClause>,
-                                   std::shared_ptr<DateRangeClause>>;
+                                   std::shared_ptr<DateRangeClause>,
+                                   std::shared_ptr<ConcatClause>>;
 
 std::vector<ClauseVariant> plan_query(std::vector<ClauseVariant>&& clauses);
diff --git a/cpp/arcticdb/version/python_bindings.cpp b/cpp/arcticdb/version/python_bindings.cpp
index e2d99bea7d..c191f84618 100644
--- a/cpp/arcticdb/version/python_bindings.cpp
+++ b/cpp/arcticdb/version/python_bindings.cpp
@@ -414,15 +414,7 @@ void register_bindings(py::module &version, py::exception
-            [](ReadQuery& self, std::vector<std::variant<std::shared_ptr<FilterClause>,
-                                                         std::shared_ptr<ProjectClause>,
-                                                         std::shared_ptr<GroupByClause>,
-                                                         std::shared_ptr<AggregationClause>,
-                                                         std::shared_ptr<ResampleClause<ResampleBoundary::LEFT>>,
-                                                         std::shared_ptr<ResampleClause<ResampleBoundary::RIGHT>>,
-                                                         std::shared_ptr<RowRangeClause>,
-                                                         std::shared_ptr<DateRangeClause>>> clauses) {
+            [](ReadQuery& self, std::vector<ClauseVariant> clauses) {
                 clauses = plan_query(std::move(clauses));
                 std::vector<std::shared_ptr<Clause>> _clauses;
                 self.needs_post_processing = false;
diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp
index 3749f46cff..61adb4a71b 100644
--- a/cpp/arcticdb/version/version_core.cpp
+++ b/cpp/arcticdb/version/version_core.cpp
@@ -646,9 +646,6 @@ folly::Future<std::vector<EntityId>> schedule_remaining_iterations(
         const bool from_join
     ) {
     auto scheduling_iterations = num_scheduling_iterations(*clauses);
-    if (from_join) {
-        ++scheduling_iterations;
-    }
     for (auto i = 1UL; i < scheduling_iterations; ++i) {
         entity_ids_vec_fut = std::move(entity_ids_vec_fut).thenValue([clauses, scheduling_iterations, i, from_join] (std::vector<std::vector<EntityId>>&& entity_id_vectors) {
             ARCTICDB_RUNTIME_DEBUG(log::memory(), "Scheduling iteration {} of {}", i, scheduling_iterations);

From 139f3af3c57c3c884e47c0102738a208ea280f9c Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Fri, 24 Jan 2025 11:49:26 +0000
Subject: [PATCH 15/36] First end-to-end test passing!
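
Restoring the from_join bump to the iteration count is what makes the first
end-to-end test pass: entities handed over by batch_read_with_join_internal
have not yet been structured by the clause at the front of the pipeline, so
schedule_remaining_iterations needs one more restructuring pass than a plain
read does. With that in place, ConcatClause renumbers each symbol's row
ranges into a single contiguous, zero-based frame. A self-contained sketch of
that renumbering in plain Python (illustrative only, not the ArcticDB API; it
ignores column slicing, which the C++ handles by mapping every entity sharing
a row range to the same new range):

    def renumber_row_ranges(per_symbol_ranges):
        """per_symbol_ranges holds one list of (start, end) row ranges per symbol.

        Returns the ranges shifted so the symbols stack into one contiguous
        zero-based frame, preserving order within and across symbols.
        """
        renumbered = []
        offset = 0
        for ranges in per_symbol_ranges:
            for start, end in ranges:
                length = end - start
                renumbered.append((offset, offset + length))
                offset += length
        return renumbered

    # Three symbols of 3, 4 and 5 rows, already split into row slices:
    print(renumber_row_ranges([
        [(0, 2), (2, 3)],
        [(0, 2), (2, 4)],
        [(0, 2), (2, 4), (4, 5)],
    ]))
    # [(0, 2), (2, 3), (3, 5), (5, 7), (7, 9), (9, 11), (11, 12)]
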
--- cpp/arcticdb/version/version_core.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index 61adb4a71b..3749f46cff 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -646,6 +646,9 @@ folly::Future> schedule_remaining_iterations( const bool from_join ) { auto scheduling_iterations = num_scheduling_iterations(*clauses); + if (from_join) { + ++scheduling_iterations; + } for (auto i = 1UL; i < scheduling_iterations; ++i) { entity_ids_vec_fut = std::move(entity_ids_vec_fut).thenValue([clauses, scheduling_iterations, i, from_join] (std::vector>&& entity_id_vectors) { ARCTICDB_RUNTIME_DEBUG(log::memory(), "Scheduling iteration {} of {}", i, scheduling_iterations); From 548e5b763690482eefe11cc1de7f28519246e82c Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 10:08:34 +0000 Subject: [PATCH 16/36] Removed TODOs --- cpp/arcticdb/processing/clause.cpp | 1 - .../version/local_versioned_engine.cpp | 24 ------------------- cpp/arcticdb/version/python_bindings.cpp | 2 -- cpp/arcticdb/version/version_store_api.cpp | 1 - 4 files changed, 28 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index ffdf619721..2a8c919a32 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -1233,7 +1233,6 @@ std::vector> ConcatClause::structure_for_processing(std::v size_t prev_range_end{0}; std::vector> new_row_ranges; new_row_ranges.reserve(old_row_ranges.size()); - // TODO: Handle column-slicing for (const auto& old_range: old_row_ranges) { auto new_range = std::make_shared(); if (first_range) { diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index 8c4a3e9c93..01e7a96a1f 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1196,7 +1196,6 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( return schedule_remaining_iterations(std::move(entity_ids_vec_fut), clauses_ptr, true) .thenValueInline([component_manager](std::vector&& processed_entity_ids) { auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(processed_entity_ids)); - // TODO: Handle output descriptors return collect_segments(std::move(proc)); }).thenValueInline([store=store(), &handler_data, pipeline_context](std::vector&& slice_and_keys) { internal::check(!slice_and_keys.empty(), "No slice and keys in batch_read_with_join_internal"); @@ -1212,29 +1211,6 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( shared_data.buffers()}}; }); }).get(); - -// return folly::collect(symbol_entities_futs) -// .via(&async::cpu_executor()) -// .thenValue([](std::vector>&& entity_id_vectors) { -// return flatten_entities(std::move(entity_id_vectors)); -// }).thenValueInline([component_manager](std::vector&& processed_entity_ids) { -// auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(processed_entity_ids)); -// // TODO: Handle output descriptors -// return collect_segments(std::move(proc)); -// }).thenValueInline([store=store(), &handler_data, pipeline_context](std::vector&& slice_and_keys) { -// internal::check(!slice_and_keys.empty(), "No slice and keys in batch_read_with_join_internal"); -// pipeline_context->set_descriptor(slice_and_keys[0].segment(store).descriptor()); -// return 
prepare_output_frame(std::move(slice_and_keys), pipeline_context, store, ReadOptions{}, handler_data); -// }).thenValueInline([&handler_data, pipeline_context, shared_data](SegmentInMemory&& frame) mutable { -// return reduce_and_fix_columns(pipeline_context, frame, ReadOptions{}, handler_data) -// .thenValue([pipeline_context, frame, shared_data](auto&&) mutable { -// return MultiSymbolReadOutput{{}, -// {frame, -// timeseries_descriptor_from_pipeline_context(pipeline_context, {}, pipeline_context->bucketize_dynamic_), -// {}, -// shared_data.buffers()}}; -// }); -// }).get(); } void LocalVersionedEngine::write_version_and_prune_previous( diff --git a/cpp/arcticdb/version/python_bindings.cpp b/cpp/arcticdb/version/python_bindings.cpp index c191f84618..1db69e0762 100644 --- a/cpp/arcticdb/version/python_bindings.cpp +++ b/cpp/arcticdb/version/python_bindings.cpp @@ -773,10 +773,8 @@ void register_bindings(py::module &version, py::exception post_join_clauses ){ auto handler_data = TypeHandlerRegistry::instance()->get_handler_data(); - // TODO: Remove duplication with ReadQuery interface post_join_clauses = plan_query(std::move(post_join_clauses)); std::vector> _clauses; - // TODO: Verify at some point that this is a multi-symbol clause util::variant_match( join, [&](auto&& clause) {_clauses.emplace_back(std::make_shared(*clause));} diff --git a/cpp/arcticdb/version/version_store_api.cpp b/cpp/arcticdb/version/version_store_api.cpp index ddb299b81e..917d259526 100644 --- a/cpp/arcticdb/version/version_store_api.cpp +++ b/cpp/arcticdb/version/version_store_api.cpp @@ -795,7 +795,6 @@ ReadResult PythonVersionStore::batch_read_with_join( std::vector>&& clauses, std::any& handler_data) { auto versions_and_frame = batch_read_with_join_internal(stream_ids, version_queries, read_queries, read_options, std::move(clauses), handler_data); - // TODO: Decide what to do with VersionedItem part of read result return create_python_read_result({}, std::move(versions_and_frame.frame_and_descriptor_)); } From f466fe227ad4c117358152d3942e75025dcdf267 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 10:23:47 +0000 Subject: [PATCH 17/36] Verify that clause for join is multi-symbol --- cpp/arcticdb/processing/clause.cpp | 1 + cpp/arcticdb/processing/clause_utils.hpp | 2 ++ cpp/arcticdb/version/python_bindings.cpp | 7 ++++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index 2a8c919a32..e1b083146f 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -1219,6 +1219,7 @@ std::string DateRangeClause::to_string() const { ConcatClause::ConcatClause() { clause_info_.input_structure_ = ProcessingStructure::MULTI_SYMBOL; + clause_info_.multi_symbol_ = true; } std::vector> ConcatClause::structure_for_processing(std::vector>&& entity_ids_vec) { diff --git a/cpp/arcticdb/processing/clause_utils.hpp b/cpp/arcticdb/processing/clause_utils.hpp index a7c9446b66..84951730f7 100644 --- a/cpp/arcticdb/processing/clause_utils.hpp +++ b/cpp/arcticdb/processing/clause_utils.hpp @@ -53,6 +53,8 @@ struct ClauseInfo { std::variant index_{KeepCurrentIndex()}; // Whether this clause modifies the output descriptor bool modifies_output_descriptor_{false}; + // Whether this clause operates on one or multiple symbols + bool multi_symbol_{false}; }; // Changes how the clause behaves based on information only available after it is constructed diff --git a/cpp/arcticdb/version/python_bindings.cpp 
b/cpp/arcticdb/version/python_bindings.cpp index 1db69e0762..f097678504 100644 --- a/cpp/arcticdb/version/python_bindings.cpp +++ b/cpp/arcticdb/version/python_bindings.cpp @@ -777,7 +777,12 @@ void register_bindings(py::module &version, py::exception> _clauses; util::variant_match( join, - [&](auto&& clause) {_clauses.emplace_back(std::make_shared(*clause));} + [&](auto&& clause) { + user_input::check( + clause->clause_info().multi_symbol_, + "Single-symbol clause cannot be used to join multiple symbols together"); + _clauses.emplace_back(std::make_shared(*clause)); + } ); for (auto&& clause: post_join_clauses) { util::variant_match( From 096ff095b8e868b0dbe5c90cb2ffbb1d580b953f Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 10:40:38 +0000 Subject: [PATCH 18/36] Added failing test with column slicing --- python/tests/conftest.py | 7 +++ .../test_symbol_concatenation.py | 50 +++++++++++++++++-- 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/python/tests/conftest.py b/python/tests/conftest.py index c9e3f4a8d5..a198dbaf9d 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -136,6 +136,13 @@ def lmdb_library_static_dynamic(request): yield request.getfixturevalue(request.param) +@pytest.fixture +def lmdb_library_factory(lmdb_storage, lib_name): + def f(library_options: LibraryOptions = LibraryOptions()): + return lmdb_storage.create_arctic().create_library(lib_name, library_options=library_options) + return f + + # ssl is enabled by default to maximize test coverage as ssl is enabled most of the times in real world @pytest.fixture(scope="session") def s3_storage_factory() -> Generator[MotoS3StorageFixtureFactory, None, None]: diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index e87da49d33..68284b8aaa 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -10,14 +10,14 @@ import pytest from arcticdb import col, concat, LazyDataFrame, LazyDataFrameCollection, QueryBuilder, ReadRequest +from arcticdb.options import LibraryOptions from arcticdb.util.test import assert_frame_equal - pytestmark = pytest.mark.pipeline -def test_symbol_concat_basic(lmdb_library): - lib = lmdb_library +def test_symbol_concat_basic(lmdb_library_factory): + lib = lmdb_library_factory() df_1 = pd.DataFrame({"col": np.arange(3, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3)) df_2 = pd.DataFrame({"col": np.arange(4, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4)) df_3 = pd.DataFrame({"col": np.arange(5, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5)) @@ -38,3 +38,47 @@ def test_symbol_concat_basic(lmdb_library): expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2000ns").agg({"col": "sum"}) assert_frame_equal(expected, received) + +def test_symbol_concat_column_sliced(lmdb_library_factory): + lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) + df_1 = pd.DataFrame( + { + "col1": np.arange(3, dtype=np.int64), + "col2": np.arange(10, 13, dtype=np.int64), + "col3": np.arange(110, 113, dtype=np.int64), + }, + index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3) + ) + df_2 = pd.DataFrame( + { + "col1": np.arange(4, dtype=np.int64), + "col2": np.arange(20, 24, dtype=np.int64), + "col3": np.arange(210, 214, dtype=np.int64), + 
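+            # With columns_per_segment=2, a third data column forces each symbol into two column slices.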
}, + index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4) + ) + df_3 = pd.DataFrame( + { + "col1": np.arange(5, dtype=np.int64), + "col2": np.arange(30, 35, dtype=np.int64), + "col3": np.arange(310, 315, dtype=np.int64), + }, + index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5) + ) + lib.write("sym1", df_1) + lib.write("sym2", df_2) + lib.write("sym3", df_3) + + lazy_df_1 = lib.read("sym1", lazy=True) + lazy_df_2 = lib.read("sym2", lazy=True) + lazy_df_2 = lazy_df_2.date_range((pd.Timestamp(pd.Timestamp(3000)), None)) + lazy_df_3 = lib.read("sym3", lazy=True) + + lazy_df = concat([lazy_df_1, lazy_df_2, lazy_df_3]) + + lazy_df.resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) + + received = lazy_df.collect().data + received = received.reindex(columns=sorted(received.columns)) + expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) + assert_frame_equal(expected, received) From 4b4cad63ef7154e363c0da102b233b924adefeb4 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 10:48:18 +0000 Subject: [PATCH 19/36] Simplify column slicing test --- .../test_symbol_concatenation.py | 25 ++++--------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index 68284b8aaa..0142728996 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -40,7 +40,8 @@ def test_symbol_concat_basic(lmdb_library_factory): def test_symbol_concat_column_sliced(lmdb_library_factory): - lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) + # lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) + lib = lmdb_library_factory() df_1 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), @@ -55,30 +56,14 @@ def test_symbol_concat_column_sliced(lmdb_library_factory): "col2": np.arange(20, 24, dtype=np.int64), "col3": np.arange(210, 214, dtype=np.int64), }, - index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4) - ) - df_3 = pd.DataFrame( - { - "col1": np.arange(5, dtype=np.int64), - "col2": np.arange(30, 35, dtype=np.int64), - "col3": np.arange(310, 315, dtype=np.int64), - }, - index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5) + index=pd.date_range(pd.Timestamp(3000), freq="1000ns", periods=4) ) lib.write("sym1", df_1) lib.write("sym2", df_2) - lib.write("sym3", df_3) - - lazy_df_1 = lib.read("sym1", lazy=True) - lazy_df_2 = lib.read("sym2", lazy=True) - lazy_df_2 = lazy_df_2.date_range((pd.Timestamp(pd.Timestamp(3000)), None)) - lazy_df_3 = lib.read("sym3", lazy=True) - - lazy_df = concat([lazy_df_1, lazy_df_2, lazy_df_3]) + lazy_df = concat(lib.read_batch(["sym1", "sym2"], lazy=True)) lazy_df.resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) - received = lazy_df.collect().data received = received.reindex(columns=sorted(received.columns)) - expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) + expected = pd.concat([df_1, df_2]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) assert_frame_equal(expected, received) From d8de667ca408d7fe633af4d7cdd67809a01d7e35 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 10:48:45 +0000 Subject: [PATCH 20/36] Remove unneeded 
segments in ConcatClause::structure_for_processing --- cpp/arcticdb/processing/clause.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index e1b083146f..9cae430190 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -1227,7 +1227,7 @@ std::vector> ConcatClause::structure_for_processing(std::v if (entity_ids.empty()) { return {}; } - auto [segments, old_row_ranges, col_ranges] = component_manager_->get_entities, std::shared_ptr, std::shared_ptr>(entity_ids, false); + auto [old_row_ranges, col_ranges] = component_manager_->get_entities, std::shared_ptr>(entity_ids, false); // Similar logic to RowRangeClause::structure_for_processing but as input row ranges come from multiple symbols it is slightly different bool first_range{true}; From 7718bec059cf23df001689dcefbb9a6a44448878 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 10:58:13 +0000 Subject: [PATCH 21/36] Column slicing test passing --- cpp/arcticdb/processing/clause.cpp | 95 +++++++++++++------ .../test_symbol_concatenation.py | 4 +- 2 files changed, 68 insertions(+), 31 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index 9cae430190..6a2ed5c03e 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -1223,42 +1223,79 @@ ConcatClause::ConcatClause() { } std::vector> ConcatClause::structure_for_processing(std::vector>&& entity_ids_vec) { - auto entity_ids = flatten_entities(std::move(entity_ids_vec)); - if (entity_ids.empty()) { - return {}; - } - auto [old_row_ranges, col_ranges] = component_manager_->get_entities, std::shared_ptr>(entity_ids, false); - // Similar logic to RowRangeClause::structure_for_processing but as input row ranges come from multiple symbols it is slightly different + std::vector ranges_and_entities; bool first_range{true}; size_t prev_range_end{0}; - std::vector> new_row_ranges; - new_row_ranges.reserve(old_row_ranges.size()); - for (const auto& old_range: old_row_ranges) { - auto new_range = std::make_shared(); - if (first_range) { - // Make the first row-range start from zero - new_range->first = 0; - new_range->second = old_range->diff(); - first_range = false; - } else { - new_range->first = prev_range_end; - new_range->second = new_range->first + old_range->diff(); + for (const auto& entity_ids: entity_ids_vec) { + auto [old_row_ranges, col_ranges] = component_manager_->get_entities, std::shared_ptr>(entity_ids, false); + // Map from old row ranges WITHIN THIS SYMBOL to new ones + std::map row_range_mapping; + for (const auto& row_range: old_row_ranges) { + // Value is same as key initially + row_range_mapping.insert({*row_range, *row_range}); } - prev_range_end = new_range->second; - new_row_ranges.emplace_back(std::move(new_range)); - } - - std::vector ranges_and_entities; - ranges_and_entities.reserve(entity_ids.size()); - for (size_t idx=0; idx> new_row_ranges; + new_row_ranges.reserve(old_row_ranges.size()); + for (size_t idx=0; idx(row_range_mapping.at(*old_row_ranges[idx])); + ranges_and_entities.emplace_back(entity_ids[idx], new_row_range, col_ranges[idx]); + new_row_ranges.emplace_back(std::move(new_row_range)); + } + component_manager_->replace_entities>(entity_ids, new_row_ranges); } - - component_manager_->replace_entities>(entity_ids, new_row_ranges); - auto new_structure_offsets = structure_by_row_slice(ranges_and_entities); return 
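// Regroup the renumbered entities by their new row ranges, so each downstream
// unit of work covers exactly one row slice of the concatenated frame: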
offsets_to_entity_ids(new_structure_offsets, ranges_and_entities); + + +// auto entity_ids = flatten_entities(std::move(entity_ids_vec)); +// if (entity_ids.empty()) { +// return {}; +// } +// auto [old_row_ranges, col_ranges] = component_manager_->get_entities, std::shared_ptr>(entity_ids, false); +// +// +// bool first_range{true}; +// size_t prev_range_end{0}; +// std::vector> new_row_ranges; +// new_row_ranges.reserve(old_row_ranges.size()); +// for (const auto& old_range: old_row_ranges) { +// auto new_range = std::make_shared(); +// if (first_range) { +// // Make the first row-range start from zero +// new_range->first = 0; +// new_range->second = old_range->diff(); +// first_range = false; +// } else { +// new_range->first = prev_range_end; +// new_range->second = new_range->first + old_range->diff(); +// } +// prev_range_end = new_range->second; +// new_row_ranges.emplace_back(std::move(new_range)); +// } +// +// std::vector ranges_and_entities; +// ranges_and_entities.reserve(entity_ids.size()); +// for (size_t idx=0; idxreplace_entities>(entity_ids, new_row_ranges); +// +// auto new_structure_offsets = structure_by_row_slice(ranges_and_entities); +// return offsets_to_entity_ids(new_structure_offsets, ranges_and_entities); } std::vector ConcatClause::process(std::vector&& entity_ids) const { diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index 0142728996..6a4defccbd 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -40,8 +40,8 @@ def test_symbol_concat_basic(lmdb_library_factory): def test_symbol_concat_column_sliced(lmdb_library_factory): - # lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) - lib = lmdb_library_factory() + lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) + # lib = lmdb_library_factory() df_1 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), From f089c7c9a7f2601608a2c09a67dddae5fa7c8cd9 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 11:08:24 +0000 Subject: [PATCH 22/36] Refactor to only call ComponentManager::replace_entities once --- cpp/arcticdb/processing/clause.cpp | 44 ++---------------------------- 1 file changed, 3 insertions(+), 41 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index 6a2ed5c03e..c4344a3704 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -1225,6 +1225,7 @@ ConcatClause::ConcatClause() { std::vector> ConcatClause::structure_for_processing(std::vector>&& entity_ids_vec) { // Similar logic to RowRangeClause::structure_for_processing but as input row ranges come from multiple symbols it is slightly different std::vector ranges_and_entities; + std::vector> new_row_ranges; bool first_range{true}; size_t prev_range_end{0}; for (const auto& entity_ids: entity_ids_vec) { @@ -1247,55 +1248,16 @@ std::vector> ConcatClause::structure_for_processing(std::v } prev_range_end = new_range.second; } - std::vector> new_row_ranges; - new_row_ranges.reserve(old_row_ranges.size()); + for (size_t idx=0; idx(row_range_mapping.at(*old_row_ranges[idx])); ranges_and_entities.emplace_back(entity_ids[idx], new_row_range, col_ranges[idx]); new_row_ranges.emplace_back(std::move(new_row_range)); } - component_manager_->replace_entities>(entity_ids, new_row_ranges); } + 
component_manager_->replace_entities>(flatten_entities(std::move(entity_ids_vec)), new_row_ranges); auto new_structure_offsets = structure_by_row_slice(ranges_and_entities); return offsets_to_entity_ids(new_structure_offsets, ranges_and_entities); - - -// auto entity_ids = flatten_entities(std::move(entity_ids_vec)); -// if (entity_ids.empty()) { -// return {}; -// } -// auto [old_row_ranges, col_ranges] = component_manager_->get_entities, std::shared_ptr>(entity_ids, false); -// -// -// bool first_range{true}; -// size_t prev_range_end{0}; -// std::vector> new_row_ranges; -// new_row_ranges.reserve(old_row_ranges.size()); -// for (const auto& old_range: old_row_ranges) { -// auto new_range = std::make_shared(); -// if (first_range) { -// // Make the first row-range start from zero -// new_range->first = 0; -// new_range->second = old_range->diff(); -// first_range = false; -// } else { -// new_range->first = prev_range_end; -// new_range->second = new_range->first + old_range->diff(); -// } -// prev_range_end = new_range->second; -// new_row_ranges.emplace_back(std::move(new_range)); -// } -// -// std::vector ranges_and_entities; -// ranges_and_entities.reserve(entity_ids.size()); -// for (size_t idx=0; idxreplace_entities>(entity_ids, new_row_ranges); -// -// auto new_structure_offsets = structure_by_row_slice(ranges_and_entities); -// return offsets_to_entity_ids(new_structure_offsets, ranges_and_entities); } std::vector ConcatClause::process(std::vector&& entity_ids) const { From fb46e0356a14ae0635e2afbc47dd5dc904cf4c24 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 11:11:13 +0000 Subject: [PATCH 23/36] Test with row slicing --- .../test_symbol_concatenation.py | 34 +++---------------- 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index 6a4defccbd..87773317eb 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -16,8 +16,10 @@ pytestmark = pytest.mark.pipeline -def test_symbol_concat_basic(lmdb_library_factory): - lib = lmdb_library_factory() +@pytest.mark.parametrize("rows_per_segment", [2, 100_000]) +@pytest.mark.parametrize("columns_per_segment", [2, 100_000]) +def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per_segment): + lib = lmdb_library_factory(LibraryOptions(rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) df_1 = pd.DataFrame({"col": np.arange(3, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3)) df_2 = pd.DataFrame({"col": np.arange(4, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4)) df_3 = pd.DataFrame({"col": np.arange(5, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5)) @@ -39,31 +41,3 @@ def test_symbol_concat_basic(lmdb_library_factory): assert_frame_equal(expected, received) -def test_symbol_concat_column_sliced(lmdb_library_factory): - lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) - # lib = lmdb_library_factory() - df_1 = pd.DataFrame( - { - "col1": np.arange(3, dtype=np.int64), - "col2": np.arange(10, 13, dtype=np.int64), - "col3": np.arange(110, 113, dtype=np.int64), - }, - index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3) - ) - df_2 = pd.DataFrame( - { - "col1": np.arange(4, dtype=np.int64), - 
"col2": np.arange(20, 24, dtype=np.int64), - "col3": np.arange(210, 214, dtype=np.int64), - }, - index=pd.date_range(pd.Timestamp(3000), freq="1000ns", periods=4) - ) - lib.write("sym1", df_1) - lib.write("sym2", df_2) - - lazy_df = concat(lib.read_batch(["sym1", "sym2"], lazy=True)) - lazy_df.resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) - received = lazy_df.collect().data - received = received.reindex(columns=sorted(received.columns)) - expected = pd.concat([df_1, df_2]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) - assert_frame_equal(expected, received) From 70a67fa648ea14e2661211c55f6d57376aaed997 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 11:17:55 +0000 Subject: [PATCH 24/36] Use folly::enumerate over index-based loop --- cpp/arcticdb/version/local_versioned_engine.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index 01e7a96a1f..7431768606 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1166,15 +1166,14 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( auto pipeline_context = std::make_shared(); auto norm_meta_mtx = std::make_shared(); DecodePathData shared_data; - for (auto idx = 0UL; idx < opt_index_key_futs.size(); ++idx) { + for (auto&& [idx, opt_index_key_fut]: folly::enumerate(opt_index_key_futs)) { symbol_entities_futs.emplace_back( - std::move(opt_index_key_futs[idx]).thenValue([store = store(), - idx, - read_query = read_queries.empty() ? std::make_shared(): read_queries[idx], - &read_options, - &component_manager, - pipeline_context, - norm_meta_mtx](auto&& opt_index_key) mutable { + std::move(opt_index_key_fut).thenValue([store = store(), + read_query = read_queries.empty() ? 
std::make_shared(): read_queries[idx], + &read_options, + &component_manager, + pipeline_context, + norm_meta_mtx](auto&& opt_index_key) mutable { std::variant version_info; internal::check(opt_index_key.has_value(), "batch_read_with_join_internal not supported with non-indexed data"); auto index_key_seg = store->read_sync(*opt_index_key).second; From 0fd27e34212ab7afd09dbcc24ea5001c83610381 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 17:43:17 +0000 Subject: [PATCH 25/36] Improve schema checking --- cpp/arcticdb/processing/clause.cpp | 3 +- .../version/local_versioned_engine.cpp | 83 ++++++++++++++++++- .../test_symbol_concatenation.py | 59 +++++++++++-- 3 files changed, 135 insertions(+), 10 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index c4344a3704..76fa60831c 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -647,6 +647,7 @@ std::vector ResampleClause::process(std::vectorat(0)->start(), row_slices.front().row_ranges_->at(0)->start() + output_index_column->row_count()); + ColRange output_col_range(1, aggregators_.size() + 1); seg.add_column(scalar_field(DataType::NANOSECONDS_UTC64, index_column_name), output_index_column); seg.descriptor().set_index(IndexDescriptorImpl(1, IndexDescriptor::Type::TIMESTAMP)); auto& string_pool = seg.string_pool(); @@ -675,7 +676,7 @@ std::vector ResampleClause::process(std::vectortype().data_type(), aggregator.get_output_column_name().value), aggregated_column); } seg.set_row_data(output_index_column->row_count() - 1); - return push_entities(*component_manager_, ProcessingUnit(std::move(seg), std::move(output_row_range))); + return push_entities(*component_manager_, ProcessingUnit(std::move(seg), std::move(output_row_range), std::move(output_col_range))); } template diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index 7431768606..13e188fc2c 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1151,6 +1151,84 @@ std::vector> LocalVersionedEngine::ba return read_versions_or_errors; } +StreamDescriptor descriptor_from_segments(std::vector& slice_and_keys) { + internal::check(!slice_and_keys.empty(), "No slice and keys in batch_read_with_join_internal"); + std::sort(std::begin(slice_and_keys), std::end(slice_and_keys), [] (const auto& left, const auto& right) { + return std::tie(left.slice_.row_range, left.slice_.col_range) < std::tie(right.slice_.row_range, right.slice_.col_range); + }); + // TODO: Make more memory efficient by only keeping first element in memory, and comparing others on the fly + // Track the field collections for each row-slice + std::vector field_collections; + RowRange current_row_range(std::numeric_limits::max(), std::numeric_limits::max()); + for (const auto& slice_and_key: slice_and_keys) { + const auto& desc = slice_and_key.segment_->descriptor(); + if (slice_and_key.slice().rows() != current_row_range) { + field_collections.emplace_back(); + for (size_t field_idx = 0; field_idx < desc.data().index().field_count(); ++field_idx) { + field_collections.back().add(desc.fields().ref_at(field_idx)); + } + current_row_range = slice_and_key.slice().rows(); + } + for (size_t field_idx = desc.data().index().field_count(); field_idx < desc.field_count(); ++field_idx) { + field_collections.back().add(desc.fields().ref_at(field_idx)); + } + } + // Ensure they are all the same + // TODO: Relax this 
requirement + for (size_t idx = 1; idx < field_collections.size(); ++idx) { + schema::check( + field_collections[0] == field_collections[idx], + "Mismatching fields in multi-symbol join: {} != {}", + field_collections[0], + field_collections[idx] + ); + } + + // Set the output descriptor based on the first row slice + StreamDescriptor res = slice_and_keys[0].segment_->descriptor().clone(); + res.fields_ = std::make_shared(std::move(field_collections[0])); + +// auto index_field_count = res.data().index().field_count(); +// const auto& first_row_range = slice_and_keys[0].slice().rows(); +// for (size_t slice_idx = 1; slice_idx < slice_and_keys.size(); ++slice_idx) { +// const auto& slice_and_key = slice_and_keys[slice_idx]; +// if (slice_and_key.slice().rows() == first_row_range) { +// const auto& desc = slice_and_key.segment_->descriptor(); +// schema::check( +// desc.data().index().field_count() == index_field_count, +// "Mismatching index field count {} != {}", +// desc.data().index().field_count(), +// index_field_count); +// for (size_t field_idx = index_field_count; field_idx < desc.field_count(); ++field_idx) { +// res.add_field(desc.fields().ref_at(field_idx)); +// } +// } else { +// // For other row-slices, check that they match the first +// const auto& desc = slice_and_key.segment_->descriptor(); +// schema::check( +// desc.data().index().field_count() == index_field_count, +// "Mismatching index field count {} != {}", +// desc.data().index().field_count(), +// index_field_count); +// for (size_t field_idx = 0; field_idx < index_field_count; ++field_idx) { +// schema::check( +// desc.fields().ref_at(field_idx) == res.fields().ref_at(field_idx), +// "Mismatching index fields: {} != {}", +// desc.fields().ref_at(field_idx), +// res.fields().ref_at(field_idx)); +// } +// for (size_t field_idx = index_field_count; field_idx < desc.field_count(); ++field_idx) { +// schema::check( +// desc.fields().ref_at(field_idx) == res.fields().ref_at(field_idx + slice_and_key.slice().columns().first - index_field_count), +// "Mismatching fields: {} != {}", +// desc.fields().ref_at(field_idx), +// res.fields().ref_at(field_idx + slice_and_key.slice().columns().first - index_field_count)); +// } +// } +// } + return res; +} + MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( const std::vector& stream_ids, const std::vector& version_queries, @@ -1179,7 +1257,7 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( auto index_key_seg = store->read_sync(*opt_index_key).second; { std::lock_guard lock(*norm_meta_mtx); - pipeline_context->norm_meta_ =std::make_unique(std::move(*index_key_seg.mutable_index_descriptor().mutable_proto().mutable_normalization())); + pipeline_context->norm_meta_ = std::make_unique(std::move(*index_key_seg.mutable_index_descriptor().mutable_proto().mutable_normalization())); } version_info = VersionedItem(std::move(*opt_index_key)); return read_entity_ids_for_version(store, version_info, read_query, read_options, component_manager); @@ -1197,8 +1275,7 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(processed_entity_ids)); return collect_segments(std::move(proc)); }).thenValueInline([store=store(), &handler_data, pipeline_context](std::vector&& slice_and_keys) { - internal::check(!slice_and_keys.empty(), "No slice and keys in batch_read_with_join_internal"); - 
pipeline_context->set_descriptor(slice_and_keys[0].segment(store).descriptor()); + pipeline_context->set_descriptor(descriptor_from_segments(slice_and_keys)); return prepare_output_frame(std::move(slice_and_keys), pipeline_context, store, ReadOptions{}, handler_data); }).thenValueInline([&handler_data, pipeline_context, shared_data](SegmentInMemory&& frame) mutable { return reduce_and_fix_columns(pipeline_context, frame, ReadOptions{}, handler_data) diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index 87773317eb..6da0c5ac72 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -10,6 +10,7 @@ import pytest from arcticdb import col, concat, LazyDataFrame, LazyDataFrameCollection, QueryBuilder, ReadRequest +from arcticdb.exceptions import SchemaException from arcticdb.options import LibraryOptions from arcticdb.util.test import assert_frame_equal @@ -18,11 +19,32 @@ @pytest.mark.parametrize("rows_per_segment", [2, 100_000]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) -def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per_segment): +def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_per_segment): lib = lmdb_library_factory(LibraryOptions(rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) - df_1 = pd.DataFrame({"col": np.arange(3, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3)) - df_2 = pd.DataFrame({"col": np.arange(4, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4)) - df_3 = pd.DataFrame({"col": np.arange(5, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5)) + df_1 = pd.DataFrame( + { + "col1": np.arange(3, dtype=np.int64), + "col2": np.arange(100, 103, dtype=np.int64), + "col3": np.arange(1000, 1003, dtype=np.int64), + }, + index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3), + ) + df_2 = pd.DataFrame( + { + "col1": np.arange(4, dtype=np.int64), + "col2": np.arange(200, 204, dtype=np.int64), + "col3": np.arange(2000, 2004, dtype=np.int64), + }, + index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4), + ) + df_3 = pd.DataFrame( + { + "col1": np.arange(5, dtype=np.int64), + "col2": np.arange(300, 305, dtype=np.int64), + "col3": np.arange(3000, 3005, dtype=np.int64), + }, + index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5), + ) lib.write("sym1", df_1) lib.write("sym2", df_2) lib.write("sym3", df_3) @@ -34,10 +56,35 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per lazy_df = concat([lazy_df_1, lazy_df_2, lazy_df_3]) - lazy_df.resample("2000ns").agg({"col": "sum"}) + lazy_df.resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) received = lazy_df.collect().data - expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2000ns").agg({"col": "sum"}) + received = received.reindex(columns=sorted(received.columns)) + expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) assert_frame_equal(expected, received) +def test_symbol_concat_symbols_with_different_columns(lmdb_library_factory): + lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) + df_1 = pd.DataFrame({"col1": [0], "col3": [1]}) + df_2 = pd.DataFrame({"col2": 
[2], "col3": [3]}) + df_3 = pd.DataFrame({"col1": [4], "col4": [5]}) + df_4 = pd.DataFrame({"col1": [6], "col3": [7], "col5": [8], "col6": [9]}) + lib.write("sym1", df_1) + lib.write("sym2", df_2) + lib.write("sym3", df_3) + lib.write("sym4", df_4) + + # First column different + with pytest.raises(SchemaException): + concat(lib.read_batch(["sym1", "sym2"], lazy=True)).collect() + # Second column different + with pytest.raises(SchemaException): + concat(lib.read_batch(["sym1", "sym3"], lazy=True)).collect() + # First row slice with extra column slice + with pytest.raises(SchemaException): + concat(lib.read_batch(["sym4", "sym1"], lazy=True)).collect() + # Second row slice with extra column slice + with pytest.raises(SchemaException): + concat(lib.read_batch(["sym1", "sym4"], lazy=True)).collect() + From 91c0d571bc696330fd6c0ba497cc1861cbefb9c1 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 29 Jan 2025 15:19:08 +0000 Subject: [PATCH 26/36] Extra tests for non-identical columns between symbols --- .../version/local_versioned_engine.cpp | 39 +------------------ .../test_symbol_concatenation.py | 17 +++++--- 2 files changed, 12 insertions(+), 44 deletions(-) diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index 13e188fc2c..a1503d857a 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1156,6 +1156,7 @@ StreamDescriptor descriptor_from_segments(std::vector& slice_and_ke std::sort(std::begin(slice_and_keys), std::end(slice_and_keys), [] (const auto& left, const auto& right) { return std::tie(left.slice_.row_range, left.slice_.col_range) < std::tie(right.slice_.row_range, right.slice_.col_range); }); + // TODO: Move this check to ::structure_for_processing, as it could be different for different clauses // TODO: Make more memory efficient by only keeping first element in memory, and comparing others on the fly // Track the field collections for each row-slice std::vector field_collections; @@ -1188,44 +1189,6 @@ StreamDescriptor descriptor_from_segments(std::vector& slice_and_ke StreamDescriptor res = slice_and_keys[0].segment_->descriptor().clone(); res.fields_ = std::make_shared(std::move(field_collections[0])); -// auto index_field_count = res.data().index().field_count(); -// const auto& first_row_range = slice_and_keys[0].slice().rows(); -// for (size_t slice_idx = 1; slice_idx < slice_and_keys.size(); ++slice_idx) { -// const auto& slice_and_key = slice_and_keys[slice_idx]; -// if (slice_and_key.slice().rows() == first_row_range) { -// const auto& desc = slice_and_key.segment_->descriptor(); -// schema::check( -// desc.data().index().field_count() == index_field_count, -// "Mismatching index field count {} != {}", -// desc.data().index().field_count(), -// index_field_count); -// for (size_t field_idx = index_field_count; field_idx < desc.field_count(); ++field_idx) { -// res.add_field(desc.fields().ref_at(field_idx)); -// } -// } else { -// // For other row-slices, check that they match the first -// const auto& desc = slice_and_key.segment_->descriptor(); -// schema::check( -// desc.data().index().field_count() == index_field_count, -// "Mismatching index field count {} != {}", -// desc.data().index().field_count(), -// index_field_count); -// for (size_t field_idx = 0; field_idx < index_field_count; ++field_idx) { -// schema::check( -// desc.fields().ref_at(field_idx) == res.fields().ref_at(field_idx), -// "Mismatching index fields: {} != {}", -// 
desc.fields().ref_at(field_idx), -// res.fields().ref_at(field_idx)); -// } -// for (size_t field_idx = index_field_count; field_idx < desc.field_count(); ++field_idx) { -// schema::check( -// desc.fields().ref_at(field_idx) == res.fields().ref_at(field_idx + slice_and_key.slice().columns().first - index_field_count), -// "Mismatching fields: {} != {}", -// desc.fields().ref_at(field_idx), -// res.fields().ref_at(field_idx + slice_and_key.slice().columns().first - index_field_count)); -// } -// } -// } return res; } diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index 6da0c5ac72..f3a31a6bd1 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -64,16 +64,19 @@ def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_p assert_frame_equal(expected, received) -def test_symbol_concat_symbols_with_different_columns(lmdb_library_factory): +@pytest.mark.parametrize("index", [None, [pd.Timestamp(0)]]) +def test_symbol_concat_symbols_with_different_columns(lmdb_library_factory, index): lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) - df_1 = pd.DataFrame({"col1": [0], "col3": [1]}) - df_2 = pd.DataFrame({"col2": [2], "col3": [3]}) - df_3 = pd.DataFrame({"col1": [4], "col4": [5]}) - df_4 = pd.DataFrame({"col1": [6], "col3": [7], "col5": [8], "col6": [9]}) + df_1 = pd.DataFrame({"col1": [0], "col3": [0]}, index=index) + df_2 = pd.DataFrame({"col2": [0], "col3": [0]}, index=index) + df_3 = pd.DataFrame({"col1": [0], "col4": [0]}, index=index) + df_4 = pd.DataFrame({"col1": [0], "col3": [0], "col5": [0], "col6": [0]}, index=index) + df_5 = pd.DataFrame({"col1": [0], "col3": [0], "col5": [0], "col7": [0]}, index=index) lib.write("sym1", df_1) lib.write("sym2", df_2) lib.write("sym3", df_3) lib.write("sym4", df_4) + lib.write("sym5", df_5) # First column different with pytest.raises(SchemaException): @@ -87,4 +90,6 @@ def test_symbol_concat_symbols_with_different_columns(lmdb_library_factory): # Second row slice with extra column slice with pytest.raises(SchemaException): concat(lib.read_batch(["sym1", "sym4"], lazy=True)).collect() - + # Row slices differ only in second column slice + with pytest.raises(SchemaException): + concat(lib.read_batch(["sym4", "sym5"], lazy=True)).collect() From cf13b8f44672ae492f7984def7bf6524ad72c855 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 29 Jan 2025 15:45:54 +0000 Subject: [PATCH 27/36] Test exception thrown with mixed index types --- .../test_symbol_concatenation.py | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index f3a31a6bd1..eb8b841cf5 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -17,6 +17,46 @@ pytestmark = pytest.mark.pipeline +@pytest.mark.parametrize("rows_per_segment", [2, 100_000]) +@pytest.mark.parametrize("columns_per_segment", [2, 100_000]) +@pytest.mark.parametrize("index", [None, pd.date_range("2025-01-01", periods=12)]) +def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per_segment, index): + lib = lmdb_library_factory(LibraryOptions(rows_per_segment=rows_per_segment, 
columns_per_segment=columns_per_segment)) + df_1 = pd.DataFrame( + { + "col1": np.arange(3, dtype=np.int64), + "col2": np.arange(100, 103, dtype=np.int64), + "col3": np.arange(1000, 1003, dtype=np.int64), + }, + index=index[:3] if index is not None else None, + ) + df_2 = pd.DataFrame( + { + "col1": np.arange(4, dtype=np.int64), + "col2": np.arange(200, 204, dtype=np.int64), + "col3": np.arange(2000, 2004, dtype=np.int64), + }, + index=index[3:7] if index is not None else None, + ) + df_3 = pd.DataFrame( + { + "col1": np.arange(5, dtype=np.int64), + "col2": np.arange(300, 305, dtype=np.int64), + "col3": np.arange(3000, 3005, dtype=np.int64), + }, + index=index[7:] if index is not None else None, + ) + lib.write("sym1", df_1) + lib.write("sym2", df_2) + lib.write("sym3", df_3) + + received = concat(lib.read_batch(["sym1", "sym2", "sym3"], lazy=True)).collect().data + expected = pd.concat([df_1, df_2, df_3]) + if index is None: + expected.index = pd.RangeIndex(len(expected)) + assert_frame_equal(expected, received) + + @pytest.mark.parametrize("rows_per_segment", [2, 100_000]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_per_segment): @@ -93,3 +133,36 @@ def test_symbol_concat_symbols_with_different_columns(lmdb_library_factory, inde # Row slices differ only in second column slice with pytest.raises(SchemaException): concat(lib.read_batch(["sym4", "sym5"], lazy=True)).collect() + + +def test_symbol_concat_symbols_with_different_indexes(lmdb_library): + lib = lmdb_library + df_1 = pd.DataFrame({"col": [0]}, index=pd.RangeIndex(1)) + df_2 = pd.DataFrame({"col": [0]}, index=[pd.Timestamp(0)]) + dt1 = pd.Timestamp(0) + dt2 = pd.Timestamp(1) + arr1 = [dt1, dt1, dt2, dt2] + arr2 = [0, 1, 0, 1] + df_3 = pd.DataFrame({"col": [0, 0, 0, 0]}, index=pd.MultiIndex.from_arrays([arr1, arr2], names=["datetime", "level"])) + + lib.write("range_index_sym", df_1) + lib.write("timestamp_index_sym", df_2) + lib.write("multiindex_sym", df_3) + + with pytest.raises(SchemaException): + concat(lib.read_batch(["range_index_sym", "timestamp_index_sym"], lazy=True)).collect() + + with pytest.raises(SchemaException): + concat(lib.read_batch(["timestamp_index_sym", "range_index_sym"], lazy=True)).collect() + + with pytest.raises(SchemaException): + concat(lib.read_batch(["range_index_sym", "multiindex_sym"], lazy=True)).collect() + + with pytest.raises(SchemaException): + concat(lib.read_batch(["multiindex_sym", "range_index_sym"], lazy=True)).collect() + + with pytest.raises(SchemaException): + concat(lib.read_batch(["timestamp_index_sym", "multiindex_sym"], lazy=True)).collect() + + with pytest.raises(SchemaException): + concat(lib.read_batch(["multiindex_sym", "timestamp_index_sym"], lazy=True)).collect() From 3dc2020407ae00b02fd28724728c9d4fc4a7982d Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 29 Jan 2025 15:52:40 +0000 Subject: [PATCH 28/36] Test concatenating multiindexed data --- .../test_symbol_concatenation.py | 20 +++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index eb8b841cf5..1b6e531fe9 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -57,6 +57,26 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per
assert_frame_equal(expected, received) +@pytest.mark.parametrize("rows_per_segment", [2, 100_000]) +@pytest.mark.parametrize("columns_per_segment", [2, 100_000]) +def test_symbol_concat_multiindex(lmdb_library_factory, rows_per_segment, columns_per_segment): + lib = lmdb_library_factory(LibraryOptions(rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) + df = pd.DataFrame( + { + "col1": np.arange(12, dtype=np.int64), + "col2": np.arange(100, 112, dtype=np.int64), + "col3": np.arange(1000, 1012, dtype=np.int64), + }, + index=pd.MultiIndex.from_product([pd.date_range("2025-01-01", periods=4), [0, 1, 2]], names=["datetime", "level"]), + ) + lib.write("sym1", df[:3]) + lib.write("sym2", df[3:7]) + lib.write("sym3", df[7:]) + + received = concat(lib.read_batch(["sym1", "sym2", "sym3"], lazy=True)).collect().data + assert_frame_equal(df, received) + + @pytest.mark.parametrize("rows_per_segment", [2, 100_000]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_per_segment): From e9f5eeb8597cc13e041005c46d7c4b8d82c0b5f8 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 29 Jan 2025 16:04:00 +0000 Subject: [PATCH 29/36] Test pickled data behaviour --- cpp/arcticdb/version/local_versioned_engine.cpp | 3 +++ cpp/arcticdb/version/version_core.cpp | 4 ++-- .../version_store/test_symbol_concatenation.py | 11 +++++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index a1503d857a..2aed2c2d77 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1216,10 +1216,13 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( pipeline_context, norm_meta_mtx](auto&& opt_index_key) mutable { std::variant version_info; + // TODO: Add support for symbols that only have incomplete segments internal::check(opt_index_key.has_value(), "batch_read_with_join_internal not supported with non-indexed data"); + // TODO: Only read the index segment once auto index_key_seg = store->read_sync(*opt_index_key).second; { std::lock_guard lock(*norm_meta_mtx); + // TODO: Construct this at the end of the pipeline, instead of reusing input data pipeline_context->norm_meta_ = std::make_unique(std::move(*index_key_seg.mutable_index_descriptor().mutable_proto().mutable_normalization())); } version_info = VersionedItem(std::move(*opt_index_key)); diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index 3749f46cff..3012e325d5 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -1596,7 +1596,7 @@ folly::Future> do_process( const ReadOptions& read_options, const std::shared_ptr& pipeline_context, std::shared_ptr component_manager) { - util::check_rte(!pipeline_context->is_pickled(),"Cannot perform multi-symbol join on pickled data"); + schema::check(!pipeline_context->is_pickled(),"Cannot perform multi-symbol join on pickled data"); return read_and_process_2(store, pipeline_context, read_query, read_options, component_manager); } @@ -2110,7 +2110,7 @@ folly::Future> read_entity_ids_for_version( read_indexed_keys_to_pipeline(store, pipeline_context, std::get(version_info), *read_query, read_options); } - user_input::check(!pipeline_context->multi_key_, "Multi-symbol joines not supported with recursively normalized data"); + 
user_input::check(!pipeline_context->multi_key_, "Multi-symbol joins not supported with recursively normalized data"); if(opt_false(read_options.incompletes_)) { util::check(std::holds_alternative(read_query->row_filter), "Streaming read requires date range filter"); diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index 1b6e531fe9..8c8eb8c2d5 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -186,3 +186,14 @@ with pytest.raises(SchemaException): concat(lib.read_batch(["multiindex_sym", "timestamp_index_sym"], lazy=True)).collect() + + +def test_symbol_concat_pickled_data(lmdb_library): + lib = lmdb_library + df = pd.DataFrame({"bytes": np.arange(10, dtype=np.uint64)}) + pickled_data = {"hi", "there"} + lib.write("sym1", df) + lib.write_pickle("sym2", pickled_data) + + with pytest.raises(SchemaException): + concat(lib.read_batch(["sym1", "sym2"], lazy=True)).collect() From 9ed8d7c8acc353811c330b38bbe0015a2955bab0 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 29 Jan 2025 16:17:08 +0000 Subject: [PATCH 30/36] Test date_range provided via read arg and via clause --- .../version/local_versioned_engine.cpp | 5 ++- .../test_symbol_concatenation.py | 32 +++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index 2aed2c2d77..c720b12212 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1175,7 +1175,10 @@ StreamDescriptor descriptor_from_segments(std::vector& slice_and_ke } } // Ensure they are all the same - // TODO: Relax this requirement + // TODO: Relax this requirement: + // - columns in different orders + // - columns of different but promotable types + // - extra/missing columns in different row slices for (size_t idx = 1; idx < field_collections.size(); ++idx) { schema::check( field_collections[0] == field_collections[idx], diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index 8c8eb8c2d5..f53e2aebe6 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -77,6 +77,34 @@ def test_symbol_concat_multiindex(lmdb_library_factory, rows_per_segment, column assert_frame_equal(df, received) +def test_symbol_concat_with_date_range(lmdb_library): + lib = lmdb_library + df_1 = pd.DataFrame( + { + "col1": np.arange(3, dtype=np.int64), + }, + index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3), + ) + df_2 = pd.DataFrame( + { + "col1": np.arange(4, dtype=np.int64), + }, + index=pd.date_range(pd.Timestamp(1000), freq="1000ns", periods=4), + ) + lib.write("sym1", df_1) + lib.write("sym2", df_2) + + # Use date_range arg to trim last row from sym1 + lazy_df_1 = lib.read("sym1", date_range=(None, pd.Timestamp(1000)), lazy=True) + # Use date_range clause to trim first row from sym2 + lazy_df_2 = lib.read("sym2", lazy=True) + lazy_df_2 = lazy_df_2.date_range((pd.Timestamp(2000), None)) + + received = concat([lazy_df_1, lazy_df_2]).collect().data + expected = pd.concat([df_1[:2], df_2[1:]]) + 
assert_frame_equal(expected, received) + + @pytest.mark.parametrize("rows_per_segment", [2, 100_000]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_per_segment): @@ -112,7 +140,7 @@ def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_p lazy_df_1 = lib.read("sym1", lazy=True) lazy_df_2 = lib.read("sym2", lazy=True) lazy_df_2 = lazy_df_2.date_range((pd.Timestamp(pd.Timestamp(3000)), None)) - lazy_df_3 = lib.read("sym3", lazy=True) + lazy_df_3 = lib.read("sym3", date_range=(None, pd.Timestamp(9000)), lazy=True) lazy_df = concat([lazy_df_1, lazy_df_2, lazy_df_3]) @@ -120,7 +148,7 @@ def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_p received = lazy_df.collect().data received = received.reindex(columns=sorted(received.columns)) - expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) + expected = pd.concat([df_1, df_2[1:], df_3[:4]]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) assert_frame_equal(expected, received) From 7c7e52fa86a30c0db8b7c4a8cacfec221e8f5851 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 29 Jan 2025 16:49:00 +0000 Subject: [PATCH 31/36] Add failing test with column slicing --- .../test_symbol_concatenation.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index f53e2aebe6..e909f1296a 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -57,6 +57,40 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per assert_frame_equal(expected, received) +# TODO: Get working with column slicing +@pytest.mark.xfail(reason="Not yet working with column slicing") +@pytest.mark.parametrize("rows_per_segment", [2, 100_000]) +@pytest.mark.parametrize("columns_per_segment", [2, 100_000]) +@pytest.mark.parametrize("columns", [["col1"], ["col2"], ["col3"], ["col1", "col2"], ["col1", "col3"], ["col2", "col3"]]) +def test_symbol_concat_column_slicing(lmdb_library_factory, rows_per_segment, columns_per_segment, columns): + lib = lmdb_library_factory(LibraryOptions(rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) + df_1 = pd.DataFrame( + { + "col1": np.arange(3, dtype=np.int64), + "col2": np.arange(100, 103, dtype=np.int64), + "col3": np.arange(1000, 1003, dtype=np.int64), + }, + ) + df_2 = pd.DataFrame( + { + "col0": np.arange(10, 14, dtype=np.int64), + "col1": np.arange(4, dtype=np.int64), + "col2": np.arange(200, 204, dtype=np.int64), + "col3": np.arange(2000, 2004, dtype=np.int64), + }, + ) + lib.write("sym1", df_1) + lib.write("sym2", df_2) + + lazy_df_1 = lib.read("sym1", columns=columns, lazy=True) + lazy_df_2 = lib.read("sym2", columns=columns, lazy=True) + + received = concat([lazy_df_1, lazy_df_2]).collect().data + expected = pd.concat([df_1.loc[:, columns], df_2.loc[:, columns]]) + expected.index = pd.RangeIndex(len(expected)) + assert_frame_equal(expected, received) + + @pytest.mark.parametrize("rows_per_segment", [2, 100_000]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) def test_symbol_concat_multiindex(lmdb_library_factory, rows_per_segment, columns_per_segment): From 
4280c562a4d12e41c9ccecda0f5a7aedd87e35d7 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Thu, 30 Jan 2025 11:17:11 +0000 Subject: [PATCH 32/36] Return list of symbol/version pairs of input symbols when joining --- cpp/arcticdb/entity/read_result.hpp | 6 +- .../version/local_versioned_engine.cpp | 15 ++- cpp/arcticdb/version/version_store_api.cpp | 2 +- python/arcticdb/version_store/_store.py | 47 +++++-- python/arcticdb/version_store/library.py | 6 +- .../test_symbol_concatenation.py | 116 +++++++++--------- 6 files changed, 115 insertions(+), 77 deletions(-) diff --git a/cpp/arcticdb/entity/read_result.hpp b/cpp/arcticdb/entity/read_result.hpp index 526407f073..e05f2f3e41 100644 --- a/cpp/arcticdb/entity/read_result.hpp +++ b/cpp/arcticdb/entity/read_result.hpp @@ -21,7 +21,7 @@ namespace arcticdb { struct ARCTICDB_VISIBILITY_HIDDEN ReadResult { ReadResult( - const VersionedItem& versioned_item, + const std::variant>& versioned_item, pipelines::PythonOutputFrame&& frame_data, const arcticdb::proto::descriptors::NormalizationMetadata& norm_meta, const arcticdb::proto::descriptors::UserDefinedMetadata& user_meta, @@ -35,7 +35,7 @@ struct ARCTICDB_VISIBILITY_HIDDEN ReadResult { multi_keys(std::move(multi_keys)) { } - VersionedItem item; + std::variant> item; pipelines::PythonOutputFrame frame_data; arcticdb::proto::descriptors::NormalizationMetadata norm_meta; arcticdb::proto::descriptors::UserDefinedMetadata user_meta; @@ -46,7 +46,7 @@ struct ARCTICDB_VISIBILITY_HIDDEN ReadResult { }; inline ReadResult create_python_read_result( - const VersionedItem& version, + const std::variant>& version, FrameAndDescriptor&& fd) { auto result = std::move(fd); // Very old (pre Nov-2020) PandasIndex protobuf messages had no "start" or "step" fields. If is_physically_stored diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index c720b12212..335c2adbe8 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1209,15 +1209,19 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( auto component_manager = std::make_shared(); auto pipeline_context = std::make_shared(); auto norm_meta_mtx = std::make_shared(); + // TODO: Generate this in a less hacky way + auto res_versioned_items = std::make_shared>(stream_ids.size()); DecodePathData shared_data; for (auto&& [idx, opt_index_key_fut]: folly::enumerate(opt_index_key_futs)) { symbol_entities_futs.emplace_back( std::move(opt_index_key_fut).thenValue([store = store(), read_query = read_queries.empty() ? 
std::make_shared(): read_queries[idx], + idx, &read_options, &component_manager, pipeline_context, - norm_meta_mtx](auto&& opt_index_key) mutable { + norm_meta_mtx, + res_versioned_items](auto&& opt_index_key) mutable { std::variant version_info; // TODO: Add support for symbols that only have incomplete segments internal::check(opt_index_key.has_value(), "batch_read_with_join_internal not supported with non-indexed data"); @@ -1228,6 +1232,8 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( // TODO: Construct this at the end of the pipeline, instead of reusing input data pipeline_context->norm_meta_ = std::make_unique(std::move(*index_key_seg.mutable_index_descriptor().mutable_proto().mutable_normalization())); } + // Shouldn't need mutex protecting as vector is presized and each element is only accessed once + (*res_versioned_items)[idx] = VersionedItem(*opt_index_key); version_info = VersionedItem(std::move(*opt_index_key)); return read_entity_ids_for_version(store, version_info, read_query, read_options, component_manager); }) @@ -1244,12 +1250,13 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(processed_entity_ids)); return collect_segments(std::move(proc)); }).thenValueInline([store=store(), &handler_data, pipeline_context](std::vector&& slice_and_keys) { + // TODO: Make constructing the output descriptor a standard end of pipeline operation pipeline_context->set_descriptor(descriptor_from_segments(slice_and_keys)); return prepare_output_frame(std::move(slice_and_keys), pipeline_context, store, ReadOptions{}, handler_data); - }).thenValueInline([&handler_data, pipeline_context, shared_data](SegmentInMemory&& frame) mutable { + }).thenValueInline([&handler_data, pipeline_context, shared_data, res_versioned_items](SegmentInMemory&& frame) mutable { return reduce_and_fix_columns(pipeline_context, frame, ReadOptions{}, handler_data) - .thenValue([pipeline_context, frame, shared_data](auto&&) mutable { - return MultiSymbolReadOutput{{}, + .thenValue([pipeline_context, frame, shared_data, res_versioned_items](auto&&) mutable { + return MultiSymbolReadOutput{std::move(*res_versioned_items), {frame, timeseries_descriptor_from_pipeline_context(pipeline_context, {}, pipeline_context->bucketize_dynamic_), {}, diff --git a/cpp/arcticdb/version/version_store_api.cpp b/cpp/arcticdb/version/version_store_api.cpp index 917d259526..283cfe7cba 100644 --- a/cpp/arcticdb/version/version_store_api.cpp +++ b/cpp/arcticdb/version/version_store_api.cpp @@ -795,7 +795,7 @@ ReadResult PythonVersionStore::batch_read_with_join( std::vector>&& clauses, std::any& handler_data) { auto versions_and_frame = batch_read_with_join_internal(stream_ids, version_queries, read_queries, read_options, std::move(clauses), handler_data); - return create_python_read_result({}, std::move(versions_and_frame.frame_and_descriptor_)); + return create_python_read_result(versions_and_frame.versioned_items_, std::move(versions_and_frame.frame_and_descriptor_)); } void PythonVersionStore::delete_snapshot(const SnapshotId& snap_name) { diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py index 32a32360a5..0f488a1343 100644 --- a/python/arcticdb/version_store/_store.py +++ b/python/arcticdb/version_store/_store.py @@ -7,6 +7,7 @@ """ import copy +from dataclasses import dataclass import datetime import os import sys @@ -124,6 +125,12 @@ def 
__iter__(self): # Backwards compatible with the old NamedTuple implementati return iter(attr.astuple(self)) +@dataclass +class VersionedItemWithJoin: + versions: List[VersionedItem] + data: Any + + def _env_config_from_lib_config(lib_cfg, env): cfg = EnvironmentConfigsMap() e = cfg.env_by_id[env] @@ -2094,22 +2101,42 @@ def _get_index_columns_from_descriptor(descriptor): return index_columns - def _adapt_read_res(self, read_result: ReadResult) -> VersionedItem: + def _adapt_read_res(self, read_result: ReadResult) -> Union[VersionedItem, VersionedItemWithJoin]: frame_data = FrameData.from_cpp(read_result.frame_data) + # TODO: Return metadata for all symbol/version pairs as well meta = denormalize_user_metadata(read_result.udm, self._normalizer) data = self._normalizer.denormalize(frame_data, read_result.norm) if read_result.norm.HasField("custom"): data = self._custom_normalizer.denormalize(data, read_result.norm.custom) - return VersionedItem( - symbol=read_result.version.symbol, - library=self._library.library_path, - data=data, - version=read_result.version.version, - metadata=meta, - host=self.env, - timestamp=read_result.version.timestamp, - ) + if isinstance(read_result.version, list): + versions = [] + for idx in range(len(read_result.version)): + versions.append( + VersionedItem( + symbol=read_result.version[idx].symbol, + library=self._library.library_path, + data=None, + version=read_result.version[idx].version, + metadata=meta, + host=self.env, + timestamp=read_result.version[idx].timestamp, + ) + ) + return VersionedItemWithJoin( + versions=versions, + data=data, + ) + else: + return VersionedItem( + symbol=read_result.version.symbol, + library=self._library.library_path, + data=data, + version=read_result.version.version, + metadata=meta, + host=self.env, + timestamp=read_result.version.timestamp, + ) def list_versions( self, diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py index f8bca21629..854d53358a 100644 --- a/python/arcticdb/version_store/library.py +++ b/python/arcticdb/version_store/library.py @@ -20,7 +20,7 @@ from arcticdb.util._versions import IS_PANDAS_TWO from arcticdb.version_store.processing import ExpressionNode, QueryBuilder -from arcticdb.version_store._store import NativeVersionStore, VersionedItem, VersionQueryInput +from arcticdb.version_store._store import NativeVersionStore, VersionedItem, VersionedItemWithJoin, VersionQueryInput from arcticdb_ext.exceptions import ArcticException from arcticdb_ext.version_store import DataError, ConcatClause as _ConcatClause import pandas as pd import numpy as np import logging @@ -488,7 +488,7 @@ def __init__( self._lazy_dataframes = lazy_dataframes self._join = join - def collect(self) -> VersionedItem: + def collect(self) -> VersionedItemWithJoin: if not len(self._lazy_dataframes._lazy_dataframes): return [] else: @@ -1679,7 +1679,7 @@ def _read_batch_with_join( read_requests: List[ReadRequest], join: Any, query_builder: Optional[QueryBuilder] = None, - ) -> VersionedItem: + ) -> VersionedItemWithJoin: symbol_strings = [] as_ofs = [] date_ranges = []
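For illustration, a sketch of how the new VersionedItemWithJoin return type is consumed (assumes an already-populated library handle `lib` with symbols "sym0" and "sym1"; only fields defined in the dataclass above are used):

joined = concat(lib.read_batch(["sym0", "sym1"], lazy=True)).collect()

# One VersionedItem per input symbol, each with data=None; the joined
# frame is carried once, on the outer VersionedItemWithJoin.
for item in joined.versions:
    print(item.symbol, item.version, item.timestamp)
print(joined.data)

diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index e909f1296a..df77e1a8cf 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -22,7 +22,7 @@ @pytest.mark.parametrize("index", [None, pd.date_range("2025-01-01", periods=12)]) def 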
test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per_segment, index): lib = lmdb_library_factory(LibraryOptions(rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) - df_1 = pd.DataFrame( + df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), "col2": np.arange(100, 103, dtype=np.int64), @@ -30,7 +30,7 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per }, index=index[:3] if index is not None else None, ) - df_2 = pd.DataFrame( + df_1 = pd.DataFrame( { "col1": np.arange(4, dtype=np.int64), "col2": np.arange(200, 204, dtype=np.int64), @@ -38,7 +38,7 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per }, index=index[3:7] if index is not None else None, ) - df_3 = pd.DataFrame( + df_2 = pd.DataFrame( { "col1": np.arange(5, dtype=np.int64), "col2": np.arange(300, 305, dtype=np.int64), @@ -46,15 +46,19 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per }, index=index[7:] if index is not None else None, ) + lib.write("sym0", df_0) lib.write("sym1", df_1) lib.write("sym2", df_2) - lib.write("sym3", df_3) - received = concat(lib.read_batch(["sym1", "sym2", "sym3"], lazy=True)).collect().data - expected = pd.concat([df_1, df_2, df_3]) + received = concat(lib.read_batch(["sym0", "sym1", "sym2"], lazy=True)).collect() + expected = pd.concat([df_0, df_1, df_2]) if index is None: expected.index = pd.RangeIndex(len(expected)) - assert_frame_equal(expected, received) + assert_frame_equal(expected, received.data) + for idx, version in enumerate(received.versions): + assert version.symbol == f"sym{idx}" + assert version.version == 0 + assert version.data is None # TODO: Get working with column slicing @@ -64,14 +68,14 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per @pytest.mark.parametrize("columns", [["col1"], ["col2"], ["col3"], ["col1", "col2"], ["col1", "col3"], ["col2", "col3"]]) def test_symbol_concat_column_slicing(lmdb_library_factory, rows_per_segment, columns_per_segment, columns): lib = lmdb_library_factory(LibraryOptions(rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) - df_1 = pd.DataFrame( + df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), "col2": np.arange(100, 103, dtype=np.int64), "col3": np.arange(1000, 1003, dtype=np.int64), }, ) - df_2 = pd.DataFrame( + df_1 = pd.DataFrame( { "col0": np.arange(10, 14, dtype=np.int64), "col1": np.arange(4, dtype=np.int64), @@ -79,14 +83,14 @@ def test_symbol_concat_column_slicing(lmdb_library_factory, rows_per_segment, co "col3": np.arange(2000, 2004, dtype=np.int64), }, ) + lib.write("sym0", df_0) lib.write("sym1", df_1) - lib.write("sym2", df_2) + lazy_df_0 = lib.read("sym0", columns=columns, lazy=True) lazy_df_1 = lib.read("sym1", columns=columns, lazy=True) - lazy_df_2 = lib.read("sym2", columns=columns, lazy=True) - received = concat([lazy_df_1, lazy_df_2]).collect().data - expected = pd.concat([df_1.loc[:, columns], df_2.loc[:, columns]]) + received = concat([lazy_df_0, lazy_df_1]).collect().data + expected = pd.concat([df_0.loc[:, columns], df_1.loc[:, columns]]) expected.index = pd.RangeIndex(len(expected)) assert_frame_equal(expected, received) @@ -103,39 +107,39 @@ def test_symbol_concat_multiindex(lmdb_library_factory, rows_per_segment, column }, index=pd.MultiIndex.from_product([pd.date_range("2025-01-01", periods=4), [0, 1, 2]], names=["datetime", "level"]), ) - lib.write("sym1", df[:3]) - lib.write("sym2", df[3:7]) - 
lib.write("sym3", df[7:]) + lib.write("sym0", df[:3]) + lib.write("sym1", df[3:7]) + lib.write("sym2", df[7:]) - received = concat(lib.read_batch(["sym1", "sym2", "sym3"], lazy=True)).collect().data + received = concat(lib.read_batch(["sym0", "sym1", "sym2"], lazy=True)).collect().data assert_frame_equal(df, received) def test_symbol_concat_with_date_range(lmdb_library): lib = lmdb_library - df_1 = pd.DataFrame( + df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), }, index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3), ) - df_2 = pd.DataFrame( + df_1 = pd.DataFrame( { "col1": np.arange(4, dtype=np.int64), }, index=pd.date_range(pd.Timestamp(1000), freq="1000ns", periods=4), ) + lib.write("sym0", df_0) lib.write("sym1", df_1) - lib.write("sym2", df_2) - # Use date_range arg to trim last row from sym1 - lazy_df_1 = lib.read("sym1", date_range=(None, pd.Timestamp(1000)), lazy=True) - # Use date_range clause to trim first row from sym2 - lazy_df_2 = lib.read("sym2", lazy=True) - lazy_df_2 = lazy_df_2.date_range((pd.Timestamp(2000), None)) + # Use date_range arg to trim last row from sym0 + lazy_df_0 = lib.read("sym0", date_range=(None, pd.Timestamp(1000)), lazy=True) + # Use date_range clause to trim first row from sym1 + lazy_df_1 = lib.read("sym1", lazy=True) + lazy_df_1 = lazy_df_1.date_range((pd.Timestamp(2000), None)) - received = concat([lazy_df_1, lazy_df_2]).collect().data - expected = pd.concat([df_1[:2], df_2[1:]]) + received = concat([lazy_df_0, lazy_df_1]).collect().data + expected = pd.concat([df_0[:2], df_1[1:]]) assert_frame_equal(expected, received) @@ -143,7 +147,7 @@ def test_symbol_concat_with_date_range(lmdb_library): @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_per_segment): lib = lmdb_library_factory(LibraryOptions(rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) - df_1 = pd.DataFrame( + df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), "col2": np.arange(100, 103, dtype=np.int64), @@ -151,7 +155,7 @@ def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_p }, index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3), ) - df_2 = pd.DataFrame( + df_1 = pd.DataFrame( { "col1": np.arange(4, dtype=np.int64), "col2": np.arange(200, 204, dtype=np.int64), @@ -159,7 +163,7 @@ def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_p }, index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4), ) - df_3 = pd.DataFrame( + df_2 = pd.DataFrame( { "col1": np.arange(5, dtype=np.int64), "col2": np.arange(300, 305, dtype=np.int64), @@ -167,69 +171,69 @@ def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_p }, index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5), ) + lib.write("sym0", df_0) lib.write("sym1", df_1) lib.write("sym2", df_2) - lib.write("sym3", df_3) + lazy_df_0 = lib.read("sym0", lazy=True) lazy_df_1 = lib.read("sym1", lazy=True) - lazy_df_2 = lib.read("sym2", lazy=True) - lazy_df_2 = lazy_df_2.date_range((pd.Timestamp(pd.Timestamp(3000)), None)) - lazy_df_3 = lib.read("sym3", date_range=(None, pd.Timestamp(9000)), lazy=True) + lazy_df_1 = lazy_df_1.date_range((pd.Timestamp(pd.Timestamp(3000)), None)) + lazy_df_2 = lib.read("sym2", date_range=(None, pd.Timestamp(9000)), lazy=True) - lazy_df = concat([lazy_df_1, lazy_df_2, lazy_df_3]) + lazy_df = concat([lazy_df_0, lazy_df_1, lazy_df_2]) 
lazy_df.resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) received = lazy_df.collect().data received = received.reindex(columns=sorted(received.columns)) - expected = pd.concat([df_1, df_2[1:], df_3[:4]]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) + expected = pd.concat([df_0, df_1[1:], df_2[:4]]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) assert_frame_equal(expected, received) @pytest.mark.parametrize("index", [None, [pd.Timestamp(0)]]) def test_symbol_concat_symbols_with_different_columns(lmdb_library_factory, index): lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) - df_1 = pd.DataFrame({"col1": [0], "col3": [0]}, index=index) - df_2 = pd.DataFrame({"col2": [0], "col3": [0]}, index=index) - df_3 = pd.DataFrame({"col1": [0], "col4": [0]}, index=index) - df_4 = pd.DataFrame({"col1": [0], "col3": [0], "col5": [0], "col6": [0]}, index=index) - df_5 = pd.DataFrame({"col1": [0], "col3": [0], "col5": [0], "col7": [0]}, index=index) + df_0 = pd.DataFrame({"col1": [0], "col3": [0]}, index=index) + df_1 = pd.DataFrame({"col2": [0], "col3": [0]}, index=index) + df_2 = pd.DataFrame({"col1": [0], "col4": [0]}, index=index) + df_3 = pd.DataFrame({"col1": [0], "col3": [0], "col5": [0], "col6": [0]}, index=index) + df_4 = pd.DataFrame({"col1": [0], "col3": [0], "col5": [0], "col7": [0]}, index=index) + lib.write("sym0", df_0) lib.write("sym1", df_1) lib.write("sym2", df_2) lib.write("sym3", df_3) lib.write("sym4", df_4) - lib.write("sym5", df_5) # First column different with pytest.raises(SchemaException): - concat(lib.read_batch(["sym1", "sym2"], lazy=True)).collect() + concat(lib.read_batch(["sym0", "sym1"], lazy=True)).collect() # Second column different with pytest.raises(SchemaException): - concat(lib.read_batch(["sym1", "sym3"], lazy=True)).collect() + concat(lib.read_batch(["sym0", "sym2"], lazy=True)).collect() # First row slice with extra column slice with pytest.raises(SchemaException): - concat(lib.read_batch(["sym4", "sym1"], lazy=True)).collect() + concat(lib.read_batch(["sym3", "sym0"], lazy=True)).collect() # Second row slice with extra column slice with pytest.raises(SchemaException): - concat(lib.read_batch(["sym1", "sym4"], lazy=True)).collect() + concat(lib.read_batch(["sym0", "sym3"], lazy=True)).collect() # Row slices differ only in second column slice with pytest.raises(SchemaException): - concat(lib.read_batch(["sym4", "sym5"], lazy=True)).collect() + concat(lib.read_batch(["sym3", "sym4"], lazy=True)).collect() def test_symbol_concat_symbols_with_different_indexes(lmdb_library): lib = lmdb_library - df_1 = pd.DataFrame({"col": [0]}, index=pd.RangeIndex(1)) - df_2 = pd.DataFrame({"col": [0]}, index=[pd.Timestamp(0)]) + df_0 = pd.DataFrame({"col": [0]}, index=pd.RangeIndex(1)) + df_1 = pd.DataFrame({"col": [0]}, index=[pd.Timestamp(0)]) dt1 = pd.Timestamp(0) dt2 = pd.Timestamp(1) arr1 = [dt1, dt1, dt2, dt2] arr2 = [0, 1, 0, 1] - df_3 = pd.DataFrame({"col": [0, 0, 0, 0]}, index=pd.MultiIndex.from_arrays([arr1, arr2], names=["datetime", "level"])) + df_2 = pd.DataFrame({"col": [0, 0, 0, 0]}, index=pd.MultiIndex.from_arrays([arr1, arr2], names=["datetime", "level"])) - lib.write("range_index_sym", df_1) - lib.write("timestamp_index_sym", df_2) - lib.write("multiindex_sym", df_3) + lib.write("range_index_sym", df_0) + lib.write("timestamp_index_sym", df_1) + lib.write("multiindex_sym", df_2) with pytest.raises(SchemaException): concat(lib.read_batch(["range_index_sym", "timestamp_index_sym"], 
lazy=True)).collect() @@ -254,8 +258,8 @@ def test_symbol_concat_pickled_data(lmdb_library): lib = lmdb_library df = pd.DataFrame({"bytes": np.arange(10, dtype=np.uint64)}) pickled_data = {"hi", "there"} - lib.write("sym1", df) - lib.write_pickle("sym2", pickled_data) + lib.write("sym0", df) + lib.write_pickle("sym1", pickled_data) with pytest.raises(SchemaException): - concat(lib.read_batch(["sym1", "sym2"], lazy=True)).collect() + concat(lib.read_batch(["sym0", "sym1"], lazy=True)).collect() From 24b62e2246af87757959a27425c6a6a83575d19f Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Thu, 30 Jan 2025 12:22:13 +0000 Subject: [PATCH 33/36] Per-symbol metadatas returned to user --- cpp/arcticdb/entity/read_result.hpp | 15 +++++++++++---- cpp/arcticdb/python/adapt_read_dataframe.hpp | 13 ++++++++++++- cpp/arcticdb/python/python_utils.hpp | 2 +- cpp/arcticdb/version/local_versioned_engine.cpp | 16 ++++++++++------ cpp/arcticdb/version/version_core.hpp | 7 ++++++- cpp/arcticdb/version/version_store_api.cpp | 6 +++++- python/arcticdb/version_store/_store.py | 6 ++---- .../version_store/test_symbol_concatenation.py | 5 +++-- 8 files changed, 50 insertions(+), 20 deletions(-) diff --git a/cpp/arcticdb/entity/read_result.hpp b/cpp/arcticdb/entity/read_result.hpp index e05f2f3e41..33dfb64af7 100644 --- a/cpp/arcticdb/entity/read_result.hpp +++ b/cpp/arcticdb/entity/read_result.hpp @@ -24,7 +24,7 @@ struct ARCTICDB_VISIBILITY_HIDDEN ReadResult { const std::variant>& versioned_item, pipelines::PythonOutputFrame&& frame_data, const arcticdb::proto::descriptors::NormalizationMetadata& norm_meta, - const arcticdb::proto::descriptors::UserDefinedMetadata& user_meta, + const std::variant>& user_meta, const arcticdb::proto::descriptors::UserDefinedMetadata& multi_key_meta, std::vector&& multi_keys) : item(versioned_item), @@ -38,7 +38,7 @@ struct ARCTICDB_VISIBILITY_HIDDEN ReadResult { std::variant> item; pipelines::PythonOutputFrame frame_data; arcticdb::proto::descriptors::NormalizationMetadata norm_meta; - arcticdb::proto::descriptors::UserDefinedMetadata user_meta; + std::variant> user_meta; arcticdb::proto::descriptors::UserDefinedMetadata multi_key_meta; std::vector multi_keys; @@ -47,7 +47,8 @@ struct ARCTICDB_VISIBILITY_HIDDEN ReadResult { inline ReadResult create_python_read_result( const std::variant>& version, - FrameAndDescriptor&& fd) { + FrameAndDescriptor&& fd, + std::optional>&& user_meta = std::nullopt) { auto result = std::move(fd); // Very old (pre Nov-2020) PandasIndex protobuf messages had no "start" or "step" fields. 
If is_physically_stored // (renamed from is_not_range_index) was false, the index was always RangeIndex(num_rows, 1) @@ -73,7 +74,13 @@ inline ReadResult create_python_read_result( util::print_total_mem_usage(__FILE__, __LINE__, __FUNCTION__); const auto& desc_proto = result.desc_.proto(); + std::variant> metadata; + if (user_meta.has_value()) { + metadata = *user_meta; + } else { + metadata = desc_proto.user_meta(); + } return {version, std::move(python_frame), desc_proto.normalization(), - desc_proto.user_meta(), desc_proto.multi_key_meta(), std::move(result.keys_)}; + metadata, desc_proto.multi_key_meta(), std::move(result.keys_)}; } } //namespace arcticdb \ No newline at end of file diff --git a/cpp/arcticdb/python/adapt_read_dataframe.hpp b/cpp/arcticdb/python/adapt_read_dataframe.hpp index 65f59b7658..b6594c3bfa 100644 --- a/cpp/arcticdb/python/adapt_read_dataframe.hpp +++ b/cpp/arcticdb/python/adapt_read_dataframe.hpp @@ -14,7 +14,18 @@ namespace arcticdb { inline auto adapt_read_df = [](ReadResult && ret) -> py::tuple{ auto pynorm = python_util::pb_to_python(ret.norm_meta); - auto pyuser_meta = python_util::pb_to_python(ret.user_meta); + auto pyuser_meta = util::variant_match( + ret.user_meta, + [](const arcticdb::proto::descriptors::UserDefinedMetadata& metadata) -> py::object { + return python_util::pb_to_python(metadata); + }, + [](const std::vector& metadatas) -> py::object { + py::list py_metadatas; + for (const auto& metadata: metadatas) { + py_metadatas.append(python_util::pb_to_python(metadata)); + } + return py_metadatas; + }); auto multi_key_meta = python_util::pb_to_python(ret.multi_key_meta); return py::make_tuple(ret.item, std::move(ret.frame_data), pynorm, pyuser_meta, multi_key_meta, ret.multi_keys); }; diff --git a/cpp/arcticdb/python/python_utils.hpp b/cpp/arcticdb/python/python_utils.hpp index f24228b1cb..f9e803944c 100644 --- a/cpp/arcticdb/python/python_utils.hpp +++ b/cpp/arcticdb/python/python_utils.hpp @@ -252,7 +252,7 @@ inline py::list adapt_read_dfs(std::vector>& res, [&lst] (ReadResult& read_result) { auto pynorm = python_util::pb_to_python(read_result.norm_meta); - auto pyuser_meta = python_util::pb_to_python(read_result.user_meta); + auto pyuser_meta = python_util::pb_to_python(std::get(read_result.user_meta)); auto multi_key_meta = python_util::pb_to_python(read_result.multi_key_meta); lst.append(py::make_tuple(read_result.item, std::move(read_result.frame_data), pynorm, pyuser_meta, multi_key_meta, read_result.multi_keys)); diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index 335c2adbe8..c109631f88 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1209,8 +1209,9 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( auto component_manager = std::make_shared(); auto pipeline_context = std::make_shared(); auto norm_meta_mtx = std::make_shared(); - // TODO: Generate this in a less hacky way + // TODO: Generate these in a less hacky way auto res_versioned_items = std::make_shared>(stream_ids.size()); + auto res_metadatas = std::make_shared>(stream_ids.size()); DecodePathData shared_data; for (auto&& [idx, opt_index_key_fut]: folly::enumerate(opt_index_key_futs)) { symbol_entities_futs.emplace_back( @@ -1221,7 +1222,8 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( &component_manager, pipeline_context, norm_meta_mtx, - res_versioned_items](auto&& opt_index_key) mutable { + 
res_versioned_items, + res_metadatas](auto&& opt_index_key) mutable { std::variant version_info; // TODO: Add support for symbols that only have incomplete segments internal::check(opt_index_key.has_value(), "batch_read_with_join_internal not supported with non-indexed data"); @@ -1232,8 +1234,9 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( // TODO: Construct this at the end of the pipeline, instead of reusing input data pipeline_context->norm_meta_ = std::make_unique(std::move(*index_key_seg.mutable_index_descriptor().mutable_proto().mutable_normalization())); } - // Shouldn't need mutex protecting as vector is presized and each element is only accessed once - (*res_versioned_items)[idx] = VersionedItem(*opt_index_key); + // Shouldn't need mutex protecting as vectors are presized and each element is only accessed once + res_versioned_items->at(idx) = VersionedItem(*opt_index_key); + res_metadatas->at(idx) = index_key_seg.mutable_index_descriptor().user_metadata(); version_info = VersionedItem(std::move(*opt_index_key)); return read_entity_ids_for_version(store, version_info, read_query, read_options, component_manager); }) @@ -1253,10 +1256,11 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( // TODO: Make constructing the output descriptor a standard end of pipeline operation pipeline_context->set_descriptor(descriptor_from_segments(slice_and_keys)); return prepare_output_frame(std::move(slice_and_keys), pipeline_context, store, ReadOptions{}, handler_data); - }).thenValueInline([&handler_data, pipeline_context, shared_data, res_versioned_items](SegmentInMemory&& frame) mutable { + }).thenValueInline([&handler_data, pipeline_context, shared_data, res_versioned_items, res_metadatas](SegmentInMemory&& frame) mutable { return reduce_and_fix_columns(pipeline_context, frame, ReadOptions{}, handler_data) - .thenValue([pipeline_context, frame, shared_data, res_versioned_items](auto&&) mutable { + .thenValue([pipeline_context, frame, shared_data, res_versioned_items, res_metadatas](auto&&) mutable { return MultiSymbolReadOutput{std::move(*res_versioned_items), + std::move(*res_metadatas), {frame, timeseries_descriptor_from_pipeline_context(pipeline_context, {}, pipeline_context->bucketize_dynamic_), {}, diff --git a/cpp/arcticdb/version/version_core.hpp b/cpp/arcticdb/version/version_core.hpp index 408d41acb7..296e56491b 100644 --- a/cpp/arcticdb/version/version_core.hpp +++ b/cpp/arcticdb/version/version_core.hpp @@ -55,13 +55,18 @@ struct ReadVersionOutput { struct MultiSymbolReadOutput { MultiSymbolReadOutput() = delete; - MultiSymbolReadOutput(std::vector&& versioned_items, FrameAndDescriptor&& frame_and_descriptor): + MultiSymbolReadOutput( + std::vector&& versioned_items, + std::vector&& metadatas, + FrameAndDescriptor&& frame_and_descriptor): versioned_items_(std::move(versioned_items)), + metadatas_(std::move(metadatas)), frame_and_descriptor_(std::move(frame_and_descriptor)) {} ARCTICDB_MOVE_ONLY_DEFAULT(MultiSymbolReadOutput) std::vector versioned_items_; + std::vector metadatas_; FrameAndDescriptor frame_and_descriptor_; }; diff --git a/cpp/arcticdb/version/version_store_api.cpp b/cpp/arcticdb/version/version_store_api.cpp index 283cfe7cba..6ed2ad7b96 100644 --- a/cpp/arcticdb/version/version_store_api.cpp +++ b/cpp/arcticdb/version/version_store_api.cpp @@ -795,7 +795,11 @@ ReadResult PythonVersionStore::batch_read_with_join( std::vector>&& clauses, std::any& handler_data) { auto versions_and_frame = 
batch_read_with_join_internal(stream_ids, version_queries, read_queries, read_options, std::move(clauses), handler_data); - return create_python_read_result(versions_and_frame.versioned_items_, std::move(versions_and_frame.frame_and_descriptor_)); + return create_python_read_result( + versions_and_frame.versioned_items_, + std::move(versions_and_frame.frame_and_descriptor_), + std::move(versions_and_frame.metadatas_) + ); } void PythonVersionStore::delete_snapshot(const SnapshotId& snap_name) { diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py index 0f488a1343..d1ef308c22 100644 --- a/python/arcticdb/version_store/_store.py +++ b/python/arcticdb/version_store/_store.py @@ -2103,8 +2103,6 @@ def _get_index_columns_from_descriptor(descriptor): def _adapt_read_res(self, read_result: ReadResult) -> Union[VersionedItem, VersionedItemWithJoin]: frame_data = FrameData.from_cpp(read_result.frame_data) - # TODO: Return metadata for all symbol/version pairs as well - meta = denormalize_user_metadata(read_result.udm, self._normalizer) data = self._normalizer.denormalize(frame_data, read_result.norm) if read_result.norm.HasField("custom"): data = self._custom_normalizer.denormalize(data, read_result.norm.custom) @@ -2118,7 +2116,7 @@ def _adapt_read_res(self, read_result: ReadResult) -> Union[VersionedItem, Versi library=self._library.library_path, data=None, version=read_result.version[idx].version, - metadata=meta, + metadata=denormalize_user_metadata(read_result.udm[idx], self._normalizer), host=self.env, timestamp=read_result.version[idx].timestamp, ) @@ -2133,7 +2131,7 @@ def _adapt_read_res(self, read_result: ReadResult) -> Union[VersionedItem, Versi library=self._library.library_path, data=data, version=read_result.version.version, - metadata=meta, + metadata=denormalize_user_metadata(read_result.udm, self._normalizer), host=self.env, timestamp=read_result.version.timestamp, ) diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index df77e1a8cf..6fed202941 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -46,9 +46,9 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per }, index=index[7:] if index is not None else None, ) - lib.write("sym0", df_0) + lib.write("sym0", df_0, metadata=0) lib.write("sym1", df_1) - lib.write("sym2", df_2) + lib.write("sym2", df_2, metadata=2) received = concat(lib.read_batch(["sym0", "sym1", "sym2"], lazy=True)).collect() expected = pd.concat([df_0, df_1, df_2]) @@ -59,6 +59,7 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per assert version.symbol == f"sym{idx}" assert version.version == 0 assert version.data is None + assert version.metadata == (None if idx == 1 else idx)
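A short sketch of the per-symbol metadata round-trip this patch enables (again assuming a pre-populated library handle `lib` and placeholder DataFrames `df_0`/`df_1`; the metadata values are arbitrary examples):

lib.write("sym0", df_0, metadata={"source": "vendor_a"})
lib.write("sym1", df_1)  # written without metadata

joined = concat(lib.read_batch(["sym0", "sym1"], lazy=True)).collect()
# Each entry carries its own denormalized metadata rather than one shared value.
assert joined.versions[0].metadata == {"source": "vendor_a"}
assert joined.versions[1].metadata is None  # no metadata written for sym1

From 98b261f6fb396d54789430fb0b1a143082485667 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Thu, 30 Jan 2025 16:12:36 +0000 Subject: [PATCH 34/36] Refactor to not separate ConcatClause from other clauses --- cpp/arcticdb/version/python_bindings.cpp | 37 ++++++++++++--------- python/arcticdb/version_store/_store.py | 4 +-- python/arcticdb/version_store/library.py | 14 ++++---- python/arcticdb/version_store/processing.py | 14 ++++++++ 4 files changed, 44 insertions(+), 25 deletions(-) diff --git a/cpp/arcticdb/version/python_bindings.cpp 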
b/cpp/arcticdb/version/python_bindings.cpp index f097678504..b8f4dd2317 100644 --- a/cpp/arcticdb/version/python_bindings.cpp +++ b/cpp/arcticdb/version/python_bindings.cpp @@ -769,27 +769,34 @@ void register_bindings(py::module &version, py::exception& version_queries, std::vector>& read_queries, const ReadOptions& read_options, - ClauseVariant join, - std::vector post_join_clauses + std::vector clauses ){ - auto handler_data = TypeHandlerRegistry::instance()->get_handler_data(); - post_join_clauses = plan_query(std::move(post_join_clauses)); + user_input::check(!clauses.empty(), "batch_read_with_join called with no clauses"); + clauses = plan_query(std::move(clauses)); std::vector> _clauses; - util::variant_match( - join, - [&](auto&& clause) { - user_input::check( - clause->clause_info().multi_symbol_, - "Single-symbol clause cannot be used to join multiple symbols together"); - _clauses.emplace_back(std::make_shared(*clause)); - } - ); - for (auto&& clause: post_join_clauses) { + bool first_clause{true}; + for (auto&& clause: clauses) { util::variant_match( clause, - [&](auto&& clause) {_clauses.emplace_back(std::make_shared(*clause));} + [&](auto&& clause) { + if (first_clause) { + user_input::check( + clause->clause_info().multi_symbol_, + "Single-symbol clause cannot be used to join multiple symbols together"); + _clauses.emplace_back(std::make_shared(*clause)); + first_clause = false; + } else { + // TODO: Add this check to ReadQuery.add_clauses + user_input::check( + !clause->clause_info().multi_symbol_, + "Multi-symbol clause cannot be used on a single symbol"); + _clauses.emplace_back(std::make_shared(*clause)); + } + } ); } + auto handler_data = TypeHandlerRegistry::instance()->get_handler_data(); return adapt_read_df(v.batch_read_with_join(stream_ids, version_queries, read_queries, read_options, std::move(_clauses), handler_data)); }, py::call_guard(), "Read a dataframe from the store") diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py index d1ef308c22..5c5167b09d 100644 --- a/python/arcticdb/version_store/_store.py +++ b/python/arcticdb/version_store/_store.py @@ -1073,7 +1073,7 @@ def _batch_read_to_versioned_items( return versioned_items def _batch_read_with_join( - self, symbols, as_ofs, date_ranges, row_ranges, columns, per_symbol_query_builders, join, query_builder + self, symbols, as_ofs, date_ranges, row_ranges, columns, per_symbol_query_builders, query_builder ): implement_read_index = True if columns: @@ -1083,7 +1083,7 @@ per_symbol_query_builders = copy.deepcopy(per_symbol_query_builders) read_queries = self._get_read_queries(len(symbols), date_ranges, row_ranges, columns, per_symbol_query_builders) read_options = self._get_read_options(iterate_snapshots_if_tombstoned=False) - return self._adapt_read_res(ReadResult(*self.version_store.batch_read_with_join(symbols, version_queries, read_queries, read_options, join, query_builder.clauses))) + return self._adapt_read_res(ReadResult(*self.version_store.batch_read_with_join(symbols, version_queries, read_queries, read_options, query_builder.clauses))) diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py index 854d53358a..7b6e4753b6 100644 --- a/python/arcticdb/version_store/library.py +++ b/python/arcticdb/version_store/library.py @@ -22,7 +22,7 
diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py
index d1ef308c22..5c5167b09d 100644
--- a/python/arcticdb/version_store/_store.py
+++ b/python/arcticdb/version_store/_store.py
@@ -1073,7 +1073,7 @@ def _batch_read_to_versioned_items(
         return versioned_items
 
     def _batch_read_with_join(
-        self, symbols, as_ofs, date_ranges, row_ranges, columns, per_symbol_query_builders, join, query_builder
+        self, symbols, as_ofs, date_ranges, row_ranges, columns, per_symbol_query_builders, query_builder
     ):
         implement_read_index = True
         if columns:
@@ -1083,7 +1083,7 @@ def _batch_read_with_join(
         per_symbol_query_builders = copy.deepcopy(per_symbol_query_builders)
         read_queries = self._get_read_queries(len(symbols), date_ranges, row_ranges, columns, per_symbol_query_builders)
         read_options = self._get_read_options(iterate_snapshots_if_tombstoned=False)
-        return self._adapt_read_res(ReadResult(*self.version_store.batch_read_with_join(symbols, version_queries, read_queries, read_options, join, query_builder.clauses)))
+        return self._adapt_read_res(ReadResult(*self.version_store.batch_read_with_join(symbols, version_queries, read_queries, read_options, query_builder.clauses)))
diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py
index 854d53358a..7b6e4753b6 100644
--- a/python/arcticdb/version_store/library.py
+++ b/python/arcticdb/version_store/library.py
@@ -22,7 +22,7 @@
 from arcticdb.version_store.processing import ExpressionNode, QueryBuilder
 from arcticdb.version_store._store import NativeVersionStore, VersionedItem, VersionedItemWithJoin, VersionQueryInput
 from arcticdb_ext.exceptions import ArcticException
-from arcticdb_ext.version_store import DataError, ConcatClause as _ConcatClause
+from arcticdb_ext.version_store import DataError
 import pandas as pd
 import numpy as np
 import logging
@@ -482,22 +482,22 @@ class LazyDataFrameAfterJoin(QueryBuilder):
     def __init__(
         self,
         lazy_dataframes: LazyDataFrameCollection,
-        join: Any,
+        join: QueryBuilder,
     ):
         super().__init__()
         self._lazy_dataframes = lazy_dataframes
-        self._join = join
+        self.then(join)
 
     def collect(self) -> VersionedItemWithJoin:
         if not len(self._lazy_dataframes._lazy_dataframes):
             return []
         else:
             lib = self._lazy_dataframes._lib
-            return lib._read_batch_with_join(self._lazy_dataframes._read_requests(), self._join, self)
+            return lib._read_batch_with_join(self._lazy_dataframes._read_requests(), self)
 
     def __str__(self) -> str:
         query_builder_repr = super().__str__()
-        return f"LazyDataFrameAfterJoin({self._join}({self._lazy_dataframes._lazy_dataframes}){' | ' if len(query_builder_repr) else ''}{query_builder_repr})"
+        return f"LazyDataFrameAfterJoin({self._lazy_dataframes._lazy_dataframes} | {query_builder_repr})"
 
     def __repr__(self) -> str:
         return self.__str__()
@@ -506,7 +506,7 @@ def __repr__(self) -> str:
 def concat(lazy_dataframes: Union[List[LazyDataFrame], LazyDataFrameCollection]) -> LazyDataFrameAfterJoin:
     if not isinstance(lazy_dataframes, LazyDataFrameCollection):
         lazy_dataframes = LazyDataFrameCollection(lazy_dataframes)
-    return LazyDataFrameAfterJoin(lazy_dataframes, _ConcatClause())
+    return LazyDataFrameAfterJoin(lazy_dataframes, QueryBuilder().concat())
@@ -1677,7 +1677,6 @@ def handle_symbol(s_):
     def _read_batch_with_join(
         self,
         read_requests: List[ReadRequest],
-        join: Any,
         query_builder: Optional[QueryBuilder] = None,
     ) -> VersionedItemWithJoin:
         symbol_strings = []
@@ -1702,7 +1701,6 @@ def _read_batch_with_join(
             row_ranges,
             columns,
             per_symbol_query_builders,
-            join,
             query_builder,
         )
diff --git a/python/arcticdb/version_store/processing.py b/python/arcticdb/version_store/processing.py
index f58944376f..993e3d9929 100644
--- a/python/arcticdb/version_store/processing.py
+++ b/python/arcticdb/version_store/processing.py
@@ -33,6 +33,7 @@
 from arcticdb_ext.version_store import ResampleBoundary as _ResampleBoundary
 from arcticdb_ext.version_store import RowRangeClause as _RowRangeClause
 from arcticdb_ext.version_store import DateRangeClause as _DateRangeClause
+from arcticdb_ext.version_store import ConcatClause as _ConcatClause
 from arcticdb_ext.version_store import RowRangeType as _RowRangeType
 from arcticdb_ext.version_store import ExpressionName as _ExpressionName
 from arcticdb_ext.version_store import ColumnName as _ColumnName
@@ -323,6 +324,12 @@ class PythonResampleClause:
     origin: Union[str, pd.Timestamp] = "epoch"
 
 
+# TODO: Test pickling of this
+@dataclass
+class PythonConcatClause:
+    pass
+
+
 class QueryBuilder:
     """
     Build a query to process read results with. Syntax is designed to be similar to Pandas:
@@ -904,6 +911,11 @@ def date_range(self, date_range: DateRangeInput):
         self._python_clauses = self._python_clauses + [PythonDateRangeClause(start.value, end.value)]
         return self
 
+    def concat(self):
+        self.clauses = self.clauses + [_ConcatClause()]
+        self._python_clauses = self._python_clauses + [PythonConcatClause()]
+        return self
+
     def __eq__(self, right):
         if not isinstance(right, QueryBuilder):
             return False
@@ -965,6 +977,8 @@ def __setstate__(self, state):
                 self.clauses = self.clauses + [_RowRangeClause(python_clause.row_range_type, python_clause.n)]
             elif isinstance(python_clause, PythonDateRangeClause):
                 self.clauses = self.clauses + [_DateRangeClause(python_clause.start, python_clause.end)]
+            elif isinstance(python_clause, PythonConcatClause):
+                self.clauses = self.clauses + [_ConcatClause()]
             else:
                 raise ArcticNativeException(
                     f"Unrecognised clause type {type(python_clause)} when unpickling QueryBuilder"
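PythonConcatClause exists purely so that a QueryBuilder containing a concat step survives pickling: __getstate__ keeps the picklable Python clause records, and the __setstate__ branch above rebuilds the C++ _ConcatClause from them. The round trip the TODO asks to be tested would look roughly like this (a sketch, assuming QueryBuilder equality is structural):

    import pickle
    from arcticdb.version_store.processing import QueryBuilder

    q = QueryBuilder().concat()
    q2 = pickle.loads(pickle.dumps(q))  # __setstate__ re-creates _ConcatClause()
    assert q == q2
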
From 7722ebbf246d128d708c3fecb0d2d5612fd8412b Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Thu, 30 Jan 2025 16:33:50 +0000
Subject: [PATCH 35/36] Add QueryBuilder syntax

---
 python/arcticdb/version_store/_store.py             |  8 +++-
 python/arcticdb/version_store/library.py            |  4 +-
 .../version_store/test_symbol_concatenation.py      | 43 +++++++++++++++++++
 3 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py
index 5c5167b09d..777a610b1c 100644
--- a/python/arcticdb/version_store/_store.py
+++ b/python/arcticdb/version_store/_store.py
@@ -1081,7 +1081,9 @@ def _batch_read_with_join(
         version_queries = self._get_version_queries(len(symbols), as_ofs, iterate_snapshots_if_tombstoned=False)
         # Take a copy as _get_read_queries can modify the input argument, which makes reusing the input counter-intuitive
         per_symbol_query_builders = copy.deepcopy(per_symbol_query_builders)
-        read_queries = self._get_read_queries(len(symbols), date_ranges, row_ranges, columns, per_symbol_query_builders)
+        # TODO: Less hacky way for date ranges (and most likely row ranges) as part of ReadRequests to work
+        force_ranges_to_queries = True
+        read_queries = self._get_read_queries(len(symbols), date_ranges, row_ranges, columns, per_symbol_query_builders, force_ranges_to_queries)
         read_options = self._get_read_options(iterate_snapshots_if_tombstoned=False)
         return self._adapt_read_res(ReadResult(*self.version_store.batch_read_with_join(symbols, version_queries, read_queries, read_options, query_builder.clauses)))
 
@@ -1621,6 +1623,7 @@ def _get_read_queries(
         row_ranges: Optional[List[Optional[Tuple[int, int]]]],
         columns: Optional[List[List[str]]],
         query_builder: Optional[Union[QueryBuilder, List[QueryBuilder]]],
+        force_ranges_to_queries: bool = False,
     ):
         read_queries = []
 
@@ -1667,6 +1670,9 @@ def _get_read_queries(
             if query_builder is not None:
                 query = copy.deepcopy(query_builder) if isinstance(query_builder, QueryBuilder) else query_builder[idx]
 
+            if query is None and force_ranges_to_queries:
+                query = QueryBuilder()
+
             read_query = self._get_read_query(
                 date_range=date_range,
                 row_range=row_range,
diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py
index 7b6e4753b6..c3378ea702 100644
--- a/python/arcticdb/version_store/library.py
+++ b/python/arcticdb/version_store/library.py
@@ -493,7 +493,7 @@ def collect(self) -> VersionedItemWithJoin:
             return []
         else:
             lib = self._lazy_dataframes._lib
-            return lib._read_batch_with_join(self._lazy_dataframes._read_requests(), self)
+            return lib.read_batch_with_join(self._lazy_dataframes._read_requests(), self)
 
     def __str__(self) -> str:
         query_builder_repr = super().__str__()
@@ -1674,7 +1674,7 @@ def handle_symbol(s_):
             iterate_snapshots_if_tombstoned=False,
         )
 
-    def _read_batch_with_join(
+    def read_batch_with_join(
         self,
         read_requests: List[ReadRequest],
         query_builder: Optional[QueryBuilder] = None,
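force_ranges_to_queries works around the joined read path only consuming clauses: without it, a date_range (or row_range) passed directly on a ReadRequest has no QueryBuilder to attach to and would be dropped, so an empty one is created for it. In effect, the two requests below should be handled identically by read_batch_with_join (a sketch; the symbol name is illustrative):

    import pandas as pd
    from arcticdb.version_store.library import ReadRequest
    from arcticdb.version_store.processing import QueryBuilder

    r1 = ReadRequest("sym", date_range=(pd.Timestamp(0), pd.Timestamp(9000)))
    r2 = ReadRequest(
        "sym",
        query_builder=QueryBuilder().date_range((pd.Timestamp(0), pd.Timestamp(9000))),
    )
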
diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
index 6fed202941..7ec3954f00 100644
--- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
+++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
@@ -191,6 +191,49 @@ def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_p
     assert_frame_equal(expected, received)
 
 
+def test_symbol_concat_querybuilder_syntax(lmdb_library):
+    lib = lmdb_library
+    df_0 = pd.DataFrame(
+        {
+            "col1": np.arange(3, dtype=np.int64),
+            "col2": np.arange(100, 103, dtype=np.int64),
+            "col3": np.arange(1000, 1003, dtype=np.int64),
+        },
+        index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3),
+    )
+    df_1 = pd.DataFrame(
+        {
+            "col1": np.arange(4, dtype=np.int64),
+            "col2": np.arange(200, 204, dtype=np.int64),
+            "col3": np.arange(2000, 2004, dtype=np.int64),
+        },
+        index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4),
+    )
+    df_2 = pd.DataFrame(
+        {
+            "col1": np.arange(5, dtype=np.int64),
+            "col2": np.arange(300, 305, dtype=np.int64),
+            "col3": np.arange(3000, 3005, dtype=np.int64),
+        },
+        index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5),
+    )
+    lib.write("sym0", df_0)
+    lib.write("sym1", df_1)
+    lib.write("sym2", df_2)
+
+    read_request_0 = ReadRequest("sym0")
+    qb1 = QueryBuilder().date_range((pd.Timestamp(3000), None))
+    read_request_1 = ReadRequest("sym1", query_builder=qb1)
+    read_request_2 = ReadRequest("sym2", date_range=(None, pd.Timestamp(9000)))
+
+    q = QueryBuilder().concat().resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"})
+    received = lib.read_batch_with_join([read_request_0, read_request_1, read_request_2], query_builder=q).data
+
+    received = received.reindex(columns=sorted(received.columns))
+    expected = pd.concat([df_0, df_1[1:], df_2[:4]]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"})
+    assert_frame_equal(expected, received)
+
+
 @pytest.mark.parametrize("index", [None, [pd.Timestamp(0)]])
 def test_symbol_concat_symbols_with_different_columns(lmdb_library_factory, index):
     lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2))
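This test spells out explicitly what the lazy API builds internally: since collect() now routes through read_batch_with_join with the LazyDataFrameAfterJoin itself as the post-join QueryBuilder, the same pipeline is reachable via concat (a sketch; it ignores the per-symbol date filters used above, so the results would cover the full symbols):

    from arcticdb.version_store.library import concat

    joined = concat(lib.read_batch(["sym0", "sym1", "sym2"], lazy=True))
    joined = joined.resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"})
    received = joined.collect().data
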
From 831b36443bf468ffa069911509dd9121a7b0b4ab Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Thu, 30 Jan 2025 17:03:43 +0000
Subject: [PATCH 36/36] Unpin threadpool core counts

---
 cpp/arcticdb/async/task_scheduler.hpp | 6 +++---
 cpp/arcticdb/processing/clause.hpp    | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/cpp/arcticdb/async/task_scheduler.hpp b/cpp/arcticdb/async/task_scheduler.hpp
index 38c86d6a66..087bd9975e 100644
--- a/cpp/arcticdb/async/task_scheduler.hpp
+++ b/cpp/arcticdb/async/task_scheduler.hpp
@@ -179,10 +179,10 @@ class TaskScheduler {
     using CPUSchedulerType = folly::FutureExecutor<folly::CPUThreadPoolExecutor>;
     using IOSchedulerType = folly::FutureExecutor<folly::IOThreadPoolExecutor>;
 
-    explicit TaskScheduler(ARCTICDB_UNUSED const std::optional<size_t>& cpu_thread_count = std::nullopt, ARCTICDB_UNUSED const std::optional<size_t>& io_thread_count = std::nullopt) :
+    explicit TaskScheduler(const std::optional<size_t>& cpu_thread_count = std::nullopt, const std::optional<size_t>& io_thread_count = std::nullopt) :
         cgroup_folder_("/sys/fs/cgroup"),
-        cpu_thread_count_(1),
-        io_thread_count_(1),
+        cpu_thread_count_(cpu_thread_count ? *cpu_thread_count : ConfigsMap::instance()->get_int("VersionStore.NumCPUThreads", get_default_num_cpus(cgroup_folder_))),
+        io_thread_count_(io_thread_count ? *io_thread_count : ConfigsMap::instance()->get_int("VersionStore.NumIOThreads", (int) (cpu_thread_count_ * 1.5))),
         cpu_exec_(cpu_thread_count_, std::make_shared<InstrumentedNamedFactory>("CPUPool")) ,
         io_exec_(io_thread_count_, std::make_shared<InstrumentedNamedFactory>("IOPool")){
         util::check(cpu_thread_count_ > 0 && io_thread_count_ > 0, "Zero IO or CPU threads: {} {}", io_thread_count_, cpu_thread_count_);
diff --git a/cpp/arcticdb/processing/clause.hpp b/cpp/arcticdb/processing/clause.hpp
index 004cbedc0f..51f69b2640 100644
--- a/cpp/arcticdb/processing/clause.hpp
+++ b/cpp/arcticdb/processing/clause.hpp
@@ -679,6 +679,8 @@ struct ConcatClause {
     ARCTICDB_MOVE_COPY_DEFAULT(ConcatClause)
 
     [[nodiscard]] std::vector<std::vector<size_t>> structure_for_processing(ARCTICDB_UNUSED std::vector<RangesAndKey>& ranges_and_keys) {
+        // TODO: This isn't a long-term requirement; for simplicity, we currently force every input symbol through its
+        // own processing pipeline, even if it has no clauses
         internal::raise<ErrorCode::E_ASSERTION_FAILURE>("ConcatClause should never be first in the pipeline");
     }
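With the hard-coded single-threaded pools removed, pool sizes come from the constructor arguments when given, otherwise from the VersionStore.NumCPUThreads and VersionStore.NumIOThreads config values, with the IO pool defaulting to 1.5x the CPU pool. From Python, the knobs can be set before the scheduler is first constructed; a sketch, assuming the set_config_int helper exported by arcticdb_ext:

    from arcticdb_ext import set_config_int

    set_config_int("VersionStore.NumCPUThreads", 4)
    set_config_int("VersionStore.NumIOThreads", 6)  # otherwise roughly 1.5x the CPU count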