From e4d94e3c100a243c82091d1465a9d22448e1e448 Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Mon, 20 Jan 2025 11:56:46 +0000
Subject: [PATCH 01/36] Minimal class and methods defined

---
 cpp/arcticdb/processing/clause.cpp          | 24 ++++++++++++++
 cpp/arcticdb/processing/clause.hpp          | 27 +++++++++++++++
 cpp/arcticdb/version/python_bindings.cpp    |  4 +++
 python/arcticdb/version_store/processing.py | 12 +++++++
 .../test_symbol_concatenation.py            | 33 +++++++++++++++++++
 5 files changed, 100 insertions(+)
 create mode 100644 python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py

diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp
index a69d9a671a..c1c3f708a2 100644
--- a/cpp/arcticdb/processing/clause.cpp
+++ b/cpp/arcticdb/processing/clause.cpp
@@ -1217,4 +1217,28 @@ std::string DateRangeClause::to_string() const {
     return fmt::format("DATE RANGE {} - {}", start_, end_);
 }
 
+ConcatClause::ConcatClause() {
+
+}
+
+std::vector<std::vector<size_t>> ConcatClause::structure_for_processing(ARCTICDB_UNUSED std::vector<RangesAndKey>& ranges_and_keys) {
+    return {};
+}
+
+std::vector<std::vector<EntityId>> ConcatClause::structure_for_processing(ARCTICDB_UNUSED std::vector<std::vector<EntityId>>&& entity_ids_vec) {
+    return {};
+}
+
+std::vector<EntityId> ConcatClause::process(ARCTICDB_UNUSED std::vector<EntityId>&& entity_ids) const {
+    return {};
+}
+
+void ConcatClause::set_processing_config(ARCTICDB_UNUSED const ProcessingConfig& processing_config) {
+
+}
+
+std::string ConcatClause::to_string() const {
+    return "CONCAT";
+}
+
 }

diff --git a/cpp/arcticdb/processing/clause.hpp b/cpp/arcticdb/processing/clause.hpp
index 134780d8db..3a91557700 100644
--- a/cpp/arcticdb/processing/clause.hpp
+++ b/cpp/arcticdb/processing/clause.hpp
@@ -669,4 +669,31 @@ struct DateRangeClause {
     [[nodiscard]] std::string to_string() const;
 };
 
+struct ConcatClause {
+    ClauseInfo clause_info_;
+    std::shared_ptr<ComponentManager> component_manager_;
+
+    ConcatClause();
+
+    ARCTICDB_MOVE_COPY_DEFAULT(ConcatClause)
+
+    [[nodiscard]] std::vector<std::vector<size_t>> structure_for_processing(std::vector<RangesAndKey>& ranges_and_keys);
+
+    [[nodiscard]] std::vector<std::vector<EntityId>> structure_for_processing(std::vector<std::vector<EntityId>>&& entity_ids_vec);
+
+    [[nodiscard]] std::vector<EntityId> process(std::vector<EntityId>&& entity_ids) const;
+
+    [[nodiscard]] const ClauseInfo& clause_info() const {
+        return clause_info_;
+    }
+
+    void set_processing_config(const ProcessingConfig& processing_config);
+
+    void set_component_manager(std::shared_ptr<ComponentManager> component_manager) {
+        component_manager_ = component_manager;
+    }
+
+    [[nodiscard]] std::string to_string() const;
+};
+
 }//namespace arcticdb

diff --git a/cpp/arcticdb/version/python_bindings.cpp b/cpp/arcticdb/version/python_bindings.cpp
index b839159843..9a4451a5e8 100644
--- a/cpp/arcticdb/version/python_bindings.cpp
+++ b/cpp/arcticdb/version/python_bindings.cpp
@@ -402,6 +402,10 @@ void register_bindings(py::module &version, py::exception<arcticdb::ArcticException>& base_exception) {
 
+    py::class_<ConcatClause, std::shared_ptr<ConcatClause>>(version, "ConcatClause")
+        .def(py::init<>())
+        .def("__str__", &ConcatClause::to_string);
+
     py::class_<ReadQuery, std::shared_ptr<ReadQuery>>(version, "PythonVersionStoreReadQuery")
         .def(py::init())
         .def_readwrite("columns",&ReadQuery::columns)

diff --git a/python/arcticdb/version_store/processing.py b/python/arcticdb/version_store/processing.py
index f58944376f..6fc6527af1 100644
--- a/python/arcticdb/version_store/processing.py
+++ b/python/arcticdb/version_store/processing.py
@@ -33,6 +33,7 @@
 from arcticdb_ext.version_store import ResampleBoundary as _ResampleBoundary
 from arcticdb_ext.version_store import RowRangeClause as _RowRangeClause
 from arcticdb_ext.version_store import DateRangeClause as _DateRangeClause
+from arcticdb_ext.version_store import ConcatClause as _ConcatClause
 from arcticdb_ext.version_store import RowRangeType as _RowRangeType
 from arcticdb_ext.version_store import ExpressionName as _ExpressionName
 from arcticdb_ext.version_store import ColumnName as _ColumnName
@@ -311,6 +312,7 @@ class PythonRowRangeClause(NamedTuple):
     start: int = None
     end: int = None
 
+
 # Would be cleaner if all Python*Clause classes were dataclasses, but this is used for pickling, so hard to change now
 @dataclass
 class PythonResampleClause:
@@ -323,6 +325,11 @@ class PythonResampleClause:
     origin: Union[str, pd.Timestamp] = "epoch"
 
 
+@dataclass
+class PythonConcatClause:
+    pass
+
+
 class QueryBuilder:
     """
     Build a query to process read results with. Syntax is designed to be similar to Pandas:
@@ -904,6 +911,11 @@ def date_range(self, date_range: DateRangeInput):
         self._python_clauses = self._python_clauses + [PythonDateRangeClause(start.value, end.value)]
         return self
 
+    def concat(self):
+        self.clauses = self.clauses + [_ConcatClause()]
+        self._python_clauses = self._python_clauses + [PythonConcatClause()]
+        return self
+
     def __eq__(self, right):
         if not isinstance(right, QueryBuilder):
             return False

diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
new file mode 100644
index 0000000000..0273b2b215
--- /dev/null
+++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
@@ -0,0 +1,33 @@
+"""
+Copyright 2025 Man Group Operations Limited
+
+Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
+
+As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
+"""
+import numpy as np
+import pandas as pd
+import pytest
+
+from arcticdb import col, LazyDataFrame, LazyDataFrameCollection, QueryBuilder, ReadRequest
+from arcticdb.util.test import assert_frame_equal
+
+
+pytestmark = pytest.mark.pipeline
+
+
+def test_symbol_concat_basic(lmdb_library):
+    lib = lmdb_library
+    df_1 = pd.DataFrame({"col1": np.arange(2, dtype=np.int64)}, index=pd.date_range("2025-01-01", periods=2))
+    df_2 = pd.DataFrame({"col1": np.arange(3, dtype=np.int64)}, index=pd.date_range("2025-02-01", periods=3))
+    df_3 = pd.DataFrame({"col1": np.arange(4, dtype=np.int64)}, index=pd.date_range("2025-03-01", periods=4))
+    lib.write("sym1", df_1)
+    lib.write("sym2", df_2)
+    lib.write("sym3", df_3)
+
+    q = QueryBuilder()
+    q = q.concat()
+    received = lib.read_batch(["sym2", "sym3", "sym1"], query_builder=q).data
+    expected = pd.concat([df_1, df_2, df_3])
+    assert_frame_equal(expected, received.data)
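The new test pins down the API shape being trialled in this first commit: concat() as a QueryBuilder clause applied across a batch read. A condensed sketch of that intended usage — illustrative only, since every C++ method above still returns an empty result, and the lmdb URI here is hypothetical:

import numpy as np
import pandas as pd
from arcticdb import Arctic, QueryBuilder

ac = Arctic("lmdb://tmp/concat_demo")  # hypothetical local library for illustration
lib = ac.get_library("demo", create_if_missing=True)
lib.write("sym1", pd.DataFrame({"col1": np.arange(2)}, index=pd.date_range("2025-01-01", periods=2)))
lib.write("sym2", pd.DataFrame({"col1": np.arange(3)}, index=pd.date_range("2025-02-01", periods=3)))

q = QueryBuilder().concat()  # appends a ConcatClause spanning the whole batch
results = lib.read_batch(["sym1", "sym2"], query_builder=q)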
+""" +import numpy as np +import pandas as pd +import pytest + +from arcticdb import col, LazyDataFrame, LazyDataFrameCollection, QueryBuilder, ReadRequest +from arcticdb.util.test import assert_frame_equal + + +pytestmark = pytest.mark.pipeline + + +def test_symbol_concat_basic(lmdb_library): + lib = lmdb_library + df_1 = pd.DataFrame({"col1": np.arange(2, dtype=np.int64)}, index=pd.date_range("2025-01-01", periods=2)) + df_2 = pd.DataFrame({"col1": np.arange(3, dtype=np.int64)}, index=pd.date_range("2025-02-01", periods=3)) + df_3 = pd.DataFrame({"col1": np.arange(4, dtype=np.int64)}, index=pd.date_range("2025-03-01", periods=4)) + lib.write("sym1", df_1) + lib.write("sym2", df_2) + lib.write("sym3", df_3) + + q = QueryBuilder() + q = q.concat() + received = lib.read_batch(["sym2", "sym3", "sym1"], query_builder=q).data + expected = pd.concat([df_1, df_2, df_3]) + assert_frame_equal(expected, received.data) + From cc896941f994e1ef99b510352a0f783a41875fce Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 20 Jan 2025 15:19:53 +0000 Subject: [PATCH 02/36] Remove C++ changes --- cpp/arcticdb/processing/clause.cpp | 24 --------------------- cpp/arcticdb/processing/clause.hpp | 27 ------------------------ cpp/arcticdb/version/python_bindings.cpp | 4 ---- 3 files changed, 55 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index c1c3f708a2..a69d9a671a 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -1217,28 +1217,4 @@ std::string DateRangeClause::to_string() const { return fmt::format("DATE RANGE {} - {}", start_, end_); } -ConcatClause::ConcatClause() { - -} - -std::vector> ConcatClause::structure_for_processing(ARCTICDB_UNUSED std::vector& ranges_and_keys) { - return {}; -} - -std::vector> ConcatClause::structure_for_processing(ARCTICDB_UNUSED std::vector>&& entity_ids_vec) { - return {}; -} - -std::vector ConcatClause::process(ARCTICDB_UNUSED std::vector&& entity_ids) const { - return {}; -} - -void ConcatClause::set_processing_config(ARCTICDB_UNUSED const ProcessingConfig& processing_config) { - -} - -std::string ConcatClause::to_string() const { - return "CONCAT"; -} - } diff --git a/cpp/arcticdb/processing/clause.hpp b/cpp/arcticdb/processing/clause.hpp index 3a91557700..134780d8db 100644 --- a/cpp/arcticdb/processing/clause.hpp +++ b/cpp/arcticdb/processing/clause.hpp @@ -669,31 +669,4 @@ struct DateRangeClause { [[nodiscard]] std::string to_string() const; }; -struct ConcatClause { - ClauseInfo clause_info_; - std::shared_ptr component_manager_; - - ConcatClause(); - - ARCTICDB_MOVE_COPY_DEFAULT(ConcatClause) - - [[nodiscard]] std::vector> structure_for_processing(std::vector& ranges_and_keys); - - [[nodiscard]] std::vector> structure_for_processing(std::vector>&& entity_ids_vec); - - [[nodiscard]] std::vector process(std::vector&& entity_ids) const; - - [[nodiscard]] const ClauseInfo& clause_info() const { - return clause_info_; - } - - void set_processing_config(const ProcessingConfig& processing_config); - - void set_component_manager(std::shared_ptr component_manager) { - component_manager_ = component_manager; - } - - [[nodiscard]] std::string to_string() const; -}; - }//namespace arcticdb diff --git a/cpp/arcticdb/version/python_bindings.cpp b/cpp/arcticdb/version/python_bindings.cpp index 9a4451a5e8..b839159843 100644 --- a/cpp/arcticdb/version/python_bindings.cpp +++ b/cpp/arcticdb/version/python_bindings.cpp @@ -402,10 +402,6 @@ void register_bindings(py::module &version, 
py::exception>(version, "ConcatClause") - .def(py::init<>()) - .def("__str__", &ConcatClause::to_string); - py::class_>(version, "PythonVersionStoreReadQuery") .def(py::init()) .def_readwrite("columns",&ReadQuery::columns) From b5336227cddec7b863889b9c32f75c19fdc06fc0 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 20 Jan 2025 15:24:23 +0000 Subject: [PATCH 03/36] Reworked Python layer --- python/arcticdb/__init__.py | 1 + python/arcticdb/version_store/library.py | 25 +++++++++++++++++++ python/arcticdb/version_store/processing.py | 11 -------- .../test_symbol_concatenation.py | 25 ++++++++++++------- 4 files changed, 42 insertions(+), 20 deletions(-) diff --git a/python/arcticdb/__init__.py b/python/arcticdb/__init__.py index 2a1f78fc78..da7472be27 100644 --- a/python/arcticdb/__init__.py +++ b/python/arcticdb/__init__.py @@ -17,6 +17,7 @@ col, LazyDataFrame, LazyDataFrameCollection, + concat, StagedDataFinalizeMethod, WriteMetadataPayload ) diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py index ddb070f09b..8d12db7187 100644 --- a/python/arcticdb/version_store/library.py +++ b/python/arcticdb/version_store/library.py @@ -478,6 +478,31 @@ def __repr__(self) -> str: return self.__str__() +class LazyDataFrameAfterJoin(QueryBuilder): + def __init__( + self, + lazy_dataframes: LazyDataFrameCollection, + ): + super().__init__() + self._lazy_dataframes = lazy_dataframes + + def collect(self) -> VersionedItem: + pass + + def __str__(self) -> str: + query_builder_repr = super().__str__() + return f"LazyDataFrameAfterJoin(Concat({self._lazy_dataframes._lazy_dataframes}){' | ' if len(query_builder_repr) else ''}{query_builder_repr})" + + def __repr__(self) -> str: + return self.__str__() + + +def concat(lazy_dataframes: Union[List[LazyDataFrame], LazyDataFrameCollection]) -> LazyDataFrameAfterJoin: + if not isinstance(lazy_dataframes, LazyDataFrameCollection): + lazy_dataframes = LazyDataFrameCollection(lazy_dataframes) + return LazyDataFrameAfterJoin(lazy_dataframes) + + def col(name: str) -> ExpressionNode: """ Placeholder for referencing columns by name in lazy dataframe operations before the underlying object has been diff --git a/python/arcticdb/version_store/processing.py b/python/arcticdb/version_store/processing.py index 6fc6527af1..789d4f7378 100644 --- a/python/arcticdb/version_store/processing.py +++ b/python/arcticdb/version_store/processing.py @@ -33,7 +33,6 @@ from arcticdb_ext.version_store import ResampleBoundary as _ResampleBoundary from arcticdb_ext.version_store import RowRangeClause as _RowRangeClause from arcticdb_ext.version_store import DateRangeClause as _DateRangeClause -from arcticdb_ext.version_store import ConcatClause as _ConcatClause from arcticdb_ext.version_store import RowRangeType as _RowRangeType from arcticdb_ext.version_store import ExpressionName as _ExpressionName from arcticdb_ext.version_store import ColumnName as _ColumnName @@ -325,11 +324,6 @@ class PythonResampleClause: origin: Union[str, pd.Timestamp] = "epoch" -@dataclass -class PythonConcatClause: - pass - - class QueryBuilder: """ Build a query to process read results with. 
From 28eda0882f46d8f9f23b2b3645574cf3147fc698 Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Mon, 20 Jan 2025 15:25:16 +0000
Subject: [PATCH 04/36] Revert whitespace change

---
 python/arcticdb/version_store/processing.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/arcticdb/version_store/processing.py b/python/arcticdb/version_store/processing.py
index 789d4f7378..f58944376f 100644
--- a/python/arcticdb/version_store/processing.py
+++ b/python/arcticdb/version_store/processing.py
@@ -311,7 +311,6 @@ class PythonRowRangeClause(NamedTuple):
     start: int = None
     end: int = None
 
-
 # Would be cleaner if all Python*Clause classes were dataclasses, but this is used for pickling, so hard to change now
 @dataclass
 class PythonResampleClause:

From e71b5eb4ef76fc6bb74c2898ead28a1afb89b4a7 Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Mon, 20 Jan 2025 16:09:20 +0000
Subject: [PATCH 05/36] Sketch out API for reading multiple symbols with a
 join

---
 python/arcticdb/version_store/_store.py  | 13 ++++++++
 python/arcticdb/version_store/library.py | 42 ++++++++++++++++++++++--
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py
index 4a45c24fb7..32a32360a5 100644
--- a/python/arcticdb/version_store/_store.py
+++ b/python/arcticdb/version_store/_store.py
@@ -1065,6 +1065,19 @@ def _batch_read_to_versioned_items(
             versioned_items.append(vitem)
         return versioned_items
 
+    def _batch_read_with_join(
+            self, symbols, as_ofs, date_ranges, row_ranges, columns, per_symbol_query_builders, join, query_builder
+    ):
+        implement_read_index = True
+        if columns:
+            columns = [self._resolve_empty_columns(c, implement_read_index) for c in columns]
+        version_queries = self._get_version_queries(len(symbols), as_ofs, iterate_snapshots_if_tombstoned=False)
+        # Take a copy as _get_read_queries can modify the input argument, which makes reusing the input counter-intuitive
+        per_symbol_query_builders = copy.deepcopy(per_symbol_query_builders)
+        read_queries = self._get_read_queries(len(symbols), date_ranges, row_ranges, columns, per_symbol_query_builders)
+        read_options = self._get_read_options(iterate_snapshots_if_tombstoned=False)
+        return self._adapt_read_res(ReadResult(*self.version_store.batch_read_with_join(symbols, version_queries, read_queries, read_options, join, query_builder.clauses)))
+
     def batch_read_metadata(
         self, symbols: List[str], as_ofs: Optional[List[VersionQueryInput]] = None, **kwargs
     ) -> Dict[str, VersionedItem]:

diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py
index 8d12db7187..a2e79ccf7f 100644
--- a/python/arcticdb/version_store/library.py
+++ b/python/arcticdb/version_store/library.py
@@ -482,12 +482,18 @@ class LazyDataFrameAfterJoin(QueryBuilder):
     def __init__(
             self,
             lazy_dataframes: LazyDataFrameCollection,
+            join: Any,
     ):
         super().__init__()
         self._lazy_dataframes = lazy_dataframes
+        self._join = join
 
     def collect(self) -> VersionedItem:
-        pass
+        if not len(self._lazy_dataframes._lazy_dataframes):
+            return []
+        else:
+            lib = self._lazy_dataframes._lib
+            return lib._read_batch_with_join(self._lazy_dataframes._read_requests(), self._join, self)
 
     def __str__(self) -> str:
         query_builder_repr = super().__str__()
@@ -500,7 +506,7 @@ def __repr__(self) -> str:
 def concat(lazy_dataframes: Union[List[LazyDataFrame], LazyDataFrameCollection]) -> LazyDataFrameAfterJoin:
     if not isinstance(lazy_dataframes, LazyDataFrameCollection):
         lazy_dataframes = LazyDataFrameCollection(lazy_dataframes)
-    return LazyDataFrameAfterJoin(lazy_dataframes)
+    return LazyDataFrameAfterJoin(lazy_dataframes, "concat")
@@ -1668,6 +1674,38 @@ def handle_symbol(s_):
             iterate_snapshots_if_tombstoned=False,
         )
 
+    def _read_batch_with_join(
+        self,
+        read_requests: List[ReadRequest],
+        join: Any,
+        query_builder: Optional[QueryBuilder] = None,
+    ) -> VersionedItem:
+        symbol_strings = []
+        as_ofs = []
+        date_ranges = []
+        row_ranges = []
+        columns = []
+        per_symbol_query_builders = []
+
+        for r in read_requests:
+            symbol_strings.append(r.symbol)
+            as_ofs.append(r.as_of)
+            date_ranges.append(r.date_range)
+            row_ranges.append(r.row_range)
+            columns.append(r.columns)
+            per_symbol_query_builders.append(r.query_builder)
+
+        return self._nvs._batch_read_with_join(
+            symbol_strings,
+            as_ofs,
+            date_ranges,
+            row_ranges,
+            columns,
+            per_symbol_query_builders,
+            join,
+            query_builder,
+        )
+
     def read_metadata(self, symbol: str, as_of: Optional[AsOf] = None) -> VersionedItem:
         """
         Return the metadata saved for a symbol. This method is faster than read as it only loads the metadata, not the
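The Library-level helper does nothing more than restructure its inputs: a list of per-symbol read requests is unzipped into the parallel per-field lists that _batch_read_with_join expects, one entry per symbol in each list. A toy restatement of that unzipping, where Req is a stand-in for ReadRequest:

from collections import namedtuple

Req = namedtuple("Req", "symbol as_of date_range row_range columns query_builder")

read_requests = [
    Req("sym1", None, None, None, None, None),
    Req("sym2", 3, None, None, ["col"], None),
]
symbols, as_ofs, date_ranges, row_ranges, columns, qbs = (list(t) for t in zip(*read_requests))
assert symbols == ["sym1", "sym2"]
assert columns == [None, ["col"]]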
From 08126c3829b701ece24edd2eb263a4d8f0fa852b Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Tue, 21 Jan 2025 16:37:08 +0000
Subject: [PATCH 06/36] Wiring down to local_versioned_engine.cpp

---
 cpp/arcticdb/processing/query_planner.hpp       |  1 +
 cpp/arcticdb/version/local_versioned_engine.cpp | 11 +++++++++++
 cpp/arcticdb/version/local_versioned_engine.hpp | 10 ++++++++++
 cpp/arcticdb/version/python_bindings.cpp        | 13 +++++++++++++
 cpp/arcticdb/version/version_core.hpp           | 14 ++++++++++++++
 cpp/arcticdb/version/version_store_api.cpp      | 13 +++++++++++++
 cpp/arcticdb/version/version_store_api.hpp      |  9 +++++++++
 7 files changed, 71 insertions(+)

diff --git a/cpp/arcticdb/processing/query_planner.hpp b/cpp/arcticdb/processing/query_planner.hpp
index 99106654e1..925ce42133 100644
--- a/cpp/arcticdb/processing/query_planner.hpp
+++ b/cpp/arcticdb/processing/query_planner.hpp
@@ -14,6 +14,7 @@
 
 namespace arcticdb {
 
+// TODO: Move these somewhere else, they are generally useful
 using GroupByClause = PartitionClause;
 using ClauseVariant = std::variant<std::shared_ptr<FilterClause>,
                                    std::shared_ptr<ProjectClause>,

diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp
index 99c1a245af..6ef2121c36 100644
--- a/cpp/arcticdb/version/local_versioned_engine.cpp
+++ b/cpp/arcticdb/version/local_versioned_engine.cpp
@@ -1151,6 +1151,17 @@ std::vector<std::variant<ReadVersionOutput, DataError>> LocalVersionedEngine::batch_read_internal(
     return read_versions_or_errors;
 }
 
+MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
+    ARCTICDB_UNUSED const std::vector<StreamId>& stream_ids,
+    ARCTICDB_UNUSED const std::vector<VersionQuery>& version_queries,
+    ARCTICDB_UNUSED std::vector<std::shared_ptr<ReadQuery>>& read_queries,
+    ARCTICDB_UNUSED const ReadOptions& read_options,
+    ARCTICDB_UNUSED const std::string& join, // TODO: Make a Clause or MultiSymbolClause
+    ARCTICDB_UNUSED const std::vector<ClauseVariant>& post_join_clauses,
+    ARCTICDB_UNUSED std::any& handler_data) {
+    return {};
+}
+
 void LocalVersionedEngine::write_version_and_prune_previous(
     bool prune_previous_versions,
     const AtomKey& new_version,

diff --git a/cpp/arcticdb/version/local_versioned_engine.hpp b/cpp/arcticdb/version/local_versioned_engine.hpp
index da140ceec4..7dd892ee41 100644
--- a/cpp/arcticdb/version/local_versioned_engine.hpp
+++ b/cpp/arcticdb/version/local_versioned_engine.hpp
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -286,6 +287,15 @@ class LocalVersionedEngine : public VersionedEngine {
         const ReadOptions& read_options,
         std::any& handler_data);
 
+    MultiSymbolReadOutput batch_read_with_join_internal(
+        const std::vector<StreamId>& stream_ids,
+        const std::vector<VersionQuery>& version_queries,
+        std::vector<std::shared_ptr<ReadQuery>>& read_queries,
+        const ReadOptions& read_options,
+        const std::string& join, // TODO: Make a Clause or MultiSymbolClause
+        const std::vector<ClauseVariant>& post_join_clauses,
+        std::any& handler_data);
+
     std::vector<std::variant<DescriptorItem, DataError>> batch_read_descriptor_internal(
         const std::vector<StreamId>& stream_ids,
         const std::vector<VersionQuery>& version_queries,

diff --git a/cpp/arcticdb/version/python_bindings.cpp b/cpp/arcticdb/version/python_bindings.cpp
index b839159843..6d0ba15015 100644
--- a/cpp/arcticdb/version/python_bindings.cpp
+++ b/cpp/arcticdb/version/python_bindings.cpp
@@ -767,6 +767,19 @@ void register_bindings(py::module &version, py::exception<arcticdb::ArcticException>& base_exception) {
              py::call_guard<SingleThreadMutexHolder>(), "Read a dataframe from the store")
+        .def("batch_read_with_join",
+             [&](PythonVersionStore& v,
+                 const std::vector<StreamId> &stream_ids,
+                 const std::vector<VersionQuery>& version_queries,
+                 std::vector<std::shared_ptr<ReadQuery>>& read_queries,
+                 const ReadOptions& read_options,
+                 const std::string& join, // TODO: Make a Clause or MultiSymbolClause
+                 std::vector<ClauseVariant> post_join_clauses
+                 ){
+                 auto handler_data = TypeHandlerRegistry::instance()->get_handler_data();
+                 return adapt_read_df(v.batch_read_with_join(stream_ids, version_queries, read_queries, read_options, join, post_join_clauses, handler_data));
+             },
+             py::call_guard<SingleThreadMutexHolder>(), "Read a dataframe from the store")
         .def("batch_read_keys",
             [&](PythonVersionStore& v, std::vector<AtomKey> atom_keys) {
                 return python_util::adapt_read_dfs(frame_to_read_result(v.batch_read_keys(atom_keys)));

diff --git a/cpp/arcticdb/version/version_core.hpp b/cpp/arcticdb/version/version_core.hpp
index f9e0ae9294..09e8b49c4f 100644
--- a/cpp/arcticdb/version/version_core.hpp
+++ b/cpp/arcticdb/version/version_core.hpp
@@ -53,6 +53,20 @@ struct ReadVersionOutput {
     FrameAndDescriptor frame_and_descriptor_;
 };
 
+struct MultiSymbolReadOutput {
+    // TODO: delete this ctor
+//    MultiSymbolReadOutput() = delete;
+    MultiSymbolReadOutput() = default;
+    MultiSymbolReadOutput(std::vector<VersionedItem>&& versioned_items, FrameAndDescriptor&& frame_and_descriptor):
+        versioned_items_(std::move(versioned_items)),
+        frame_and_descriptor_(std::move(frame_and_descriptor)) {}
+
+    ARCTICDB_MOVE_ONLY_DEFAULT(MultiSymbolReadOutput)
+
+    std::vector<VersionedItem> versioned_items_;
+    FrameAndDescriptor frame_and_descriptor_;
+};
+
 VersionedItem write_dataframe_impl(
     const std::shared_ptr<Store>& store,
     VersionId version_id,

diff --git a/cpp/arcticdb/version/version_store_api.cpp b/cpp/arcticdb/version/version_store_api.cpp
index ab8c369371..93d00ee6ef 100644
--- a/cpp/arcticdb/version/version_store_api.cpp
+++ b/cpp/arcticdb/version/version_store_api.cpp
@@ -787,6 +787,19 @@ std::vector<std::variant<ReadResult, DataError>> PythonVersionStore::batch_read(
     return res;
 }
 
+ReadResult PythonVersionStore::batch_read_with_join(
+    const std::vector<StreamId>& stream_ids,
+    const std::vector<VersionQuery>& version_queries,
+    std::vector<std::shared_ptr<ReadQuery>>& read_queries,
+    const ReadOptions& read_options,
+    const std::string& join, // TODO: Make a Clause or MultiSymbolClause
+    const std::vector<ClauseVariant>& post_join_clauses,
+    std::any& handler_data) {
+    auto versions_and_frame = batch_read_with_join_internal(stream_ids, version_queries, read_queries, read_options, join, post_join_clauses, handler_data);
+    // TODO: Decide what to do with VersionedItem part of read result
+    return create_python_read_result({}, std::move(versions_and_frame.frame_and_descriptor_));
+}
+
 void PythonVersionStore::delete_snapshot(const SnapshotId& snap_name) {
     ARCTICDB_RUNTIME_DEBUG(log::version(), "Command: delete_snapshot");
     auto opt_snapshot = get_snapshot(store(), snap_name);

diff --git a/cpp/arcticdb/version/version_store_api.hpp b/cpp/arcticdb/version/version_store_api.hpp
index e8d3057e34..2f132c18bb 100644
--- a/cpp/arcticdb/version/version_store_api.hpp
+++ b/cpp/arcticdb/version/version_store_api.hpp
@@ -300,6 +300,15 @@ class PythonVersionStore : public LocalVersionedEngine {
         const ReadOptions& read_options,
         std::any& handler_data);
 
+    ReadResult batch_read_with_join(
+        const std::vector<StreamId>& stream_ids,
+        const std::vector<VersionQuery>& version_queries,
+        std::vector<std::shared_ptr<ReadQuery>>& read_queries,
+        const ReadOptions& read_options,
+        const std::string& join, // TODO: Make a Clause or MultiSymbolClause
+        const std::vector<ClauseVariant>& post_join_clauses,
+        std::any& handler_data);
+
     std::vector<std::variant<std::pair<VersionedItem, py::object>, DataError>> batch_read_metadata(
         const std::vector<StreamId>& stream_ids,
         const std::vector<VersionQuery>& version_queries,
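The new MultiSymbolReadOutput mirrors ReadVersionOutput, except that it carries one VersionedItem per input symbol alongside the single joined frame. A Python rendering of that intended shape — the field types here are illustrative stand-ins, and what to surface for the versioned-item half is still an open TODO in batch_read_with_join above:

from dataclasses import dataclass, field
from typing import Any, List

@dataclass
class MultiSymbolReadOutputSketch:
    # One entry per input symbol (what to expose to Python is the open TODO)
    versioned_items: List[Any] = field(default_factory=list)
    # The single joined frame plus its descriptor and buffers
    frame_and_descriptor: Any = None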
From d9107e5722b56f8e6bc51399161f5ae35f065d1d Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Tue, 21 Jan 2025 16:57:09 +0000
Subject: [PATCH 07/36] Use Poly Clause from VersionStoreApi level downwards

---
 cpp/arcticdb/processing/query_planner.hpp       |  1 -
 cpp/arcticdb/version/local_versioned_engine.cpp |  2 +-
 cpp/arcticdb/version/local_versioned_engine.hpp |  3 +--
 cpp/arcticdb/version/python_bindings.cpp        | 11 ++++++++++-
 cpp/arcticdb/version/version_store_api.cpp      |  4 ++--
 cpp/arcticdb/version/version_store_api.hpp      |  2 +-
 6 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/cpp/arcticdb/processing/query_planner.hpp b/cpp/arcticdb/processing/query_planner.hpp
index 925ce42133..99106654e1 100644
--- a/cpp/arcticdb/processing/query_planner.hpp
+++ b/cpp/arcticdb/processing/query_planner.hpp
@@ -14,7 +14,6 @@
 
 namespace arcticdb {
 
-// TODO: Move these somewhere else, they are generally useful
 using GroupByClause = PartitionClause;
 using ClauseVariant = std::variant<std::shared_ptr<FilterClause>,
                                    std::shared_ptr<ProjectClause>,

diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp
index 6ef2121c36..f2d592c6d6 100644
--- a/cpp/arcticdb/version/local_versioned_engine.cpp
+++ b/cpp/arcticdb/version/local_versioned_engine.cpp
@@ -1157,7 +1157,7 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
     ARCTICDB_UNUSED std::vector<std::shared_ptr<ReadQuery>>& read_queries,
     ARCTICDB_UNUSED const ReadOptions& read_options,
     ARCTICDB_UNUSED const std::string& join, // TODO: Make a Clause or MultiSymbolClause
-    ARCTICDB_UNUSED const std::vector<ClauseVariant>& post_join_clauses,
+    ARCTICDB_UNUSED std::vector<std::shared_ptr<Clause>>&& post_join_clauses,
     ARCTICDB_UNUSED std::any& handler_data) {
     return {};
 }

diff --git a/cpp/arcticdb/version/local_versioned_engine.hpp b/cpp/arcticdb/version/local_versioned_engine.hpp
index 7dd892ee41..fbc01ff899 100644
--- a/cpp/arcticdb/version/local_versioned_engine.hpp
+++ b/cpp/arcticdb/version/local_versioned_engine.hpp
@@ -17,7 +17,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -293,7 +292,7 @@ class LocalVersionedEngine : public VersionedEngine {
         std::vector<std::shared_ptr<ReadQuery>>& read_queries,
         const ReadOptions& read_options,
         const std::string& join, // TODO: Make a Clause or MultiSymbolClause
-        const std::vector<ClauseVariant>& post_join_clauses,
+        std::vector<std::shared_ptr<Clause>>&& post_join_clauses,
         std::any& handler_data);
 
     std::vector<std::variant<DescriptorItem, DataError>> batch_read_descriptor_internal(

diff --git a/cpp/arcticdb/version/python_bindings.cpp b/cpp/arcticdb/version/python_bindings.cpp
index 6d0ba15015..4742592718 100644
--- a/cpp/arcticdb/version/python_bindings.cpp
+++ b/cpp/arcticdb/version/python_bindings.cpp
@@ -777,7 +777,16 @@ void register_bindings(py::module &version, py::exception<arcticdb::ArcticException>& base_exception) {
                 std::vector<ClauseVariant> post_join_clauses
                 ){
                 auto handler_data = TypeHandlerRegistry::instance()->get_handler_data();
-                return adapt_read_df(v.batch_read_with_join(stream_ids, version_queries, read_queries, read_options, join, post_join_clauses, handler_data));
+                // TODO: Remove duplication with ReadQuery interface
+                post_join_clauses = plan_query(std::move(post_join_clauses));
+                std::vector<std::shared_ptr<Clause>> _clauses;
+                for (auto&& clause: post_join_clauses) {
+                    util::variant_match(
+                        clause,
+                        [&](auto&& clause) {_clauses.emplace_back(std::make_shared<Clause>(*clause));}
+                    );
+                }
+                return adapt_read_df(v.batch_read_with_join(stream_ids, version_queries, read_queries, read_options, join, std::move(_clauses), handler_data));

diff --git a/cpp/arcticdb/version/version_store_api.cpp b/cpp/arcticdb/version/version_store_api.cpp
index 93d00ee6ef..2be57afaf0 100644
--- a/cpp/arcticdb/version/version_store_api.cpp
+++ b/cpp/arcticdb/version/version_store_api.cpp
@@ -793,9 +793,9 @@ ReadResult PythonVersionStore::batch_read_with_join(
     std::vector<std::shared_ptr<ReadQuery>>& read_queries,
     const ReadOptions& read_options,
     const std::string& join, // TODO: Make a Clause or MultiSymbolClause
-    const std::vector<ClauseVariant>& post_join_clauses,
+    std::vector<std::shared_ptr<Clause>>&& post_join_clauses,
     std::any& handler_data) {
-    auto versions_and_frame = batch_read_with_join_internal(stream_ids, version_queries, read_queries, read_options, join, post_join_clauses, handler_data);
+    auto versions_and_frame = batch_read_with_join_internal(stream_ids, version_queries, read_queries, read_options, join, std::move(post_join_clauses), handler_data);
     // TODO: Decide what to do with VersionedItem part of read result
     return create_python_read_result({}, std::move(versions_and_frame.frame_and_descriptor_));
 }

diff --git a/cpp/arcticdb/version/version_store_api.hpp b/cpp/arcticdb/version/version_store_api.hpp
index 2f132c18bb..2a834f6eda 100644
--- a/cpp/arcticdb/version/version_store_api.hpp
+++ b/cpp/arcticdb/version/version_store_api.hpp
@@ -306,7 +306,7 @@ class PythonVersionStore : public LocalVersionedEngine {
         std::vector<std::shared_ptr<ReadQuery>>& read_queries,
         const ReadOptions& read_options,
         const std::string& join, // TODO: Make a Clause or MultiSymbolClause
-        const std::vector<ClauseVariant>& post_join_clauses,
+        std::vector<std::shared_ptr<Clause>>&& post_join_clauses,
         std::any& handler_data);
 
     std::vector<std::variant<std::pair<VersionedItem, py::object>, DataError>> batch_read_metadata(
From 634247a983de0ca89588c2c9e6db0a9149f224cf Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Tue, 21 Jan 2025 18:23:40 +0000
Subject: [PATCH 08/36] Batch read and get set of entity IDs on a single
 ComponentManager for all individual symbol queries (untested)

---
 cpp/arcticdb/async/task_scheduler.hpp           |  6 +-
 cpp/arcticdb/version/local_versioned_engine.cpp | 30 +++++++
 cpp/arcticdb/version/version_core.cpp           | 85 +++++++++++++++++++
 cpp/arcticdb/version/version_core.hpp           |  8 ++
 4 files changed, 126 insertions(+), 3 deletions(-)

diff --git a/cpp/arcticdb/async/task_scheduler.hpp b/cpp/arcticdb/async/task_scheduler.hpp
index 087bd9975e..38c86d6a66 100644
--- a/cpp/arcticdb/async/task_scheduler.hpp
+++ b/cpp/arcticdb/async/task_scheduler.hpp
@@ -179,10 +179,10 @@ class TaskScheduler {
     using CPUSchedulerType = folly::FutureExecutor<folly::CPUThreadPoolExecutor>;
     using IOSchedulerType = folly::FutureExecutor<folly::IOThreadPoolExecutor>;
 
-    explicit TaskScheduler(const std::optional& cpu_thread_count = std::nullopt, const std::optional& io_thread_count = std::nullopt) :
+    explicit TaskScheduler(ARCTICDB_UNUSED const std::optional& cpu_thread_count = std::nullopt, ARCTICDB_UNUSED const std::optional& io_thread_count = std::nullopt) :
         cgroup_folder_("/sys/fs/cgroup"),
-        cpu_thread_count_(cpu_thread_count ? *cpu_thread_count : ConfigsMap::instance()->get_int("VersionStore.NumCPUThreads", get_default_num_cpus(cgroup_folder_))),
-        io_thread_count_(io_thread_count ? *io_thread_count : ConfigsMap::instance()->get_int("VersionStore.NumIOThreads", (int) (cpu_thread_count_ * 1.5))),
+        cpu_thread_count_(1),
+        io_thread_count_(1),
         cpu_exec_(cpu_thread_count_, std::make_shared("CPUPool")) ,
         io_exec_(io_thread_count_, std::make_shared("IOPool")){
         util::check(cpu_thread_count_ > 0 && io_thread_count_ > 0, "Zero IO or CPU threads: {} {}", io_thread_count_, cpu_thread_count_);

diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp
index f2d592c6d6..8f74214ea9 100644
--- a/cpp/arcticdb/version/local_versioned_engine.cpp
+++ b/cpp/arcticdb/version/local_versioned_engine.cpp
@@ -1159,6 +1159,36 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
     ARCTICDB_UNUSED const std::string& join, // TODO: Make a Clause or MultiSymbolClause
     ARCTICDB_UNUSED std::vector<std::shared_ptr<Clause>>&& post_join_clauses,
     ARCTICDB_UNUSED std::any& handler_data) {
+    py::gil_scoped_release release_gil;
+    auto opt_index_key_futs = batch_get_versions_async(store(), version_map(), stream_ids, version_queries);
+    std::vector<folly::Future<std::vector<EntityId>>> symbol_entities_futs;
+    symbol_entities_futs.reserve(opt_index_key_futs.size());
+    auto component_manager = std::make_shared<ComponentManager>();
+    for (auto idx = 0UL; idx < opt_index_key_futs.size(); ++idx) {
+        symbol_entities_futs.emplace_back(
+            std::move(opt_index_key_futs[idx]).thenValue([store = store(),
+                                                          idx,
+                                                          &stream_ids,
+                                                          &version_queries,
+                                                          read_query = read_queries.empty() ? std::make_shared<ReadQuery>(): read_queries[idx],
+                                                          &read_options,
+                                                          &component_manager](auto&& opt_index_key) {
+                std::variant<VersionedItem, StreamId> version_info;
+                if (opt_index_key.has_value()) {
+                    version_info = VersionedItem(std::move(*opt_index_key));
+                } else {
+                    if (opt_false(read_options.incompletes_)) {
+                        log::version().warn("No index: Key not found for {}, will attempt to use incomplete segments.", stream_ids[idx]);
+                        version_info = stream_ids[idx];
+                    } else {
+                        missing_data::raise(
+                            "batch_read_internal: version matching query '{}' not found for symbol '{}'", version_queries[idx], stream_ids[idx]);
+                    }
+                }
+                return read_entity_ids_for_version(store, version_info, read_query, read_options, component_manager);
+            })
+        );
+    }
     return {};
 }

diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp
index e81118a16b..aa55c2485e 100644
--- a/cpp/arcticdb/version/version_core.cpp
+++ b/cpp/arcticdb/version/version_core.cpp
@@ -922,6 +922,43 @@ folly::Future<std::vector<EntityId>> read_and_process(
     });
 }
 
+folly::Future<std::vector<EntityId>> read_and_process_2(
+    const std::shared_ptr<Store>& store,
+    const std::shared_ptr<PipelineContext>& pipeline_context,
+    const std::shared_ptr<ReadQuery>& read_query,
+    const ReadOptions& read_options,
+    std::shared_ptr<ComponentManager> component_manager
+    ) {
+    ProcessingConfig processing_config{opt_false(read_options.dynamic_schema_), pipeline_context->rows_};
+    for (auto& clause: read_query->clauses_) {
+        clause->set_processing_config(processing_config);
+        clause->set_component_manager(component_manager);
+    }
+
+    auto ranges_and_keys = generate_ranges_and_keys(*pipeline_context);
+
+    // Each element of the vector corresponds to one processing unit containing the list of indexes in ranges_and_keys required for that processing unit
+    // i.e. if the first processing unit needs ranges_and_keys[0] and ranges_and_keys[1], and the second needs ranges_and_keys[2] and ranges_and_keys[3]
+    // then the structure will be {{0, 1}, {2, 3}}
+    std::vector<std::vector<size_t>> processing_unit_indexes;
+    if (!read_query->clauses_.empty()) {
+        processing_unit_indexes = read_query->clauses_[0]->structure_for_processing(
+                ranges_and_keys);
+    } else {
+        processing_unit_indexes = structure_by_row_slice(ranges_and_keys);
+    }
+
+    // Start reading as early as possible
+    auto segment_and_slice_futures = generate_segment_and_slice_futures(store, pipeline_context, processing_config, std::move(ranges_and_keys));
+
+    return schedule_clause_processing(
+        component_manager,
+        std::move(segment_and_slice_futures),
+        std::move(processing_unit_indexes),
+        std::make_shared<std::vector<std::shared_ptr<Clause>>>(read_query->clauses_))
+        .via(&async::cpu_executor());
+}
+
 void add_index_columns_to_query(const ReadQuery& read_query, const TimeseriesDescriptor& desc) {
     if (read_query.columns.has_value()) {
         auto index_columns = stream::get_index_columns_from_descriptor(desc);
@@ -1535,6 +1572,16 @@ folly::Future<SegmentInMemory> do_direct_read_or_process(
     }
 }
 
+folly::Future<std::vector<EntityId>> do_process(
+    const std::shared_ptr<Store>& store,
+    const std::shared_ptr<ReadQuery>& read_query,
+    const ReadOptions& read_options,
+    const std::shared_ptr<PipelineContext>& pipeline_context,
+    std::shared_ptr<ComponentManager> component_manager) {
+    util::check_rte(!pipeline_context->is_pickled(), "Cannot perform multi-symbol join on pickled data");
+    return read_and_process_2(store, pipeline_context, read_query, read_options, component_manager);
+}
+
 VersionedItem collate_and_write(
     const std::shared_ptr<Store>& store,
     const std::shared_ptr<PipelineContext>& pipeline_context,
@@ -2027,6 +2074,44 @@ folly::Future<ReadVersionOutput> read_frame_for_version(
         });
     });
 }
+
+folly::Future<std::vector<EntityId>> read_entity_ids_for_version(
+    const std::shared_ptr<Store>& store,
+    const std::variant<VersionedItem, StreamId>& version_info,
+    const std::shared_ptr<ReadQuery>& read_query,
+    const ReadOptions& read_options,
+    std::shared_ptr<ComponentManager> component_manager) {
+    using namespace arcticdb::pipelines;
+    auto pipeline_context = std::make_shared<PipelineContext>();
+    VersionedItem res_versioned_item;
+
+    if(std::holds_alternative<StreamId>(version_info)) {
+        pipeline_context->stream_id_ = std::get<StreamId>(version_info);
+    } else {
+        pipeline_context->stream_id_ = std::get<VersionedItem>(version_info).key_.id();
+        read_indexed_keys_to_pipeline(store, pipeline_context, std::get<VersionedItem>(version_info), *read_query, read_options);
+    }
+
+    user_input::check(!pipeline_context->multi_key_, "Multi-symbol joins not supported with recursively normalized data");
+
+    if(opt_false(read_options.incompletes_)) {
+        util::check(std::holds_alternative<IndexRange>(read_query->row_filter), "Streaming read requires date range filter");
+        const auto& query_range = std::get<IndexRange>(read_query->row_filter);
+        const auto existing_range = pipeline_context->index_range();
+        if(!existing_range.specified_ || query_range.end_ > existing_range.end_)
+            read_incompletes_to_pipeline(store, pipeline_context, *read_query, read_options, false, false, false, opt_false(read_options.dynamic_schema_));
+    }
+
+    if(std::holds_alternative<StreamId>(version_info) && !pipeline_context->incompletes_after_) {
+        return std::vector<EntityId>{};
+    }
+
+    modify_descriptor(pipeline_context, read_options);
+    generate_filtered_field_descriptors(pipeline_context, read_query->columns);
+    ARCTICDB_DEBUG(log::version(), "Fetching data to frame");
+
+    return version_store::do_process(store, read_query, read_options, pipeline_context, component_manager);
+}
 } //namespace arcticdb::version_store

diff --git a/cpp/arcticdb/version/version_core.hpp b/cpp/arcticdb/version/version_core.hpp
index 09e8b49c4f..6353213448 100644
--- a/cpp/arcticdb/version/version_core.hpp
+++ b/cpp/arcticdb/version/version_core.hpp
@@ -225,6 +225,14 @@ folly::Future<ReadVersionOutput> read_frame_for_version(
     std::any& handler_data
 );
 
+folly::Future<std::vector<EntityId>> read_entity_ids_for_version(
+    const std::shared_ptr<Store>& store,
+    const std::variant<VersionedItem, StreamId>& version_info,
+    const std::shared_ptr<ReadQuery>& read_query,
+    const ReadOptions& read_options,
+    std::shared_ptr<ComponentManager> component_manager
+);
+
 class DeleteIncompleteKeysOnExit {
   public:
     DeleteIncompleteKeysOnExit(

From 4aafe67fa140c310a561d1f57b0b14500805c35c Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Wed, 22 Jan 2025 10:06:41 +0000
Subject: [PATCH 09/36] Fix schedule_clause_processing when there are zero
 clauses

---
 cpp/arcticdb/version/local_versioned_engine.cpp | 1 +
 cpp/arcticdb/version/version_core.cpp           | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp
index 8f74214ea9..78cb56de1b 100644
--- a/cpp/arcticdb/version/local_versioned_engine.cpp
+++ b/cpp/arcticdb/version/local_versioned_engine.cpp
@@ -1189,6 +1189,7 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
             })
         );
     }
+    auto symbol_entities = folly::collect(symbol_entities_futs).get();
     return {};
 }

diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp
index aa55c2485e..f8473ff22b 100644
--- a/cpp/arcticdb/version/version_core.cpp
+++ b/cpp/arcticdb/version/version_core.cpp
@@ -528,6 +528,9 @@ void add_slice_to_component_manager(
 }
 
 size_t num_scheduling_iterations(const std::vector<std::shared_ptr<Clause>>& clauses) {
+    if (clauses.empty()) {
+        return 0;
+    }
     size_t res = 1UL;
     auto it = std::next(clauses.cbegin());
     while (it != clauses.cend()) {
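The zero-clause guard matters because the per-symbol pipelines built in batch_read_with_join_internal may carry no clauses at all (a plain read of each symbol before the join). A Python restatement of the fixed entry condition — the structure-comparison loop that computes the remaining count is elided here:

def num_scheduling_iterations(clauses) -> int:
    # Fixed degenerate case: no clauses means no scheduling iterations;
    # previously the count unconditionally started from 1.
    if not clauses:
        return 0
    iterations = 1
    # ... plus one further iteration whenever a clause needs the entities
    # restructured relative to its predecessor (C++ loop elided here).
    return iterations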
From ec189c7ad108b99bdf0bbee9b401ade9e08fdd76 Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Wed, 22 Jan 2025 13:52:28 +0000
Subject: [PATCH 10/36] Produce one SegmentInMemory from output of multiple
 symbols' pipelines

---
 cpp/arcticdb/version/local_versioned_engine.cpp | 16 +++++++++++++++-
 cpp/arcticdb/version/version_core.cpp           |  5 ++---
 cpp/arcticdb/version/version_core.hpp           |  7 +++++++
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp
index 78cb56de1b..62c1435f7f 100644
--- a/cpp/arcticdb/version/local_versioned_engine.cpp
+++ b/cpp/arcticdb/version/local_versioned_engine.cpp
@@ -1189,7 +1189,21 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
             })
         );
     }
-    auto symbol_entities = folly::collect(symbol_entities_futs).get();
+    ARCTICDB_UNUSED auto segment_in_memory = folly::collect(symbol_entities_futs)
+        .via(&async::cpu_executor())
+        .thenValue([](std::vector<std::vector<EntityId>>&& entity_id_vectors) {
+            return flatten_entities(std::move(entity_id_vectors));
+        }).thenValueInline([component_manager](std::vector<EntityId>&& processed_entity_ids) {
+            auto proc = gather_entities<std::shared_ptr<SegmentInMemory>, std::shared_ptr<RowRange>, std::shared_ptr<ColRange>>(*component_manager, std::move(processed_entity_ids));
+            // TODO: Handle output descriptors
+            return collect_segments(std::move(proc));
+        }).thenValueInline([store=store(), &handler_data](std::vector<SliceAndKey>&& slice_and_keys) {
+            auto pipeline_context = std::make_shared<PipelineContext>();
+            internal::check(!slice_and_keys.empty(), "No slice and keys in batch_read_with_join_internal");
+            pipeline_context->set_descriptor(slice_and_keys[0].segment(store).descriptor());
+            ReadOptions read_options;
+            return prepare_output_frame(std::move(slice_and_keys), pipeline_context, store, read_options, handler_data);
+        }).get();
     return {};
 }

diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp
index f8473ff22b..8d6af65072 100644
--- a/cpp/arcticdb/version/version_core.cpp
+++ b/cpp/arcticdb/version/version_core.cpp
@@ -1304,7 +1304,7 @@ void copy_frame_data_to_buffer(
 }
 
 struct CopyToBufferTask : async::BaseTask {
-    SegmentInMemory&& source_segment_;
+    SegmentInMemory source_segment_;
     SegmentInMemory target_segment_;
     FrameSlice frame_slice_;
     DecodePathData shared_data_;
@@ -1362,11 +1362,10 @@ folly::Future<folly::Unit> copy_segments_to_frame(
     DecodePathData shared_data;
     for (auto context_row : folly::enumerate(*pipeline_context)) {
         auto &slice_and_key = context_row->slice_and_key();
-        auto &segment = slice_and_key.segment(store);
 
         copy_tasks.emplace_back(async::submit_cpu_task(
             CopyToBufferTask{
-                std::move(segment),
+                slice_and_key.release_segment(store),
                 frame,
                 context_row->slice_and_key().slice(),
                 shared_data,

diff --git a/cpp/arcticdb/version/version_core.hpp b/cpp/arcticdb/version/version_core.hpp
index 6353213448..1e5894f556 100644
--- a/cpp/arcticdb/version/version_core.hpp
+++ b/cpp/arcticdb/version/version_core.hpp
@@ -261,6 +261,13 @@ std::optional<DeleteIncompleteKeysOnExit> get_delete_keys_on_failure(
     const std::shared_ptr<Store>& store,
     const CompactIncompleteOptions& options);
 
+folly::Future<SegmentInMemory> prepare_output_frame(
+    std::vector<SliceAndKey>&& items,
+    const std::shared_ptr<PipelineContext>& pipeline_context,
+    const std::shared_ptr<Store>& store,
+    const ReadOptions& read_options,
+    std::any& handler_data);
+
 } //namespace arcticdb::version_store
 
 namespace arcticdb {

From b5e10a0223a2805bfd7bbe2647b11a756ee09b9b Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Wed, 22 Jan 2025 14:49:37 +0000
Subject: [PATCH 11/36] First, most basic, test passing

---
 cpp/arcticdb/version/local_versioned_engine.cpp | 41 +++++++++++--------
 cpp/arcticdb/version/version_core.hpp           |  4 +-
 .../test_symbol_concatenation.py                |  6 +--
 3 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp
index 62c1435f7f..88e127003f 100644
--- a/cpp/arcticdb/version/local_versioned_engine.cpp
+++ b/cpp/arcticdb/version/local_versioned_engine.cpp
@@ -1164,6 +1164,9 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
     std::vector<folly::Future<std::vector<EntityId>>> symbol_entities_futs;
     symbol_entities_futs.reserve(opt_index_key_futs.size());
     auto component_manager = std::make_shared<ComponentManager>();
+    auto pipeline_context = std::make_shared<PipelineContext>();
+    auto norm_meta_mtx = std::make_shared<std::mutex>();
+    DecodePathData shared_data;
     for (auto idx = 0UL; idx < opt_index_key_futs.size(); ++idx) {
         symbol_entities_futs.emplace_back(
             std::move(opt_index_key_futs[idx]).thenValue([store = store(),
@@ -1172,24 +1175,22 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
                                                           idx,
-                                                          &stream_ids,
-                                                          &version_queries,
                                                           read_query = read_queries.empty() ? std::make_shared<ReadQuery>(): read_queries[idx],
                                                           &read_options,
-                                                          &component_manager](auto&& opt_index_key) {
+                                                          &component_manager,
+                                                          pipeline_context,
+                                                          norm_meta_mtx](auto&& opt_index_key) mutable {
                 std::variant<VersionedItem, StreamId> version_info;
-                if (opt_index_key.has_value()) {
-                    version_info = VersionedItem(std::move(*opt_index_key));
-                } else {
-                    if (opt_false(read_options.incompletes_)) {
-                        log::version().warn("No index: Key not found for {}, will attempt to use incomplete segments.", stream_ids[idx]);
-                        version_info = stream_ids[idx];
-                    } else {
-                        missing_data::raise(
-                            "batch_read_internal: version matching query '{}' not found for symbol '{}'", version_queries[idx], stream_ids[idx]);
-                    }
+                internal::check(opt_index_key.has_value(), "batch_read_with_join_internal not supported with non-indexed data");
+                auto index_key_seg = store->read_sync(*opt_index_key).second;
+                {
+                    std::lock_guard lock(*norm_meta_mtx);
+                    pipeline_context->norm_meta_ = std::make_unique(std::move(*index_key_seg.mutable_index_descriptor().mutable_proto().mutable_normalization()));
                 }
+                version_info = VersionedItem(std::move(*opt_index_key));
                 return read_entity_ids_for_version(store, version_info, read_query, read_options, component_manager);
             })
         );
     }
-    ARCTICDB_UNUSED auto segment_in_memory = folly::collect(symbol_entities_futs)
+    return folly::collect(symbol_entities_futs)
         .via(&async::cpu_executor())
         .thenValue([](std::vector<std::vector<EntityId>>&& entity_id_vectors) {
             return flatten_entities(std::move(entity_id_vectors));
         }).thenValueInline([component_manager](std::vector<EntityId>&& processed_entity_ids) {
             auto proc = gather_entities<std::shared_ptr<SegmentInMemory>, std::shared_ptr<RowRange>, std::shared_ptr<ColRange>>(*component_manager, std::move(processed_entity_ids));
             // TODO: Handle output descriptors
             return collect_segments(std::move(proc));
-        }).thenValueInline([store=store(), &handler_data](std::vector<SliceAndKey>&& slice_and_keys) {
-            auto pipeline_context = std::make_shared<PipelineContext>();
+        }).thenValueInline([store=store(), &handler_data, pipeline_context](std::vector<SliceAndKey>&& slice_and_keys) {
             internal::check(!slice_and_keys.empty(), "No slice and keys in batch_read_with_join_internal");
             pipeline_context->set_descriptor(slice_and_keys[0].segment(store).descriptor());
-            ReadOptions read_options;
-            return prepare_output_frame(std::move(slice_and_keys), pipeline_context, store, read_options, handler_data);
+            return prepare_output_frame(std::move(slice_and_keys), pipeline_context, store, ReadOptions{}, handler_data);
+        }).thenValueInline([&handler_data, pipeline_context, shared_data](SegmentInMemory&& frame) mutable {
+            return reduce_and_fix_columns(pipeline_context, frame, ReadOptions{}, handler_data)
+                .thenValue([pipeline_context, frame, shared_data](auto&&) mutable {
+                    return MultiSymbolReadOutput{{},
+                                                 {frame,
+                                                  timeseries_descriptor_from_pipeline_context(pipeline_context, {}, pipeline_context->bucketize_dynamic_),
+                                                  {},
+                                                  shared_data.buffers()}};
+                });
         }).get();
-    return {};
 }

diff --git a/cpp/arcticdb/version/version_core.hpp b/cpp/arcticdb/version/version_core.hpp
index 1e5894f556..8d5aaec183 100644
--- a/cpp/arcticdb/version/version_core.hpp
+++ b/cpp/arcticdb/version/version_core.hpp
@@ -54,9 +54,7 @@ struct ReadVersionOutput {
 };
 
 struct MultiSymbolReadOutput {
-    // TODO: delete this ctor
-//    MultiSymbolReadOutput() = delete;
-    MultiSymbolReadOutput() = default;
+    MultiSymbolReadOutput() = delete;
     MultiSymbolReadOutput(std::vector<VersionedItem>&& versioned_items, FrameAndDescriptor&& frame_and_descriptor):
         versioned_items_(std::move(versioned_items)),
         frame_and_descriptor_(std::move(frame_and_descriptor)) {}

diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
index 4c6de62042..57b541027c 100644
--- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
+++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
@@ -30,11 +30,11 @@ def test_symbol_concat_basic(lmdb_library):
     lazy_df_2 = lazy_df_2.date_range((pd.Timestamp("2025-01-04"), None))
     lazy_df_3 = lib.read("sym3", lazy=True)
 
-    lazy_df = concat([lazy_df_2, lazy_df_3, lazy_df_1])
+    lazy_df = concat([lazy_df_1, lazy_df_2, lazy_df_3])
 
-    lazy_df.resample("2D").agg({"col": "sum"})
+    # lazy_df.resample("2D").agg({"col": "sum"})
 
     received = lazy_df.collect().data
-    expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2D").agg({"col": "sum"})
+    expected = pd.concat([df_1, df_2.iloc[1:], df_3])  # .resample("2D").agg({"col": "sum"})
     assert_frame_equal(expected, received)
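With the basic path now working, the test's expectation can be reproduced in plain pandas: per-symbol clauses (the date_range on sym2) run first, then the frames are concatenated in the order passed to concat():

import numpy as np
import pandas as pd

df_1 = pd.DataFrame({"col": np.arange(3, dtype=np.int64)}, index=pd.date_range("2025-01-01", periods=3))
df_2 = pd.DataFrame({"col": np.arange(4, dtype=np.int64)}, index=pd.date_range("2025-01-03", periods=4))
df_3 = pd.DataFrame({"col": np.arange(5, dtype=np.int64)}, index=pd.date_range("2025-03-07", periods=5))

# date_range((2025-01-04, None)) keeps all but the first row of df_2
expected = pd.concat([df_1, df_2.iloc[1:], df_3])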
From 400facf0ab2bb30711095011aaf32d3330a3e491 Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Wed, 22 Jan 2025 16:22:55 +0000
Subject: [PATCH 12/36] Refactor schedule_remaining_iterations into own method

---
 cpp/arcticdb/version/local_versioned_engine.cpp | 16 +++++-----
 cpp/arcticdb/version/version_core.cpp           | 54 +++++++++---------
 .../test_symbol_concatenation.py                |  4 ++--
 3 files changed, 40 insertions(+), 34 deletions(-)

diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp
index 88e127003f..b9b2612cb1 100644
--- a/cpp/arcticdb/version/local_versioned_engine.cpp
+++ b/cpp/arcticdb/version/local_versioned_engine.cpp
@@ -1170,12 +1170,12 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
     for (auto idx = 0UL; idx < opt_index_key_futs.size(); ++idx) {
         symbol_entities_futs.emplace_back(
             std::move(opt_index_key_futs[idx]).thenValue([store = store(),
-                                                          idx,
-                                                          read_query = read_queries.empty() ? std::make_shared<ReadQuery>(): read_queries[idx],
-                                                          &read_options,
-                                                          &component_manager,
-                                                          pipeline_context,
-                                                          norm_meta_mtx](auto&& opt_index_key) mutable {
+                                                         idx,
+                                                         read_query = read_queries.empty() ? std::make_shared<ReadQuery>(): read_queries[idx],
+                                                         &read_options,
+                                                         &component_manager,
+                                                         pipeline_context,
+                                                         norm_meta_mtx](auto&& opt_index_key) mutable {
@@ -1188,6 +1188,8 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal(
             })
         );
     }
+//    auto entity_ids_vec_fut = folly::collect(symbol_entities_futs).via(&async::io_executor());
+
     return folly::collect(symbol_entities_futs)
         .via(&async::cpu_executor())
         .thenValue([](std::vector<std::vector<EntityId>>&& entity_id_vectors) {

diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp
index 8d6af65072..203753b02d 100644
--- a/cpp/arcticdb/version/version_core.cpp
+++ b/cpp/arcticdb/version/version_core.cpp
@@ -640,6 +640,34 @@ std::shared_ptr<std::vector<folly::Future<std::vector<EntityId>>>> schedule_first_iteration(
     return futures;
 }
 
+folly::Future<std::vector<EntityId>> schedule_remaining_iterations(
+    folly::Future<std::vector<std::vector<EntityId>>>&& entity_ids_vec_fut,
+    std::shared_ptr<std::vector<std::shared_ptr<Clause>>> clauses
+    ) {
+    const auto scheduling_iterations = num_scheduling_iterations(*clauses);
+    for (auto i = 1UL; i < scheduling_iterations; ++i) {
+        entity_ids_vec_fut = std::move(entity_ids_vec_fut).thenValue([clauses, scheduling_iterations, i] (std::vector<std::vector<EntityId>>&& entity_id_vectors) {
+            ARCTICDB_RUNTIME_DEBUG(log::memory(), "Scheduling iteration {} of {}", i, scheduling_iterations);
+
+            util::check(!clauses->empty(), "Scheduling iteration {} has no clauses to process", scheduling_iterations);
+            remove_processed_clauses(*clauses);
+            auto next_units_of_work = clauses->front()->structure_for_processing(std::move(entity_id_vectors));
+
+            std::vector<folly::Future<std::vector<EntityId>>> work_futures;
+            for(auto&& unit_of_work : next_units_of_work) {
+                ARCTICDB_RUNTIME_DEBUG(log::memory(), "Scheduling work for entity ids: {}", unit_of_work);
+                work_futures.emplace_back(async::submit_cpu_task(async::MemSegmentProcessingTask{*clauses, std::move(unit_of_work)}));
+            }
+
+            return folly::collect(work_futures).via(&async::io_executor());
+        });
+    }
+
+    return std::move(entity_ids_vec_fut).thenValueInline([](std::vector<std::vector<EntityId>>&& entity_id_vectors) {
+        return flatten_entities(std::move(entity_id_vectors));
+    });
+}
+
 folly::Future<std::vector<EntityId>> schedule_clause_processing(
     std::shared_ptr<ComponentManager> component_manager,
     std::vector<folly::Future<SegmentAndSlice>>&& segment_and_slice_futures,
@@ -672,29 +700,7 @@ folly::Future<std::vector<EntityId>> schedule_clause_processing(
         clauses);
 
     auto entity_ids_vec_fut = folly::collect(*futures).via(&async::io_executor());
-
-    const auto scheduling_iterations = num_scheduling_iterations(*clauses);
-    for (auto i = 1UL; i < scheduling_iterations; ++i) {
-        entity_ids_vec_fut = std::move(entity_ids_vec_fut).thenValue([clauses, scheduling_iterations, i] (std::vector<std::vector<EntityId>>&& entity_id_vectors) {
-            ARCTICDB_RUNTIME_DEBUG(log::memory(), "Scheduling iteration {} of {}", i, scheduling_iterations);
-
-            util::check(!clauses->empty(), "Scheduling iteration {} has no clauses to process", scheduling_iterations);
-            remove_processed_clauses(*clauses);
-            auto next_units_of_work = clauses->front()->structure_for_processing(std::move(entity_id_vectors));
-
-            std::vector<folly::Future<std::vector<EntityId>>> work_futures;
-            for(auto&& unit_of_work : next_units_of_work) {
-                ARCTICDB_RUNTIME_DEBUG(log::memory(), "Scheduling work for entity ids: {}", unit_of_work);
-                work_futures.emplace_back(async::submit_cpu_task(async::MemSegmentProcessingTask{*clauses, std::move(unit_of_work)}));
-            }
-
-            return folly::collect(work_futures).via(&async::io_executor());
-        });
-    }
-
-    return std::move(entity_ids_vec_fut).thenValueInline([](std::vector<std::vector<EntityId>>&& entity_id_vectors) {
-        return flatten_entities(std::move(entity_id_vectors));
-    });
+    return schedule_remaining_iterations(std::move(entity_ids_vec_fut), clauses);
 }
 
 void set_output_descriptors(
@@ -2112,7 +2118,7 @@ folly::Future<std::vector<EntityId>> read_entity_ids_for_version(
     generate_filtered_field_descriptors(pipeline_context, read_query->columns);
     ARCTICDB_DEBUG(log::version(), "Fetching data to frame");
 
-    return version_store::do_process(store, read_query, read_options, pipeline_context, component_manager);
+    return do_process(store, read_query, read_options, pipeline_context, component_manager);
 }
 } //namespace arcticdb::version_store

diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
index 57b541027c..d2888e48ca 100644
--- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
+++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
@@ -32,9 +32,9 @@ def test_symbol_concat_basic(lmdb_library):
 
     lazy_df = concat([lazy_df_1, lazy_df_2, lazy_df_3])
 
-    # lazy_df.resample("2D").agg({"col": "sum"})
+    lazy_df.resample("2D").agg({"col": "sum"})
 
     received = lazy_df.collect().data
-    expected = pd.concat([df_1, df_2.iloc[1:], df_3])  # .resample("2D").agg({"col": "sum"})
+    expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2D").agg({"col": "sum"})
     assert_frame_equal(expected, received)
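The extracted schedule_remaining_iterations drains the clause list iteratively: each iteration restructures the per-unit entity IDs via the next clause, processes every resulting unit, and the final vectors are flattened into one entity-ID list. A toy, synchronous Python restatement of that loop — the real code chains folly futures, groups several clauses per scheduling iteration, and runs units as CPU tasks:

class ToyClause:
    def structure_for_processing(self, entity_id_vectors):
        # Merge all incoming units into a single unit of work
        return [[eid for ids in entity_id_vectors for eid in ids]]

    def process(self, entity_ids):
        return entity_ids

def run_remaining_iterations(entity_id_vectors, clauses):
    for clause in clauses:
        units = clause.structure_for_processing(entity_id_vectors)
        entity_id_vectors = [clause.process(unit) for unit in units]
    # Final flatten, as in the thenValueInline at the end of the C++ version
    return [eid for ids in entity_id_vectors for eid in ids]

assert run_remaining_iterations([[0, 1], [2]], [ToyClause()]) == [0, 1, 2]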
first_range = false; + } else { + new_range.first = prev_range_end; + new_range.second = new_range.first + old_range.diff(); + } + prev_range_end = new_range.second; + } + + std::vector> new_row_ranges; + new_row_ranges.reserve(old_row_ranges.size()); + std::vector ranges_and_entities; + ranges_and_entities.reserve(entity_ids.size()); + for (size_t idx=0; idx(row_range_mapping.at(*old_row_ranges[idx])); + ranges_and_entities.emplace_back(entity_ids[idx], new_row_range, col_ranges[idx]); + new_row_ranges.emplace_back(std::move(new_row_range)); + } + + component_manager_->replace_entities>(entity_ids, new_row_ranges); + + auto new_structure_offsets = structure_by_row_slice(ranges_and_entities); + return offsets_to_entity_ids(new_structure_offsets, ranges_and_entities); +} + +std::vector ConcatClause::process(std::vector&& entity_ids) const { + return std::move(entity_ids); +} + +std::string ConcatClause::to_string() const { + return "CONCAT"; +} + } diff --git a/cpp/arcticdb/processing/clause.hpp b/cpp/arcticdb/processing/clause.hpp index 134780d8db..004cbedc0f 100644 --- a/cpp/arcticdb/processing/clause.hpp +++ b/cpp/arcticdb/processing/clause.hpp @@ -669,4 +669,35 @@ struct DateRangeClause { [[nodiscard]] std::string to_string() const; }; +struct ConcatClause { + + ClauseInfo clause_info_; + std::shared_ptr component_manager_; + + ConcatClause(); + + ARCTICDB_MOVE_COPY_DEFAULT(ConcatClause) + + [[nodiscard]] std::vector> structure_for_processing(ARCTICDB_UNUSED std::vector& ranges_and_keys) { + internal::raise("ConcatClause should never be first in the pipeline"); + } + + [[nodiscard]] std::vector> structure_for_processing(std::vector>&& entity_ids_vec); + + [[nodiscard]] std::vector process(std::vector&& entity_ids) const; + + [[nodiscard]] const ClauseInfo& clause_info() const { + return clause_info_; + } + + void set_processing_config(ARCTICDB_UNUSED const ProcessingConfig& processing_config) { + } + + void set_component_manager(std::shared_ptr component_manager) { + component_manager_ = component_manager; + } + + [[nodiscard]] std::string to_string() const; +}; + }//namespace arcticdb diff --git a/cpp/arcticdb/processing/clause_utils.hpp b/cpp/arcticdb/processing/clause_utils.hpp index aa3ef0c1d1..a7c9446b66 100644 --- a/cpp/arcticdb/processing/clause_utils.hpp +++ b/cpp/arcticdb/processing/clause_utils.hpp @@ -28,7 +28,8 @@ enum class ProcessingStructure { ROW_SLICE, TIME_BUCKETED, HASH_BUCKETED, - ALL + ALL, + MULTI_SYMBOL }; struct KeepCurrentIndex{}; diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index b9b2612cb1..8c4a3e9c93 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1152,13 +1152,12 @@ std::vector> LocalVersionedEngine::ba } MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( - ARCTICDB_UNUSED const std::vector& stream_ids, - ARCTICDB_UNUSED const std::vector& version_queries, - ARCTICDB_UNUSED std::vector>& read_queries, - ARCTICDB_UNUSED const ReadOptions& read_options, - ARCTICDB_UNUSED const std::string& join, // TODO: Make a Clause or MultiSymbolClause - ARCTICDB_UNUSED std::vector>&& post_join_clauses, - ARCTICDB_UNUSED std::any& handler_data) { + const std::vector& stream_ids, + const std::vector& version_queries, + std::vector>& read_queries, + const ReadOptions& read_options, + std::vector>&& clauses, + std::any& handler_data) { py::gil_scoped_release release_gil; auto opt_index_key_futs = 
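// Resolve the requested version of every symbol concurrently; each future
// yields the index key identifying that symbol's version.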
batch_get_versions_async(store(), version_map(), stream_ids, version_queries); std::vector>> symbol_entities_futs; @@ -1188,13 +1187,14 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( }) ); } -// auto entity_ids_vec_fut = folly::collect(symbol_entities_futs).via(&async::io_executor()); + auto entity_ids_vec_fut = folly::collect(symbol_entities_futs).via(&async::io_executor()); + for (auto& clause: clauses) { + clause->set_component_manager(component_manager); + } + auto clauses_ptr = std::make_shared>>(std::move(clauses)); - return folly::collect(symbol_entities_futs) - .via(&async::cpu_executor()) - .thenValue([](std::vector>&& entity_id_vectors) { - return flatten_entities(std::move(entity_id_vectors)); - }).thenValueInline([component_manager](std::vector&& processed_entity_ids) { + return schedule_remaining_iterations(std::move(entity_ids_vec_fut), clauses_ptr, true) + .thenValueInline([component_manager](std::vector&& processed_entity_ids) { auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(processed_entity_ids)); // TODO: Handle output descriptors return collect_segments(std::move(proc)); @@ -1212,6 +1212,29 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( shared_data.buffers()}}; }); }).get(); + +// return folly::collect(symbol_entities_futs) +// .via(&async::cpu_executor()) +// .thenValue([](std::vector>&& entity_id_vectors) { +// return flatten_entities(std::move(entity_id_vectors)); +// }).thenValueInline([component_manager](std::vector&& processed_entity_ids) { +// auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(processed_entity_ids)); +// // TODO: Handle output descriptors +// return collect_segments(std::move(proc)); +// }).thenValueInline([store=store(), &handler_data, pipeline_context](std::vector&& slice_and_keys) { +// internal::check(!slice_and_keys.empty(), "No slice and keys in batch_read_with_join_internal"); +// pipeline_context->set_descriptor(slice_and_keys[0].segment(store).descriptor()); +// return prepare_output_frame(std::move(slice_and_keys), pipeline_context, store, ReadOptions{}, handler_data); +// }).thenValueInline([&handler_data, pipeline_context, shared_data](SegmentInMemory&& frame) mutable { +// return reduce_and_fix_columns(pipeline_context, frame, ReadOptions{}, handler_data) +// .thenValue([pipeline_context, frame, shared_data](auto&&) mutable { +// return MultiSymbolReadOutput{{}, +// {frame, +// timeseries_descriptor_from_pipeline_context(pipeline_context, {}, pipeline_context->bucketize_dynamic_), +// {}, +// shared_data.buffers()}}; +// }); +// }).get(); } void LocalVersionedEngine::write_version_and_prune_previous( diff --git a/cpp/arcticdb/version/local_versioned_engine.hpp b/cpp/arcticdb/version/local_versioned_engine.hpp index fbc01ff899..c3aea05425 100644 --- a/cpp/arcticdb/version/local_versioned_engine.hpp +++ b/cpp/arcticdb/version/local_versioned_engine.hpp @@ -291,8 +291,7 @@ class LocalVersionedEngine : public VersionedEngine { const std::vector& version_queries, std::vector>& read_queries, const ReadOptions& read_options, - const std::string& join, // TODO: Make a Clause or MultiSymbolClause - std::vector>&& post_join_clauses, + std::vector>&& clauses, std::any& handler_data); std::vector> batch_read_descriptor_internal( diff --git a/cpp/arcticdb/version/python_bindings.cpp b/cpp/arcticdb/version/python_bindings.cpp index 4742592718..e2d99bea7d 100644 --- 
a/cpp/arcticdb/version/python_bindings.cpp +++ b/cpp/arcticdb/version/python_bindings.cpp @@ -402,6 +402,10 @@ void register_bindings(py::module &version, py::exception>(version, "ConcatClause") + .def(py::init<>()) + .def("__str__", &ConcatClause::to_string); + py::class_>(version, "PythonVersionStoreReadQuery") .def(py::init()) .def_readwrite("columns",&ReadQuery::columns) @@ -773,20 +777,25 @@ void register_bindings(py::module &version, py::exception& version_queries, std::vector>& read_queries, const ReadOptions& read_options, - const std::string& join, // TODO: Make a Clause or MultiSymbolClause + ClauseVariant join, std::vector post_join_clauses ){ auto handler_data = TypeHandlerRegistry::instance()->get_handler_data(); // TODO: Remove duplication with ReadQuery interface post_join_clauses = plan_query(std::move(post_join_clauses)); std::vector> _clauses; + // TODO: Verify at some point that this is a multi-symbol clause + util::variant_match( + join, + [&](auto&& clause) {_clauses.emplace_back(std::make_shared(*clause));} + ); for (auto&& clause: post_join_clauses) { util::variant_match( clause, [&](auto&& clause) {_clauses.emplace_back(std::make_shared(*clause));} ); } - return adapt_read_df(v.batch_read_with_join(stream_ids, version_queries, read_queries, read_options, join, std::move(_clauses), handler_data)); + return adapt_read_df(v.batch_read_with_join(stream_ids, version_queries, read_queries, read_options, std::move(_clauses), handler_data)); }, py::call_guard(), "Read a dataframe from the store") .def("batch_read_keys", diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index 203753b02d..3749f46cff 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -642,15 +642,25 @@ std::shared_ptr>>> schedule_firs folly::Future> schedule_remaining_iterations( folly::Future>>&& entity_ids_vec_fut, - std::shared_ptr>> clauses + std::shared_ptr>> clauses, + const bool from_join ) { - const auto scheduling_iterations = num_scheduling_iterations(*clauses); + auto scheduling_iterations = num_scheduling_iterations(*clauses); + if (from_join) { + ++scheduling_iterations; + } for (auto i = 1UL; i < scheduling_iterations; ++i) { - entity_ids_vec_fut = std::move(entity_ids_vec_fut).thenValue([clauses, scheduling_iterations, i] (std::vector>&& entity_id_vectors) { + entity_ids_vec_fut = std::move(entity_ids_vec_fut).thenValue([clauses, scheduling_iterations, i, from_join] (std::vector>&& entity_id_vectors) { ARCTICDB_RUNTIME_DEBUG(log::memory(), "Scheduling iteration {} of {}", i, scheduling_iterations); util::check(!clauses->empty(), "Scheduling iteration {} has no clauses to process", scheduling_iterations); - remove_processed_clauses(*clauses); + if (i == 1) { + if (!from_join) { + remove_processed_clauses(*clauses); + } + } else { + remove_processed_clauses(*clauses); + } auto next_units_of_work = clauses->front()->structure_for_processing(std::move(entity_id_vectors)); std::vector>> work_futures; diff --git a/cpp/arcticdb/version/version_core.hpp b/cpp/arcticdb/version/version_core.hpp index 8d5aaec183..408d41acb7 100644 --- a/cpp/arcticdb/version/version_core.hpp +++ b/cpp/arcticdb/version/version_core.hpp @@ -144,6 +144,11 @@ folly::Future read_multi_key( const SegmentInMemory& index_key_seg, std::any& handler_data); +folly::Future> schedule_remaining_iterations( + folly::Future>>&& entity_ids_vec_fut, + std::shared_ptr>> clauses, + const bool from_join = false); + folly::Future> schedule_clause_processing( 
std::shared_ptr component_manager, std::vector>&& segment_and_slice_futures, diff --git a/cpp/arcticdb/version/version_store_api.cpp b/cpp/arcticdb/version/version_store_api.cpp index 2be57afaf0..ddb299b81e 100644 --- a/cpp/arcticdb/version/version_store_api.cpp +++ b/cpp/arcticdb/version/version_store_api.cpp @@ -792,10 +792,9 @@ ReadResult PythonVersionStore::batch_read_with_join( const std::vector& version_queries, std::vector>& read_queries, const ReadOptions& read_options, - const std::string& join, // TODO: Make a Clause or MultiSymbolClause - std::vector>&& post_join_clauses, + std::vector>&& clauses, std::any& handler_data) { - auto versions_and_frame = batch_read_with_join_internal(stream_ids, version_queries, read_queries, read_options, join, std::move(post_join_clauses), handler_data); + auto versions_and_frame = batch_read_with_join_internal(stream_ids, version_queries, read_queries, read_options, std::move(clauses), handler_data); // TODO: Decide what to do with VersionedItem part of read result return create_python_read_result({}, std::move(versions_and_frame.frame_and_descriptor_)); } diff --git a/cpp/arcticdb/version/version_store_api.hpp b/cpp/arcticdb/version/version_store_api.hpp index 2a834f6eda..e24f38113a 100644 --- a/cpp/arcticdb/version/version_store_api.hpp +++ b/cpp/arcticdb/version/version_store_api.hpp @@ -305,8 +305,7 @@ class PythonVersionStore : public LocalVersionedEngine { const std::vector& version_queries, std::vector>& read_queries, const ReadOptions& read_options, - const std::string& join, // TODO: Make a Clause or MultiSymbolClause - std::vector>&& post_join_clauses, + std::vector>&& clauses, std::any& handler_data); std::vector, DataError>> batch_read_metadata( diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py index a2e79ccf7f..f8bca21629 100644 --- a/python/arcticdb/version_store/library.py +++ b/python/arcticdb/version_store/library.py @@ -22,7 +22,7 @@ from arcticdb.version_store.processing import ExpressionNode, QueryBuilder from arcticdb.version_store._store import NativeVersionStore, VersionedItem, VersionQueryInput from arcticdb_ext.exceptions import ArcticException -from arcticdb_ext.version_store import DataError +from arcticdb_ext.version_store import DataError, ConcatClause as _ConcatClause import pandas as pd import numpy as np import logging @@ -497,7 +497,7 @@ def collect(self) -> VersionedItem: def __str__(self) -> str: query_builder_repr = super().__str__() - return f"LazyDataFrameAfterJoin(Concat({self._lazy_dataframes._lazy_dataframes}){' | ' if len(query_builder_repr) else ''}{query_builder_repr})" + return f"LazyDataFrameAfterJoin({self._join}({self._lazy_dataframes._lazy_dataframes}){' | ' if len(query_builder_repr) else ''}{query_builder_repr})" def __repr__(self) -> str: return self.__str__() @@ -506,7 +506,7 @@ def __repr__(self) -> str: def concat(lazy_dataframes: Union[List[LazyDataFrame], LazyDataFrameCollection]) -> LazyDataFrameAfterJoin: if not isinstance(lazy_dataframes, LazyDataFrameCollection): lazy_dataframes = LazyDataFrameCollection(lazy_dataframes) - return LazyDataFrameAfterJoin(lazy_dataframes, "concat") + return LazyDataFrameAfterJoin(lazy_dataframes, _ConcatClause()) def col(name: str) -> ExpressionNode: diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index d2888e48ca..e87da49d33 100644 --- 
a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -18,23 +18,23 @@ def test_symbol_concat_basic(lmdb_library): lib = lmdb_library - df_1 = pd.DataFrame({"col": np.arange(3, dtype=np.int64)}, index=pd.date_range("2025-01-01", periods=3)) - df_2 = pd.DataFrame({"col": np.arange(4, dtype=np.int64)}, index=pd.date_range("2025-01-03", periods=4)) - df_3 = pd.DataFrame({"col": np.arange(5, dtype=np.int64)}, index=pd.date_range("2025-03-07", periods=5)) + df_1 = pd.DataFrame({"col": np.arange(3, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3)) + df_2 = pd.DataFrame({"col": np.arange(4, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4)) + df_3 = pd.DataFrame({"col": np.arange(5, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5)) lib.write("sym1", df_1) lib.write("sym2", df_2) lib.write("sym3", df_3) lazy_df_1 = lib.read("sym1", lazy=True) lazy_df_2 = lib.read("sym2", lazy=True) - lazy_df_2 = lazy_df_2.date_range((pd.Timestamp("2025-01-04"), None)) + lazy_df_2 = lazy_df_2.date_range((pd.Timestamp(pd.Timestamp(3000)), None)) lazy_df_3 = lib.read("sym3", lazy=True) lazy_df = concat([lazy_df_1, lazy_df_2, lazy_df_3]) - lazy_df.resample("2D").agg({"col": "sum"}) + lazy_df.resample("2000ns").agg({"col": "sum"}) received = lazy_df.collect().data - expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2D").agg({"col": "sum"}) + expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2000ns").agg({"col": "sum"}) assert_frame_equal(expected, received) From 727cd34b0cd249de63b0078dba3772aa38e303d3 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Fri, 24 Jan 2025 11:13:03 +0000 Subject: [PATCH 14/36] Fix ConcatClause::structure_for_processing --- cpp/arcticdb/processing/clause.cpp | 30 ++++++++++------------- cpp/arcticdb/processing/query_planner.hpp | 3 ++- cpp/arcticdb/version/python_bindings.cpp | 10 +------- cpp/arcticdb/version/version_core.cpp | 3 --- 4 files changed, 16 insertions(+), 30 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index d0a9a282da..ffdf619721 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -1228,35 +1228,31 @@ std::vector> ConcatClause::structure_for_processing(std::v } auto [segments, old_row_ranges, col_ranges] = component_manager_->get_entities, std::shared_ptr, std::shared_ptr>(entity_ids, false); - // Map from old row ranges to new ones - std::map row_range_mapping; - for (const auto& row_range: old_row_ranges) { - // Value is same as key initially - row_range_mapping.insert({*row_range, *row_range}); - } + // Similar logic to RowRangeClause::structure_for_processing but as input row ranges come from multiple symbols it is slightly different bool first_range{true}; size_t prev_range_end{0}; - for (auto& [old_range, new_range]: row_range_mapping) { + std::vector> new_row_ranges; + new_row_ranges.reserve(old_row_ranges.size()); + // TODO: Handle column-slicing + for (const auto& old_range: old_row_ranges) { + auto new_range = std::make_shared(); if (first_range) { // Make the first row-range start from zero - new_range.first = 0; - new_range.second = old_range.diff(); + new_range->first = 0; + new_range->second = old_range->diff(); first_range = false; } else { - new_range.first = prev_range_end; - new_range.second = new_range.first + old_range.diff(); + new_range->first = 
prev_range_end;
+            new_range->second = new_range->first + old_range->diff();
+        }
+        prev_range_end = new_range->second;
+        new_row_ranges.emplace_back(std::move(new_range));
+    }
-    std::vector<std::shared_ptr<RowRange>> new_row_ranges;
-    new_row_ranges.reserve(old_row_ranges.size());
     std::vector<RangesAndEntity> ranges_and_entities;
     ranges_and_entities.reserve(entity_ids.size());
     for (size_t idx=0; idx<entity_ids.size(); ++idx) {
-        auto new_row_range = std::make_shared<RowRange>(row_range_mapping.at(*old_row_ranges[idx]));
-        ranges_and_entities.emplace_back(entity_ids[idx], new_row_range, col_ranges[idx]);
-        new_row_ranges.emplace_back(std::move(new_row_range));
+        ranges_and_entities.emplace_back(entity_ids[idx], new_row_ranges[idx], col_ranges[idx]);
     }
     component_manager_->replace_entities<std::shared_ptr<RowRange>>(entity_ids, new_row_ranges);
diff --git a/cpp/arcticdb/processing/query_planner.hpp b/cpp/arcticdb/processing/query_planner.hpp
index 99106654e1..b16b79fb40 100644
--- a/cpp/arcticdb/processing/query_planner.hpp
+++ b/cpp/arcticdb/processing/query_planner.hpp
@@ -22,7 +22,8 @@ using ClauseVariant = std::variant<std::shared_ptr<FilterClause>,
                                    std::shared_ptr<ResampleClause<ResampleBoundary::LEFT>>,
                                    std::shared_ptr<ResampleClause<ResampleBoundary::RIGHT>>,
                                    std::shared_ptr<RowRangeClause>,
-                                   std::shared_ptr<DateRangeClause>>;
+                                   std::shared_ptr<DateRangeClause>,
+                                   std::shared_ptr<ConcatClause>>;
 
 std::vector<ClauseVariant> plan_query(std::vector<ClauseVariant>&& clauses);
diff --git a/cpp/arcticdb/version/python_bindings.cpp b/cpp/arcticdb/version/python_bindings.cpp
index e2d99bea7d..c191f84618 100644
--- a/cpp/arcticdb/version/python_bindings.cpp
+++ b/cpp/arcticdb/version/python_bindings.cpp
@@ -414,15 +414,7 @@ void register_bindings(py::module &version, py::exception
-            [](ReadQuery& self, std::vector<std::variant<std::shared_ptr<FilterClause>,
-                                                         std::shared_ptr<ProjectClause>,
-                                                         std::shared_ptr<GroupByClause>,
-                                                         std::shared_ptr<AggregationClause>,
-                                                         std::shared_ptr<ResampleClause<ResampleBoundary::LEFT>>,
-                                                         std::shared_ptr<ResampleClause<ResampleBoundary::RIGHT>>,
-                                                         std::shared_ptr<RowRangeClause>,
-                                                         std::shared_ptr<DateRangeClause>>> clauses) {
+            [](ReadQuery& self, std::vector<ClauseVariant> clauses) {
                 clauses = plan_query(std::move(clauses));
                 std::vector<std::shared_ptr<Clause>> _clauses;
                 self.needs_post_processing = false;
diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp
index 3749f46cff..61adb4a71b 100644
--- a/cpp/arcticdb/version/version_core.cpp
+++ b/cpp/arcticdb/version/version_core.cpp
@@ -646,9 +646,6 @@ folly::Future<std::vector<EntityId>> schedule_remaining_iterations(
         const bool from_join
     ) {
     auto scheduling_iterations = num_scheduling_iterations(*clauses);
-    if (from_join) {
-        ++scheduling_iterations;
-    }
     for (auto i = 1UL; i < scheduling_iterations; ++i) {
         entity_ids_vec_fut = std::move(entity_ids_vec_fut).thenValue([clauses, scheduling_iterations, i, from_join] (std::vector<std::vector<EntityId>>&& entity_id_vectors) {
             ARCTICDB_RUNTIME_DEBUG(log::memory(), "Scheduling iteration {} of {}", i, scheduling_iterations);

From 139f3af3c57c3c884e47c0102738a208ea280f9c Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Fri, 24 Jan 2025 11:49:26 +0000
Subject: [PATCH 15/36] First end-to-end test passing!
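
Restoring the from_join bump to the iteration count is what makes the first
end-to-end test pass: entities handed over by batch_read_with_join_internal
have not yet been structured by the clause at the front of the pipeline, so
schedule_remaining_iterations needs one more restructuring pass than a plain
read does. With that in place, ConcatClause renumbers each symbol's row
ranges into a single contiguous, zero-based frame. A self-contained sketch of
that renumbering in plain Python (illustrative only, not the ArcticDB API; it
ignores column slicing, which the C++ handles by mapping every entity sharing
a row range to the same new range):

    def renumber_row_ranges(per_symbol_ranges):
        """per_symbol_ranges holds one list of (start, end) row ranges per symbol.

        Returns the ranges shifted so the symbols stack into one contiguous
        zero-based frame, preserving order within and across symbols.
        """
        renumbered = []
        offset = 0
        for ranges in per_symbol_ranges:
            for start, end in ranges:
                length = end - start
                renumbered.append((offset, offset + length))
                offset += length
        return renumbered

    # Three symbols of 3, 4 and 5 rows, already split into row slices:
    print(renumber_row_ranges([
        [(0, 2), (2, 3)],
        [(0, 2), (2, 4)],
        [(0, 2), (2, 4), (4, 5)],
    ]))
    # [(0, 2), (2, 3), (3, 5), (5, 7), (7, 9), (9, 11), (11, 12)]
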
--- cpp/arcticdb/version/version_core.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index 61adb4a71b..3749f46cff 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -646,6 +646,9 @@ folly::Future> schedule_remaining_iterations( const bool from_join ) { auto scheduling_iterations = num_scheduling_iterations(*clauses); + if (from_join) { + ++scheduling_iterations; + } for (auto i = 1UL; i < scheduling_iterations; ++i) { entity_ids_vec_fut = std::move(entity_ids_vec_fut).thenValue([clauses, scheduling_iterations, i, from_join] (std::vector>&& entity_id_vectors) { ARCTICDB_RUNTIME_DEBUG(log::memory(), "Scheduling iteration {} of {}", i, scheduling_iterations); From 548e5b763690482eefe11cc1de7f28519246e82c Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 10:08:34 +0000 Subject: [PATCH 16/36] Removed TODOs --- cpp/arcticdb/processing/clause.cpp | 1 - .../version/local_versioned_engine.cpp | 24 ------------------- cpp/arcticdb/version/python_bindings.cpp | 2 -- cpp/arcticdb/version/version_store_api.cpp | 1 - 4 files changed, 28 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index ffdf619721..2a8c919a32 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -1233,7 +1233,6 @@ std::vector> ConcatClause::structure_for_processing(std::v size_t prev_range_end{0}; std::vector> new_row_ranges; new_row_ranges.reserve(old_row_ranges.size()); - // TODO: Handle column-slicing for (const auto& old_range: old_row_ranges) { auto new_range = std::make_shared(); if (first_range) { diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index 8c4a3e9c93..01e7a96a1f 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1196,7 +1196,6 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( return schedule_remaining_iterations(std::move(entity_ids_vec_fut), clauses_ptr, true) .thenValueInline([component_manager](std::vector&& processed_entity_ids) { auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(processed_entity_ids)); - // TODO: Handle output descriptors return collect_segments(std::move(proc)); }).thenValueInline([store=store(), &handler_data, pipeline_context](std::vector&& slice_and_keys) { internal::check(!slice_and_keys.empty(), "No slice and keys in batch_read_with_join_internal"); @@ -1212,29 +1211,6 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( shared_data.buffers()}}; }); }).get(); - -// return folly::collect(symbol_entities_futs) -// .via(&async::cpu_executor()) -// .thenValue([](std::vector>&& entity_id_vectors) { -// return flatten_entities(std::move(entity_id_vectors)); -// }).thenValueInline([component_manager](std::vector&& processed_entity_ids) { -// auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(processed_entity_ids)); -// // TODO: Handle output descriptors -// return collect_segments(std::move(proc)); -// }).thenValueInline([store=store(), &handler_data, pipeline_context](std::vector&& slice_and_keys) { -// internal::check(!slice_and_keys.empty(), "No slice and keys in batch_read_with_join_internal"); -// pipeline_context->set_descriptor(slice_and_keys[0].segment(store).descriptor()); -// return 
prepare_output_frame(std::move(slice_and_keys), pipeline_context, store, ReadOptions{}, handler_data); -// }).thenValueInline([&handler_data, pipeline_context, shared_data](SegmentInMemory&& frame) mutable { -// return reduce_and_fix_columns(pipeline_context, frame, ReadOptions{}, handler_data) -// .thenValue([pipeline_context, frame, shared_data](auto&&) mutable { -// return MultiSymbolReadOutput{{}, -// {frame, -// timeseries_descriptor_from_pipeline_context(pipeline_context, {}, pipeline_context->bucketize_dynamic_), -// {}, -// shared_data.buffers()}}; -// }); -// }).get(); } void LocalVersionedEngine::write_version_and_prune_previous( diff --git a/cpp/arcticdb/version/python_bindings.cpp b/cpp/arcticdb/version/python_bindings.cpp index c191f84618..1db69e0762 100644 --- a/cpp/arcticdb/version/python_bindings.cpp +++ b/cpp/arcticdb/version/python_bindings.cpp @@ -773,10 +773,8 @@ void register_bindings(py::module &version, py::exception post_join_clauses ){ auto handler_data = TypeHandlerRegistry::instance()->get_handler_data(); - // TODO: Remove duplication with ReadQuery interface post_join_clauses = plan_query(std::move(post_join_clauses)); std::vector> _clauses; - // TODO: Verify at some point that this is a multi-symbol clause util::variant_match( join, [&](auto&& clause) {_clauses.emplace_back(std::make_shared(*clause));} diff --git a/cpp/arcticdb/version/version_store_api.cpp b/cpp/arcticdb/version/version_store_api.cpp index ddb299b81e..917d259526 100644 --- a/cpp/arcticdb/version/version_store_api.cpp +++ b/cpp/arcticdb/version/version_store_api.cpp @@ -795,7 +795,6 @@ ReadResult PythonVersionStore::batch_read_with_join( std::vector>&& clauses, std::any& handler_data) { auto versions_and_frame = batch_read_with_join_internal(stream_ids, version_queries, read_queries, read_options, std::move(clauses), handler_data); - // TODO: Decide what to do with VersionedItem part of read result return create_python_read_result({}, std::move(versions_and_frame.frame_and_descriptor_)); } From f466fe227ad4c117358152d3942e75025dcdf267 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 10:23:47 +0000 Subject: [PATCH 17/36] Verify that clause for join is multi-symbol --- cpp/arcticdb/processing/clause.cpp | 1 + cpp/arcticdb/processing/clause_utils.hpp | 2 ++ cpp/arcticdb/version/python_bindings.cpp | 7 ++++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index 2a8c919a32..e1b083146f 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -1219,6 +1219,7 @@ std::string DateRangeClause::to_string() const { ConcatClause::ConcatClause() { clause_info_.input_structure_ = ProcessingStructure::MULTI_SYMBOL; + clause_info_.multi_symbol_ = true; } std::vector> ConcatClause::structure_for_processing(std::vector>&& entity_ids_vec) { diff --git a/cpp/arcticdb/processing/clause_utils.hpp b/cpp/arcticdb/processing/clause_utils.hpp index a7c9446b66..84951730f7 100644 --- a/cpp/arcticdb/processing/clause_utils.hpp +++ b/cpp/arcticdb/processing/clause_utils.hpp @@ -53,6 +53,8 @@ struct ClauseInfo { std::variant index_{KeepCurrentIndex()}; // Whether this clause modifies the output descriptor bool modifies_output_descriptor_{false}; + // Whether this clause operates on one or multiple symbols + bool multi_symbol_{false}; }; // Changes how the clause behaves based on information only available after it is constructed diff --git a/cpp/arcticdb/version/python_bindings.cpp 
b/cpp/arcticdb/version/python_bindings.cpp index 1db69e0762..f097678504 100644 --- a/cpp/arcticdb/version/python_bindings.cpp +++ b/cpp/arcticdb/version/python_bindings.cpp @@ -777,7 +777,12 @@ void register_bindings(py::module &version, py::exception> _clauses; util::variant_match( join, - [&](auto&& clause) {_clauses.emplace_back(std::make_shared(*clause));} + [&](auto&& clause) { + user_input::check( + clause->clause_info().multi_symbol_, + "Single-symbol clause cannot be used to join multiple symbols together"); + _clauses.emplace_back(std::make_shared(*clause)); + } ); for (auto&& clause: post_join_clauses) { util::variant_match( From 096ff095b8e868b0dbe5c90cb2ffbb1d580b953f Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 10:40:38 +0000 Subject: [PATCH 18/36] Added failing test with column slicing --- python/tests/conftest.py | 7 +++ .../test_symbol_concatenation.py | 50 +++++++++++++++++-- 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/python/tests/conftest.py b/python/tests/conftest.py index c9e3f4a8d5..a198dbaf9d 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -136,6 +136,13 @@ def lmdb_library_static_dynamic(request): yield request.getfixturevalue(request.param) +@pytest.fixture +def lmdb_library_factory(lmdb_storage, lib_name): + def f(library_options: LibraryOptions = LibraryOptions()): + return lmdb_storage.create_arctic().create_library(lib_name, library_options=library_options) + return f + + # ssl is enabled by default to maximize test coverage as ssl is enabled most of the times in real world @pytest.fixture(scope="session") def s3_storage_factory() -> Generator[MotoS3StorageFixtureFactory, None, None]: diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index e87da49d33..68284b8aaa 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -10,14 +10,14 @@ import pytest from arcticdb import col, concat, LazyDataFrame, LazyDataFrameCollection, QueryBuilder, ReadRequest +from arcticdb.options import LibraryOptions from arcticdb.util.test import assert_frame_equal - pytestmark = pytest.mark.pipeline -def test_symbol_concat_basic(lmdb_library): - lib = lmdb_library +def test_symbol_concat_basic(lmdb_library_factory): + lib = lmdb_library_factory() df_1 = pd.DataFrame({"col": np.arange(3, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3)) df_2 = pd.DataFrame({"col": np.arange(4, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4)) df_3 = pd.DataFrame({"col": np.arange(5, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5)) @@ -38,3 +38,47 @@ def test_symbol_concat_basic(lmdb_library): expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2000ns").agg({"col": "sum"}) assert_frame_equal(expected, received) + +def test_symbol_concat_column_sliced(lmdb_library_factory): + lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) + df_1 = pd.DataFrame( + { + "col1": np.arange(3, dtype=np.int64), + "col2": np.arange(10, 13, dtype=np.int64), + "col3": np.arange(110, 113, dtype=np.int64), + }, + index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3) + ) + df_2 = pd.DataFrame( + { + "col1": np.arange(4, dtype=np.int64), + "col2": np.arange(20, 24, dtype=np.int64), + "col3": np.arange(210, 214, dtype=np.int64), + 
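+            # With columns_per_segment=2, a third data column forces each symbol into two column slices.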
}, + index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4) + ) + df_3 = pd.DataFrame( + { + "col1": np.arange(5, dtype=np.int64), + "col2": np.arange(30, 35, dtype=np.int64), + "col3": np.arange(310, 315, dtype=np.int64), + }, + index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5) + ) + lib.write("sym1", df_1) + lib.write("sym2", df_2) + lib.write("sym3", df_3) + + lazy_df_1 = lib.read("sym1", lazy=True) + lazy_df_2 = lib.read("sym2", lazy=True) + lazy_df_2 = lazy_df_2.date_range((pd.Timestamp(pd.Timestamp(3000)), None)) + lazy_df_3 = lib.read("sym3", lazy=True) + + lazy_df = concat([lazy_df_1, lazy_df_2, lazy_df_3]) + + lazy_df.resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) + + received = lazy_df.collect().data + received = received.reindex(columns=sorted(received.columns)) + expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) + assert_frame_equal(expected, received) From 4b4cad63ef7154e363c0da102b233b924adefeb4 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 10:48:18 +0000 Subject: [PATCH 19/36] Simplify column slicing test --- .../test_symbol_concatenation.py | 25 ++++--------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index 68284b8aaa..0142728996 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -40,7 +40,8 @@ def test_symbol_concat_basic(lmdb_library_factory): def test_symbol_concat_column_sliced(lmdb_library_factory): - lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) + # lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) + lib = lmdb_library_factory() df_1 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), @@ -55,30 +56,14 @@ def test_symbol_concat_column_sliced(lmdb_library_factory): "col2": np.arange(20, 24, dtype=np.int64), "col3": np.arange(210, 214, dtype=np.int64), }, - index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4) - ) - df_3 = pd.DataFrame( - { - "col1": np.arange(5, dtype=np.int64), - "col2": np.arange(30, 35, dtype=np.int64), - "col3": np.arange(310, 315, dtype=np.int64), - }, - index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5) + index=pd.date_range(pd.Timestamp(3000), freq="1000ns", periods=4) ) lib.write("sym1", df_1) lib.write("sym2", df_2) - lib.write("sym3", df_3) - - lazy_df_1 = lib.read("sym1", lazy=True) - lazy_df_2 = lib.read("sym2", lazy=True) - lazy_df_2 = lazy_df_2.date_range((pd.Timestamp(pd.Timestamp(3000)), None)) - lazy_df_3 = lib.read("sym3", lazy=True) - - lazy_df = concat([lazy_df_1, lazy_df_2, lazy_df_3]) + lazy_df = concat(lib.read_batch(["sym1", "sym2"], lazy=True)) lazy_df.resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) - received = lazy_df.collect().data received = received.reindex(columns=sorted(received.columns)) - expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) + expected = pd.concat([df_1, df_2]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) assert_frame_equal(expected, received) From d8de667ca408d7fe633af4d7cdd67809a01d7e35 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 10:48:45 +0000 Subject: [PATCH 20/36] Remove unneeded 
segments in ConcatClause::structure_for_processing --- cpp/arcticdb/processing/clause.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index e1b083146f..9cae430190 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -1227,7 +1227,7 @@ std::vector> ConcatClause::structure_for_processing(std::v if (entity_ids.empty()) { return {}; } - auto [segments, old_row_ranges, col_ranges] = component_manager_->get_entities, std::shared_ptr, std::shared_ptr>(entity_ids, false); + auto [old_row_ranges, col_ranges] = component_manager_->get_entities, std::shared_ptr>(entity_ids, false); // Similar logic to RowRangeClause::structure_for_processing but as input row ranges come from multiple symbols it is slightly different bool first_range{true}; From 7718bec059cf23df001689dcefbb9a6a44448878 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 10:58:13 +0000 Subject: [PATCH 21/36] Column slicing test passing --- cpp/arcticdb/processing/clause.cpp | 95 +++++++++++++------ .../test_symbol_concatenation.py | 4 +- 2 files changed, 68 insertions(+), 31 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index 9cae430190..6a2ed5c03e 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -1223,42 +1223,79 @@ ConcatClause::ConcatClause() { } std::vector> ConcatClause::structure_for_processing(std::vector>&& entity_ids_vec) { - auto entity_ids = flatten_entities(std::move(entity_ids_vec)); - if (entity_ids.empty()) { - return {}; - } - auto [old_row_ranges, col_ranges] = component_manager_->get_entities, std::shared_ptr>(entity_ids, false); - // Similar logic to RowRangeClause::structure_for_processing but as input row ranges come from multiple symbols it is slightly different + std::vector ranges_and_entities; bool first_range{true}; size_t prev_range_end{0}; - std::vector> new_row_ranges; - new_row_ranges.reserve(old_row_ranges.size()); - for (const auto& old_range: old_row_ranges) { - auto new_range = std::make_shared(); - if (first_range) { - // Make the first row-range start from zero - new_range->first = 0; - new_range->second = old_range->diff(); - first_range = false; - } else { - new_range->first = prev_range_end; - new_range->second = new_range->first + old_range->diff(); + for (const auto& entity_ids: entity_ids_vec) { + auto [old_row_ranges, col_ranges] = component_manager_->get_entities, std::shared_ptr>(entity_ids, false); + // Map from old row ranges WITHIN THIS SYMBOL to new ones + std::map row_range_mapping; + for (const auto& row_range: old_row_ranges) { + // Value is same as key initially + row_range_mapping.insert({*row_range, *row_range}); } - prev_range_end = new_range->second; - new_row_ranges.emplace_back(std::move(new_range)); - } - - std::vector ranges_and_entities; - ranges_and_entities.reserve(entity_ids.size()); - for (size_t idx=0; idx> new_row_ranges; + new_row_ranges.reserve(old_row_ranges.size()); + for (size_t idx=0; idx(row_range_mapping.at(*old_row_ranges[idx])); + ranges_and_entities.emplace_back(entity_ids[idx], new_row_range, col_ranges[idx]); + new_row_ranges.emplace_back(std::move(new_row_range)); + } + component_manager_->replace_entities>(entity_ids, new_row_ranges); } - - component_manager_->replace_entities>(entity_ids, new_row_ranges); - auto new_structure_offsets = structure_by_row_slice(ranges_and_entities); return 
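// Regroup the renumbered entities by their new row ranges, so each downstream
// unit of work covers exactly one row slice of the concatenated frame: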
offsets_to_entity_ids(new_structure_offsets, ranges_and_entities); + + +// auto entity_ids = flatten_entities(std::move(entity_ids_vec)); +// if (entity_ids.empty()) { +// return {}; +// } +// auto [old_row_ranges, col_ranges] = component_manager_->get_entities, std::shared_ptr>(entity_ids, false); +// +// +// bool first_range{true}; +// size_t prev_range_end{0}; +// std::vector> new_row_ranges; +// new_row_ranges.reserve(old_row_ranges.size()); +// for (const auto& old_range: old_row_ranges) { +// auto new_range = std::make_shared(); +// if (first_range) { +// // Make the first row-range start from zero +// new_range->first = 0; +// new_range->second = old_range->diff(); +// first_range = false; +// } else { +// new_range->first = prev_range_end; +// new_range->second = new_range->first + old_range->diff(); +// } +// prev_range_end = new_range->second; +// new_row_ranges.emplace_back(std::move(new_range)); +// } +// +// std::vector ranges_and_entities; +// ranges_and_entities.reserve(entity_ids.size()); +// for (size_t idx=0; idxreplace_entities>(entity_ids, new_row_ranges); +// +// auto new_structure_offsets = structure_by_row_slice(ranges_and_entities); +// return offsets_to_entity_ids(new_structure_offsets, ranges_and_entities); } std::vector ConcatClause::process(std::vector&& entity_ids) const { diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index 0142728996..6a4defccbd 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -40,8 +40,8 @@ def test_symbol_concat_basic(lmdb_library_factory): def test_symbol_concat_column_sliced(lmdb_library_factory): - # lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) - lib = lmdb_library_factory() + lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) + # lib = lmdb_library_factory() df_1 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), From f089c7c9a7f2601608a2c09a67dddae5fa7c8cd9 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 11:08:24 +0000 Subject: [PATCH 22/36] Refactor to only call ComponentManager::replace_entities once --- cpp/arcticdb/processing/clause.cpp | 44 ++---------------------------- 1 file changed, 3 insertions(+), 41 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index 6a2ed5c03e..c4344a3704 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -1225,6 +1225,7 @@ ConcatClause::ConcatClause() { std::vector> ConcatClause::structure_for_processing(std::vector>&& entity_ids_vec) { // Similar logic to RowRangeClause::structure_for_processing but as input row ranges come from multiple symbols it is slightly different std::vector ranges_and_entities; + std::vector> new_row_ranges; bool first_range{true}; size_t prev_range_end{0}; for (const auto& entity_ids: entity_ids_vec) { @@ -1247,55 +1248,16 @@ std::vector> ConcatClause::structure_for_processing(std::v } prev_range_end = new_range.second; } - std::vector> new_row_ranges; - new_row_ranges.reserve(old_row_ranges.size()); + for (size_t idx=0; idx(row_range_mapping.at(*old_row_ranges[idx])); ranges_and_entities.emplace_back(entity_ids[idx], new_row_range, col_ranges[idx]); new_row_ranges.emplace_back(std::move(new_row_range)); } - component_manager_->replace_entities>(entity_ids, new_row_ranges); } + 
component_manager_->replace_entities>(flatten_entities(std::move(entity_ids_vec)), new_row_ranges); auto new_structure_offsets = structure_by_row_slice(ranges_and_entities); return offsets_to_entity_ids(new_structure_offsets, ranges_and_entities); - - -// auto entity_ids = flatten_entities(std::move(entity_ids_vec)); -// if (entity_ids.empty()) { -// return {}; -// } -// auto [old_row_ranges, col_ranges] = component_manager_->get_entities, std::shared_ptr>(entity_ids, false); -// -// -// bool first_range{true}; -// size_t prev_range_end{0}; -// std::vector> new_row_ranges; -// new_row_ranges.reserve(old_row_ranges.size()); -// for (const auto& old_range: old_row_ranges) { -// auto new_range = std::make_shared(); -// if (first_range) { -// // Make the first row-range start from zero -// new_range->first = 0; -// new_range->second = old_range->diff(); -// first_range = false; -// } else { -// new_range->first = prev_range_end; -// new_range->second = new_range->first + old_range->diff(); -// } -// prev_range_end = new_range->second; -// new_row_ranges.emplace_back(std::move(new_range)); -// } -// -// std::vector ranges_and_entities; -// ranges_and_entities.reserve(entity_ids.size()); -// for (size_t idx=0; idxreplace_entities>(entity_ids, new_row_ranges); -// -// auto new_structure_offsets = structure_by_row_slice(ranges_and_entities); -// return offsets_to_entity_ids(new_structure_offsets, ranges_and_entities); } std::vector ConcatClause::process(std::vector&& entity_ids) const { From fb46e0356a14ae0635e2afbc47dd5dc904cf4c24 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 11:11:13 +0000 Subject: [PATCH 23/36] Test with row slicing --- .../test_symbol_concatenation.py | 34 +++---------------- 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index 6a4defccbd..87773317eb 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -16,8 +16,10 @@ pytestmark = pytest.mark.pipeline -def test_symbol_concat_basic(lmdb_library_factory): - lib = lmdb_library_factory() +@pytest.mark.parametrize("rows_per_segment", [2, 100_000]) +@pytest.mark.parametrize("columns_per_segment", [2, 100_000]) +def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per_segment): + lib = lmdb_library_factory(LibraryOptions(rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) df_1 = pd.DataFrame({"col": np.arange(3, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3)) df_2 = pd.DataFrame({"col": np.arange(4, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4)) df_3 = pd.DataFrame({"col": np.arange(5, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5)) @@ -39,31 +41,3 @@ def test_symbol_concat_basic(lmdb_library_factory): assert_frame_equal(expected, received) -def test_symbol_concat_column_sliced(lmdb_library_factory): - lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) - # lib = lmdb_library_factory() - df_1 = pd.DataFrame( - { - "col1": np.arange(3, dtype=np.int64), - "col2": np.arange(10, 13, dtype=np.int64), - "col3": np.arange(110, 113, dtype=np.int64), - }, - index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3) - ) - df_2 = pd.DataFrame( - { - "col1": np.arange(4, dtype=np.int64), - 
"col2": np.arange(20, 24, dtype=np.int64), - "col3": np.arange(210, 214, dtype=np.int64), - }, - index=pd.date_range(pd.Timestamp(3000), freq="1000ns", periods=4) - ) - lib.write("sym1", df_1) - lib.write("sym2", df_2) - - lazy_df = concat(lib.read_batch(["sym1", "sym2"], lazy=True)) - lazy_df.resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) - received = lazy_df.collect().data - received = received.reindex(columns=sorted(received.columns)) - expected = pd.concat([df_1, df_2]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) - assert_frame_equal(expected, received) From 70a67fa648ea14e2661211c55f6d57376aaed997 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 11:17:55 +0000 Subject: [PATCH 24/36] Use folly::enumerate over index-based loop --- cpp/arcticdb/version/local_versioned_engine.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index 01e7a96a1f..7431768606 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1166,15 +1166,14 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( auto pipeline_context = std::make_shared(); auto norm_meta_mtx = std::make_shared(); DecodePathData shared_data; - for (auto idx = 0UL; idx < opt_index_key_futs.size(); ++idx) { + for (auto&& [idx, opt_index_key_fut]: folly::enumerate(opt_index_key_futs)) { symbol_entities_futs.emplace_back( - std::move(opt_index_key_futs[idx]).thenValue([store = store(), - idx, - read_query = read_queries.empty() ? std::make_shared(): read_queries[idx], - &read_options, - &component_manager, - pipeline_context, - norm_meta_mtx](auto&& opt_index_key) mutable { + std::move(opt_index_key_fut).thenValue([store = store(), + read_query = read_queries.empty() ? 
std::make_shared(): read_queries[idx], + &read_options, + &component_manager, + pipeline_context, + norm_meta_mtx](auto&& opt_index_key) mutable { std::variant version_info; internal::check(opt_index_key.has_value(), "batch_read_with_join_internal not supported with non-indexed data"); auto index_key_seg = store->read_sync(*opt_index_key).second; From 0fd27e34212ab7afd09dbcc24ea5001c83610381 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Mon, 27 Jan 2025 17:43:17 +0000 Subject: [PATCH 25/36] Improve schema checking --- cpp/arcticdb/processing/clause.cpp | 3 +- .../version/local_versioned_engine.cpp | 83 ++++++++++++++++++- .../test_symbol_concatenation.py | 59 +++++++++++-- 3 files changed, 135 insertions(+), 10 deletions(-) diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index c4344a3704..76fa60831c 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -647,6 +647,7 @@ std::vector ResampleClause::process(std::vectorat(0)->start(), row_slices.front().row_ranges_->at(0)->start() + output_index_column->row_count()); + ColRange output_col_range(1, aggregators_.size() + 1); seg.add_column(scalar_field(DataType::NANOSECONDS_UTC64, index_column_name), output_index_column); seg.descriptor().set_index(IndexDescriptorImpl(1, IndexDescriptor::Type::TIMESTAMP)); auto& string_pool = seg.string_pool(); @@ -675,7 +676,7 @@ std::vector ResampleClause::process(std::vectortype().data_type(), aggregator.get_output_column_name().value), aggregated_column); } seg.set_row_data(output_index_column->row_count() - 1); - return push_entities(*component_manager_, ProcessingUnit(std::move(seg), std::move(output_row_range))); + return push_entities(*component_manager_, ProcessingUnit(std::move(seg), std::move(output_row_range), std::move(output_col_range))); } template diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index 7431768606..13e188fc2c 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1151,6 +1151,84 @@ std::vector> LocalVersionedEngine::ba return read_versions_or_errors; } +StreamDescriptor descriptor_from_segments(std::vector& slice_and_keys) { + internal::check(!slice_and_keys.empty(), "No slice and keys in batch_read_with_join_internal"); + std::sort(std::begin(slice_and_keys), std::end(slice_and_keys), [] (const auto& left, const auto& right) { + return std::tie(left.slice_.row_range, left.slice_.col_range) < std::tie(right.slice_.row_range, right.slice_.col_range); + }); + // TODO: Make more memory efficient by only keeping first element in memory, and comparing others on the fly + // Track the field collections for each row-slice + std::vector field_collections; + RowRange current_row_range(std::numeric_limits::max(), std::numeric_limits::max()); + for (const auto& slice_and_key: slice_and_keys) { + const auto& desc = slice_and_key.segment_->descriptor(); + if (slice_and_key.slice().rows() != current_row_range) { + field_collections.emplace_back(); + for (size_t field_idx = 0; field_idx < desc.data().index().field_count(); ++field_idx) { + field_collections.back().add(desc.fields().ref_at(field_idx)); + } + current_row_range = slice_and_key.slice().rows(); + } + for (size_t field_idx = desc.data().index().field_count(); field_idx < desc.field_count(); ++field_idx) { + field_collections.back().add(desc.fields().ref_at(field_idx)); + } + } + // Ensure they are all the same + // TODO: Relax this 
requirement + for (size_t idx = 1; idx < field_collections.size(); ++idx) { + schema::check( + field_collections[0] == field_collections[idx], + "Mismatching fields in multi-symbol join: {} != {}", + field_collections[0], + field_collections[idx] + ); + } + + // Set the output descriptor based on the first row slice + StreamDescriptor res = slice_and_keys[0].segment_->descriptor().clone(); + res.fields_ = std::make_shared(std::move(field_collections[0])); + +// auto index_field_count = res.data().index().field_count(); +// const auto& first_row_range = slice_and_keys[0].slice().rows(); +// for (size_t slice_idx = 1; slice_idx < slice_and_keys.size(); ++slice_idx) { +// const auto& slice_and_key = slice_and_keys[slice_idx]; +// if (slice_and_key.slice().rows() == first_row_range) { +// const auto& desc = slice_and_key.segment_->descriptor(); +// schema::check( +// desc.data().index().field_count() == index_field_count, +// "Mismatching index field count {} != {}", +// desc.data().index().field_count(), +// index_field_count); +// for (size_t field_idx = index_field_count; field_idx < desc.field_count(); ++field_idx) { +// res.add_field(desc.fields().ref_at(field_idx)); +// } +// } else { +// // For other row-slices, check that they match the first +// const auto& desc = slice_and_key.segment_->descriptor(); +// schema::check( +// desc.data().index().field_count() == index_field_count, +// "Mismatching index field count {} != {}", +// desc.data().index().field_count(), +// index_field_count); +// for (size_t field_idx = 0; field_idx < index_field_count; ++field_idx) { +// schema::check( +// desc.fields().ref_at(field_idx) == res.fields().ref_at(field_idx), +// "Mismatching index fields: {} != {}", +// desc.fields().ref_at(field_idx), +// res.fields().ref_at(field_idx)); +// } +// for (size_t field_idx = index_field_count; field_idx < desc.field_count(); ++field_idx) { +// schema::check( +// desc.fields().ref_at(field_idx) == res.fields().ref_at(field_idx + slice_and_key.slice().columns().first - index_field_count), +// "Mismatching fields: {} != {}", +// desc.fields().ref_at(field_idx), +// res.fields().ref_at(field_idx + slice_and_key.slice().columns().first - index_field_count)); +// } +// } +// } + return res; +} + MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( const std::vector& stream_ids, const std::vector& version_queries, @@ -1179,7 +1257,7 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( auto index_key_seg = store->read_sync(*opt_index_key).second; { std::lock_guard lock(*norm_meta_mtx); - pipeline_context->norm_meta_ =std::make_unique(std::move(*index_key_seg.mutable_index_descriptor().mutable_proto().mutable_normalization())); + pipeline_context->norm_meta_ = std::make_unique(std::move(*index_key_seg.mutable_index_descriptor().mutable_proto().mutable_normalization())); } version_info = VersionedItem(std::move(*opt_index_key)); return read_entity_ids_for_version(store, version_info, read_query, read_options, component_manager); @@ -1197,8 +1275,7 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(processed_entity_ids)); return collect_segments(std::move(proc)); }).thenValueInline([store=store(), &handler_data, pipeline_context](std::vector&& slice_and_keys) { - internal::check(!slice_and_keys.empty(), "No slice and keys in batch_read_with_join_internal"); - 
pipeline_context->set_descriptor(slice_and_keys[0].segment(store).descriptor()); + pipeline_context->set_descriptor(descriptor_from_segments(slice_and_keys)); return prepare_output_frame(std::move(slice_and_keys), pipeline_context, store, ReadOptions{}, handler_data); }).thenValueInline([&handler_data, pipeline_context, shared_data](SegmentInMemory&& frame) mutable { return reduce_and_fix_columns(pipeline_context, frame, ReadOptions{}, handler_data) diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index 87773317eb..6da0c5ac72 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -10,6 +10,7 @@ import pytest from arcticdb import col, concat, LazyDataFrame, LazyDataFrameCollection, QueryBuilder, ReadRequest +from arcticdb.exceptions import SchemaException from arcticdb.options import LibraryOptions from arcticdb.util.test import assert_frame_equal @@ -18,11 +19,32 @@ @pytest.mark.parametrize("rows_per_segment", [2, 100_000]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) -def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per_segment): +def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_per_segment): lib = lmdb_library_factory(LibraryOptions(rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) - df_1 = pd.DataFrame({"col": np.arange(3, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3)) - df_2 = pd.DataFrame({"col": np.arange(4, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4)) - df_3 = pd.DataFrame({"col": np.arange(5, dtype=np.int64)}, index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5)) + df_1 = pd.DataFrame( + { + "col1": np.arange(3, dtype=np.int64), + "col2": np.arange(100, 103, dtype=np.int64), + "col3": np.arange(1000, 1003, dtype=np.int64), + }, + index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3), + ) + df_2 = pd.DataFrame( + { + "col1": np.arange(4, dtype=np.int64), + "col2": np.arange(200, 204, dtype=np.int64), + "col3": np.arange(2000, 2004, dtype=np.int64), + }, + index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4), + ) + df_3 = pd.DataFrame( + { + "col1": np.arange(5, dtype=np.int64), + "col2": np.arange(300, 305, dtype=np.int64), + "col3": np.arange(3000, 3005, dtype=np.int64), + }, + index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5), + ) lib.write("sym1", df_1) lib.write("sym2", df_2) lib.write("sym3", df_3) @@ -34,10 +56,35 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per lazy_df = concat([lazy_df_1, lazy_df_2, lazy_df_3]) - lazy_df.resample("2000ns").agg({"col": "sum"}) + lazy_df.resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) received = lazy_df.collect().data - expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2000ns").agg({"col": "sum"}) + received = received.reindex(columns=sorted(received.columns)) + expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) assert_frame_equal(expected, received) +def test_symbol_concat_symbols_with_different_columns(lmdb_library_factory): + lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) + df_1 = pd.DataFrame({"col1": [0], "col3": [1]}) + df_2 = pd.DataFrame({"col2": 
[2], "col3": [3]}) + df_3 = pd.DataFrame({"col1": [4], "col4": [5]}) + df_4 = pd.DataFrame({"col1": [6], "col3": [7], "col5": [8], "col6": [9]}) + lib.write("sym1", df_1) + lib.write("sym2", df_2) + lib.write("sym3", df_3) + lib.write("sym4", df_4) + + # First column different + with pytest.raises(SchemaException): + concat(lib.read_batch(["sym1", "sym2"], lazy=True)).collect() + # Second column different + with pytest.raises(SchemaException): + concat(lib.read_batch(["sym1", "sym3"], lazy=True)).collect() + # First row slice with extra column slice + with pytest.raises(SchemaException): + concat(lib.read_batch(["sym4", "sym1"], lazy=True)).collect() + # Second row slice with extra column slice + with pytest.raises(SchemaException): + concat(lib.read_batch(["sym1", "sym4"], lazy=True)).collect() + From 91c0d571bc696330fd6c0ba497cc1861cbefb9c1 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 29 Jan 2025 15:19:08 +0000 Subject: [PATCH 26/36] Extra tests for non-identical columns between symbols --- .../version/local_versioned_engine.cpp | 39 +------------------ .../test_symbol_concatenation.py | 17 +++++--- 2 files changed, 12 insertions(+), 44 deletions(-) diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index 13e188fc2c..a1503d857a 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1156,6 +1156,7 @@ StreamDescriptor descriptor_from_segments(std::vector& slice_and_ke std::sort(std::begin(slice_and_keys), std::end(slice_and_keys), [] (const auto& left, const auto& right) { return std::tie(left.slice_.row_range, left.slice_.col_range) < std::tie(right.slice_.row_range, right.slice_.col_range); }); + // TODO: Move this check to ::structure_for_processing, as it could be different for different clauses // TODO: Make more memory efficient by only keeping first element in memory, and comparing others on the fly // Track the field collections for each row-slice std::vector field_collections; @@ -1188,44 +1189,6 @@ StreamDescriptor descriptor_from_segments(std::vector& slice_and_ke StreamDescriptor res = slice_and_keys[0].segment_->descriptor().clone(); res.fields_ = std::make_shared(std::move(field_collections[0])); -// auto index_field_count = res.data().index().field_count(); -// const auto& first_row_range = slice_and_keys[0].slice().rows(); -// for (size_t slice_idx = 1; slice_idx < slice_and_keys.size(); ++slice_idx) { -// const auto& slice_and_key = slice_and_keys[slice_idx]; -// if (slice_and_key.slice().rows() == first_row_range) { -// const auto& desc = slice_and_key.segment_->descriptor(); -// schema::check( -// desc.data().index().field_count() == index_field_count, -// "Mismatching index field count {} != {}", -// desc.data().index().field_count(), -// index_field_count); -// for (size_t field_idx = index_field_count; field_idx < desc.field_count(); ++field_idx) { -// res.add_field(desc.fields().ref_at(field_idx)); -// } -// } else { -// // For other row-slices, check that they match the first -// const auto& desc = slice_and_key.segment_->descriptor(); -// schema::check( -// desc.data().index().field_count() == index_field_count, -// "Mismatching index field count {} != {}", -// desc.data().index().field_count(), -// index_field_count); -// for (size_t field_idx = 0; field_idx < index_field_count; ++field_idx) { -// schema::check( -// desc.fields().ref_at(field_idx) == res.fields().ref_at(field_idx), -// "Mismatching index fields: {} != {}", -// 
desc.fields().ref_at(field_idx), -// res.fields().ref_at(field_idx)); -// } -// for (size_t field_idx = index_field_count; field_idx < desc.field_count(); ++field_idx) { -// schema::check( -// desc.fields().ref_at(field_idx) == res.fields().ref_at(field_idx + slice_and_key.slice().columns().first - index_field_count), -// "Mismatching fields: {} != {}", -// desc.fields().ref_at(field_idx), -// res.fields().ref_at(field_idx + slice_and_key.slice().columns().first - index_field_count)); -// } -// } -// } return res; } diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index 6da0c5ac72..f3a31a6bd1 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -64,16 +64,19 @@ def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_p assert_frame_equal(expected, received) -def test_symbol_concat_symbols_with_different_columns(lmdb_library_factory): +@pytest.mark.parametrize("index", [None, [pd.Timestamp(0)]]) +def test_symbol_concat_symbols_with_different_columns(lmdb_library_factory, index): lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) - df_1 = pd.DataFrame({"col1": [0], "col3": [1]}) - df_2 = pd.DataFrame({"col2": [2], "col3": [3]}) - df_3 = pd.DataFrame({"col1": [4], "col4": [5]}) - df_4 = pd.DataFrame({"col1": [6], "col3": [7], "col5": [8], "col6": [9]}) + df_1 = pd.DataFrame({"col1": [0], "col3": [0]}, index=index) + df_2 = pd.DataFrame({"col2": [0], "col3": [0]}, index=index) + df_3 = pd.DataFrame({"col1": [0], "col4": [0]}, index=index) + df_4 = pd.DataFrame({"col1": [0], "col3": [0], "col5": [0], "col6": [0]}, index=index) + df_5 = pd.DataFrame({"col1": [0], "col3": [0], "col5": [0], "col7": [0]}, index=index) lib.write("sym1", df_1) lib.write("sym2", df_2) lib.write("sym3", df_3) lib.write("sym4", df_4) + lib.write("sym5", df_5) # First column different with pytest.raises(SchemaException): @@ -87,4 +90,6 @@ def test_symbol_concat_symbols_with_different_columns(lmdb_library_factory): # Second row slice with extra column slice with pytest.raises(SchemaException): concat(lib.read_batch(["sym1", "sym4"], lazy=True)).collect() - + # Row slices differ only in second column slice + with pytest.raises(SchemaException): + concat(lib.read_batch(["sym4", "sym5"], lazy=True)).collect() From cf13b8f44672ae492f7984def7bf6524ad72c855 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 29 Jan 2025 15:45:54 +0000 Subject: [PATCH 27/36] Test exception thrown with mixed index types --- .../test_symbol_concatenation.py | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index f3a31a6bd1..eb8b841cf5 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -17,6 +17,46 @@ pytestmark = pytest.mark.pipeline +@pytest.mark.parametrize("rows_per_segment", [2, 100_000]) +@pytest.mark.parametrize("columns_per_segment", [2, 100_000]) +@pytest.mark.parametrize("index", [None, pd.date_range("2025-01-01", periods=12)]) +def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per_segment, index): + lib = lmdb_library_factory(LibraryOptions(rows_per_segment=rows_per_segment, 
columns_per_segment=columns_per_segment)) + df_1 = pd.DataFrame( + { + "col1": np.arange(3, dtype=np.int64), + "col2": np.arange(100, 103, dtype=np.int64), + "col3": np.arange(1000, 1003, dtype=np.int64), + }, + index=index[:3] if index is not None else None, + ) + df_2 = pd.DataFrame( + { + "col1": np.arange(4, dtype=np.int64), + "col2": np.arange(200, 204, dtype=np.int64), + "col3": np.arange(2000, 2004, dtype=np.int64), + }, + index=index[3:7] if index is not None else None, + ) + df_3 = pd.DataFrame( + { + "col1": np.arange(5, dtype=np.int64), + "col2": np.arange(300, 305, dtype=np.int64), + "col3": np.arange(3000, 3005, dtype=np.int64), + }, + index=index[7:] if index is not None else None, + ) + lib.write("sym1", df_1) + lib.write("sym2", df_2) + lib.write("sym3", df_3) + + received = concat(lib.read_batch(["sym1", "sym2", "sym3"], lazy=True)).collect().data + expected = pd.concat([df_1, df_2, df_3]) + if index is None: + expected.index = pd.RangeIndex(len(expected)) + assert_frame_equal(expected, received) + + @pytest.mark.parametrize("rows_per_segment", [2, 100_000]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_per_segment): @@ -93,3 +133,36 @@ def test_symbol_concat_symbols_with_different_columns(lmdb_library_factory, inde # Row slices differ only in second column slice with pytest.raises(SchemaException): concat(lib.read_batch(["sym4", "sym5"], lazy=True)).collect() + + +def test_symbol_concat_symbols_with_different_indexes(lmdb_library): + lib = lmdb_library + df_1 = pd.DataFrame({"col": [0]}, index=pd.RangeIndex(1)) + df_2 = pd.DataFrame({"col": [0]}, index=[pd.Timestamp(0)]) + dt1 = pd.Timestamp(0) + dt2 = pd.Timestamp(1) + arr1 = [dt1, dt1, dt2, dt2] + arr2 = [0, 1, 0, 1] + df_3 = pd.DataFrame({"col": [0, 0, 0, 0]}, index=pd.MultiIndex.from_arrays([arr1, arr2], names=["datetime", "level"])) + + lib.write("range_index_sym", df_1) + lib.write("timestamp_index_sym", df_2) + lib.write("multiindex_sym", df_3) + + with pytest.raises(SchemaException): + concat(lib.read_batch(["range_index_sym", "timestamp_index_sym"], lazy=True)).collect() + + with pytest.raises(SchemaException): + concat(lib.read_batch(["timestamp_index_sym", "range_index_sym"], lazy=True)).collect() + + with pytest.raises(SchemaException): + concat(lib.read_batch(["range_index_sym", "multiindex_sym"], lazy=True)).collect() + + with pytest.raises(SchemaException): + concat(lib.read_batch(["multiindex_sym", "range_index_sym"], lazy=True)).collect() + + with pytest.raises(SchemaException): + concat(lib.read_batch(["timestamp_index_sym", "multiindex_sym"], lazy=True)).collect() + + with pytest.raises(SchemaException): + concat(lib.read_batch(["multiindex_sym", "timestamp_index_sym"], lazy=True)).collect() From 3dc2020407ae00b02fd28724728c9d4fc4a7982d Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 29 Jan 2025 15:52:40 +0000 Subject: [PATCH 28/36] Test concatenating multiindexed data --- .../test_symbol_concatenation.py | 20 +++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index eb8b841cf5..1b6e531fe9 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -57,6 +57,26 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per
assert_frame_equal(expected, received) +@pytest.mark.parametrize("rows_per_segment", [2, 100_000]) +@pytest.mark.parametrize("columns_per_segment", [2, 100_000]) +def test_symbol_concat_multiindex(lmdb_library_factory, rows_per_segment, columns_per_segment): + lib = lmdb_library_factory(LibraryOptions(rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) + df = pd.DataFrame( + { + "col1": np.arange(12, dtype=np.int64), + "col2": np.arange(100, 112, dtype=np.int64), + "col3": np.arange(1000, 1012, dtype=np.int64), + }, + index=pd.MultiIndex.from_product([pd.date_range("2025-01-01", periods=4), [0, 1, 2]], names=["datetime", "level"]), + ) + lib.write("sym1", df[:3]) + lib.write("sym2", df[3:7]) + lib.write("sym3", df[7:]) + + received = concat(lib.read_batch(["sym1", "sym2", "sym3"], lazy=True)).collect().data + assert_frame_equal(df, received) + + @pytest.mark.parametrize("rows_per_segment", [2, 100_000]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_per_segment): From e9f5eeb8597cc13e041005c46d7c4b8d82c0b5f8 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 29 Jan 2025 16:04:00 +0000 Subject: [PATCH 29/36] Test pickled data behaviour --- cpp/arcticdb/version/local_versioned_engine.cpp | 3 +++ cpp/arcticdb/version/version_core.cpp | 4 ++-- .../version_store/test_symbol_concatenation.py | 11 +++++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index a1503d857a..2aed2c2d77 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1216,10 +1216,13 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( pipeline_context, norm_meta_mtx](auto&& opt_index_key) mutable { std::variant version_info; + // TODO: Add support for symbols that only have incomplete segments internal::check(opt_index_key.has_value(), "batch_read_with_join_internal not supported with non-indexed data"); + // TODO: Only read the index segment once auto index_key_seg = store->read_sync(*opt_index_key).second; { std::lock_guard lock(*norm_meta_mtx); + // TODO: Construct this at the end of the pipeline, instead of reusing input data pipeline_context->norm_meta_ = std::make_unique(std::move(*index_key_seg.mutable_index_descriptor().mutable_proto().mutable_normalization())); } version_info = VersionedItem(std::move(*opt_index_key)); diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index 3749f46cff..3012e325d5 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -1596,7 +1596,7 @@ folly::Future> do_process( const ReadOptions& read_options, const std::shared_ptr& pipeline_context, std::shared_ptr component_manager) { - util::check_rte(!pipeline_context->is_pickled(),"Cannot perform multi-symbol join on pickled data"); + schema::check(!pipeline_context->is_pickled(),"Cannot perform multi-symbol join on pickled data"); return read_and_process_2(store, pipeline_context, read_query, read_options, component_manager); } @@ -2110,7 +2110,7 @@ folly::Future> read_entity_ids_for_version( read_indexed_keys_to_pipeline(store, pipeline_context, std::get(version_info), *read_query, read_options); } - user_input::check(!pipeline_context->multi_key_, "Multi-symbol joines not supported with recursively normalized data"); + 
user_input::check(!pipeline_context->multi_key_, "Multi-symbol joins not supported with recursively normalized data"); if(opt_false(read_options.incompletes_)) { util::check(std::holds_alternative(read_query->row_filter), "Streaming read requires date range filter"); diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index 1b6e531fe9..8c8eb8c2d5 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -186,3 +186,14 @@ with pytest.raises(SchemaException): concat(lib.read_batch(["multiindex_sym", "timestamp_index_sym"], lazy=True)).collect() + + +def test_symbol_concat_pickled_data(lmdb_library): + lib = lmdb_library + df = pd.DataFrame({"bytes": np.arange(10, dtype=np.uint64)}) + pickled_data = {"hi", "there"} + lib.write("sym1", df) + lib.write_pickle("sym2", pickled_data) + + with pytest.raises(SchemaException): + concat(lib.read_batch(["sym1", "sym2"], lazy=True)).collect() From 9ed8d7c8acc353811c330b38bbe0015a2955bab0 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 29 Jan 2025 16:17:08 +0000 Subject: [PATCH 30/36] Test date_range provided via read arg and via clause --- .../version/local_versioned_engine.cpp | 5 ++- .../test_symbol_concatenation.py | 32 +++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index 2aed2c2d77..c720b12212 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1175,7 +1175,10 @@ StreamDescriptor descriptor_from_segments(std::vector& slice_and_ke } } // Ensure they are all the same - // TODO: Relax this requirement + // TODO: Relax this requirement: + // - columns in different orders + // - columns of different but promotable types + // - extra/missing columns in different row slices for (size_t idx = 1; idx < field_collections.size(); ++idx) { schema::check( field_collections[0] == field_collections[idx], diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index 8c8eb8c2d5..f53e2aebe6 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -77,6 +77,34 @@ def test_symbol_concat_multiindex(lmdb_library_factory, rows_per_segment, column assert_frame_equal(df, received) +def test_symbol_concat_with_date_range(lmdb_library): + lib = lmdb_library + df_1 = pd.DataFrame( + { + "col1": np.arange(3, dtype=np.int64), + }, + index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3), + ) + df_2 = pd.DataFrame( + { + "col1": np.arange(4, dtype=np.int64), + }, + index=pd.date_range(pd.Timestamp(1000), freq="1000ns", periods=4), + ) + lib.write("sym1", df_1) + lib.write("sym2", df_2) + + # Use date_range arg to trim last row from sym1 + lazy_df_1 = lib.read("sym1", date_range=(None, pd.Timestamp(1000)), lazy=True) + # Use date_range clause to trim first row from sym2 + lazy_df_2 = lib.read("sym2", lazy=True) + lazy_df_2 = lazy_df_2.date_range((pd.Timestamp(2000), None)) + + received = concat([lazy_df_1, lazy_df_2]).collect().data + expected = pd.concat([df_1[:2], df_2[1:]]) + 
assert_frame_equal(expected, received) + + @pytest.mark.parametrize("rows_per_segment", [2, 100_000]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_per_segment): @@ -112,7 +140,7 @@ def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_p lazy_df_1 = lib.read("sym1", lazy=True) lazy_df_2 = lib.read("sym2", lazy=True) lazy_df_2 = lazy_df_2.date_range((pd.Timestamp(pd.Timestamp(3000)), None)) - lazy_df_3 = lib.read("sym3", lazy=True) + lazy_df_3 = lib.read("sym3", date_range=(None, pd.Timestamp(9000)), lazy=True) lazy_df = concat([lazy_df_1, lazy_df_2, lazy_df_3]) @@ -120,7 +148,7 @@ def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_p received = lazy_df.collect().data received = received.reindex(columns=sorted(received.columns)) - expected = pd.concat([df_1, df_2.iloc[1:], df_3]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) + expected = pd.concat([df_1, df_2[1:], df_3[:4]]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) assert_frame_equal(expected, received) From 7c7e52fa86a30c0db8b7c4a8cacfec221e8f5851 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Wed, 29 Jan 2025 16:49:00 +0000 Subject: [PATCH 31/36] Add failing test with column slicing --- .../test_symbol_concatenation.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index f53e2aebe6..e909f1296a 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -57,6 +57,40 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per assert_frame_equal(expected, received) +# TODO: Get working with column slicing +@pytest.mark.xfail(reason="Not yet working with column slicing") +@pytest.mark.parametrize("rows_per_segment", [2, 100_000]) +@pytest.mark.parametrize("columns_per_segment", [2, 100_000]) +@pytest.mark.parametrize("columns", [["col1"], ["col2"], ["col3"], ["col1", "col2"], ["col1", "col3"], ["col2", "col3"]]) +def test_symbol_concat_column_slicing(lmdb_library_factory, rows_per_segment, columns_per_segment, columns): + lib = lmdb_library_factory(LibraryOptions(rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) + df_1 = pd.DataFrame( + { + "col1": np.arange(3, dtype=np.int64), + "col2": np.arange(100, 103, dtype=np.int64), + "col3": np.arange(1000, 1003, dtype=np.int64), + }, + ) + df_2 = pd.DataFrame( + { + "col0": np.arange(10, 14, dtype=np.int64), + "col1": np.arange(4, dtype=np.int64), + "col2": np.arange(200, 204, dtype=np.int64), + "col3": np.arange(2000, 2004, dtype=np.int64), + }, + ) + lib.write("sym1", df_1) + lib.write("sym2", df_2) + + lazy_df_1 = lib.read("sym1", columns=columns, lazy=True) + lazy_df_2 = lib.read("sym2", columns=columns, lazy=True) + + received = concat([lazy_df_1, lazy_df_2]).collect().data + expected = pd.concat([df_1.loc[:, columns], df_2.loc[:, columns]]) + expected.index = pd.RangeIndex(len(expected)) + assert_frame_equal(expected, received) + + @pytest.mark.parametrize("rows_per_segment", [2, 100_000]) @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) def test_symbol_concat_multiindex(lmdb_library_factory, rows_per_segment, columns_per_segment): From 
4280c562a4d12e41c9ccecda0f5a7aedd87e35d7 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Thu, 30 Jan 2025 11:17:11 +0000 Subject: [PATCH 32/36] Return list of symbol/version pairs of input symbols when joining --- cpp/arcticdb/entity/read_result.hpp | 6 +- .../version/local_versioned_engine.cpp | 15 ++- cpp/arcticdb/version/version_store_api.cpp | 2 +- python/arcticdb/version_store/_store.py | 47 +++++-- python/arcticdb/version_store/library.py | 6 +- .../test_symbol_concatenation.py | 116 +++++++++--------- 6 files changed, 115 insertions(+), 77 deletions(-) diff --git a/cpp/arcticdb/entity/read_result.hpp b/cpp/arcticdb/entity/read_result.hpp index 526407f073..e05f2f3e41 100644 --- a/cpp/arcticdb/entity/read_result.hpp +++ b/cpp/arcticdb/entity/read_result.hpp @@ -21,7 +21,7 @@ namespace arcticdb { struct ARCTICDB_VISIBILITY_HIDDEN ReadResult { ReadResult( - const VersionedItem& versioned_item, + const std::variant>& versioned_item, pipelines::PythonOutputFrame&& frame_data, const arcticdb::proto::descriptors::NormalizationMetadata& norm_meta, const arcticdb::proto::descriptors::UserDefinedMetadata& user_meta, @@ -35,7 +35,7 @@ struct ARCTICDB_VISIBILITY_HIDDEN ReadResult { multi_keys(std::move(multi_keys)) { } - VersionedItem item; + std::variant> item; pipelines::PythonOutputFrame frame_data; arcticdb::proto::descriptors::NormalizationMetadata norm_meta; arcticdb::proto::descriptors::UserDefinedMetadata user_meta; @@ -46,7 +46,7 @@ struct ARCTICDB_VISIBILITY_HIDDEN ReadResult { }; inline ReadResult create_python_read_result( - const VersionedItem& version, + const std::variant>& version, FrameAndDescriptor&& fd) { auto result = std::move(fd); // Very old (pre Nov-2020) PandasIndex protobuf messages had no "start" or "step" fields. If is_physically_stored diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index c720b12212..335c2adbe8 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1209,15 +1209,19 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( auto component_manager = std::make_shared(); auto pipeline_context = std::make_shared(); auto norm_meta_mtx = std::make_shared(); + // TODO: Generate this in a less hacky way + auto res_versioned_items = std::make_shared>(stream_ids.size()); DecodePathData shared_data; for (auto&& [idx, opt_index_key_fut]: folly::enumerate(opt_index_key_futs)) { symbol_entities_futs.emplace_back( std::move(opt_index_key_fut).thenValue([store = store(), read_query = read_queries.empty() ? 
std::make_shared(): read_queries[idx], + idx, &read_options, &component_manager, pipeline_context, - norm_meta_mtx](auto&& opt_index_key) mutable { + norm_meta_mtx, + res_versioned_items](auto&& opt_index_key) mutable { std::variant version_info; // TODO: Add support for symbols that only have incomplete segments internal::check(opt_index_key.has_value(), "batch_read_with_join_internal not supported with non-indexed data"); @@ -1228,6 +1232,8 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( // TODO: Construct this at the end of the pipeline, instead of reusing input data pipeline_context->norm_meta_ = std::make_unique(std::move(*index_key_seg.mutable_index_descriptor().mutable_proto().mutable_normalization())); } + // Shouldn't need mutex protecting as vector is presized and each element is only accessed once + (*res_versioned_items)[idx] = VersionedItem(*opt_index_key); version_info = VersionedItem(std::move(*opt_index_key)); return read_entity_ids_for_version(store, version_info, read_query, read_options, component_manager); }) @@ -1244,12 +1250,13 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( auto proc = gather_entities, std::shared_ptr, std::shared_ptr>(*component_manager, std::move(processed_entity_ids)); return collect_segments(std::move(proc)); }).thenValueInline([store=store(), &handler_data, pipeline_context](std::vector&& slice_and_keys) { + // TODO: Make constructing the output descriptor a standard end of pipeline operation pipeline_context->set_descriptor(descriptor_from_segments(slice_and_keys)); return prepare_output_frame(std::move(slice_and_keys), pipeline_context, store, ReadOptions{}, handler_data); - }).thenValueInline([&handler_data, pipeline_context, shared_data](SegmentInMemory&& frame) mutable { + }).thenValueInline([&handler_data, pipeline_context, shared_data, res_versioned_items](SegmentInMemory&& frame) mutable { return reduce_and_fix_columns(pipeline_context, frame, ReadOptions{}, handler_data) - .thenValue([pipeline_context, frame, shared_data](auto&&) mutable { - return MultiSymbolReadOutput{{}, + .thenValue([pipeline_context, frame, shared_data, res_versioned_items](auto&&) mutable { + return MultiSymbolReadOutput{std::move(*res_versioned_items), {frame, timeseries_descriptor_from_pipeline_context(pipeline_context, {}, pipeline_context->bucketize_dynamic_), {}, diff --git a/cpp/arcticdb/version/version_store_api.cpp b/cpp/arcticdb/version/version_store_api.cpp index 917d259526..283cfe7cba 100644 --- a/cpp/arcticdb/version/version_store_api.cpp +++ b/cpp/arcticdb/version/version_store_api.cpp @@ -795,7 +795,7 @@ ReadResult PythonVersionStore::batch_read_with_join( std::vector>&& clauses, std::any& handler_data) { auto versions_and_frame = batch_read_with_join_internal(stream_ids, version_queries, read_queries, read_options, std::move(clauses), handler_data); - return create_python_read_result({}, std::move(versions_and_frame.frame_and_descriptor_)); + return create_python_read_result(versions_and_frame.versioned_items_, std::move(versions_and_frame.frame_and_descriptor_)); } void PythonVersionStore::delete_snapshot(const SnapshotId& snap_name) { diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py index 32a32360a5..0f488a1343 100644 --- a/python/arcticdb/version_store/_store.py +++ b/python/arcticdb/version_store/_store.py @@ -7,6 +7,7 @@ """ import copy +from dataclasses import dataclass import datetime import os import sys @@ -124,6 +125,12 @@ def 
__iter__(self): # Backwards compatible with the old NamedTuple implementati return iter(attr.astuple(self)) +@dataclass +class VersionedItemWithJoin: + versions: List[VersionedItem] + data: Any + + def _env_config_from_lib_config(lib_cfg, env): cfg = EnvironmentConfigsMap() e = cfg.env_by_id[env] @@ -2094,22 +2101,42 @@ def _get_index_columns_from_descriptor(descriptor): return index_columns - def _adapt_read_res(self, read_result: ReadResult) -> VersionedItem: + def _adapt_read_res(self, read_result: ReadResult) -> Union[VersionedItem, VersionedItemWithJoin]: frame_data = FrameData.from_cpp(read_result.frame_data) + # TODO: Return metadata for all symbol/version pairs as well meta = denormalize_user_metadata(read_result.udm, self._normalizer) data = self._normalizer.denormalize(frame_data, read_result.norm) if read_result.norm.HasField("custom"): data = self._custom_normalizer.denormalize(data, read_result.norm.custom) - return VersionedItem( - symbol=read_result.version.symbol, - library=self._library.library_path, - data=data, - version=read_result.version.version, - metadata=meta, - host=self.env, - timestamp=read_result.version.timestamp, - ) + if isinstance(read_result.version, list): + versions = [] + for idx in range(len(read_result.version)): + versions.append( + VersionedItem( + symbol=read_result.version[idx].symbol, + library=self._library.library_path, + data=None, + version=read_result.version[idx].version, + metadata=meta, + host=self.env, + timestamp=read_result.version[idx].timestamp, + ) + ) + return VersionedItemWithJoin( + versions=versions, + data=data, + ) + else: + return VersionedItem( + symbol=read_result.version.symbol, + library=self._library.library_path, + data=data, + version=read_result.version.version, + metadata=meta, + host=self.env, + timestamp=read_result.version.timestamp, + ) def list_versions( self, diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py index f8bca21629..854d53358a 100644 --- a/python/arcticdb/version_store/library.py +++ b/python/arcticdb/version_store/library.py @@ -20,7 +20,7 @@ from arcticdb.util._versions import IS_PANDAS_TWO from arcticdb.version_store.processing import ExpressionNode, QueryBuilder -from arcticdb.version_store._store import NativeVersionStore, VersionedItem, VersionQueryInput +from arcticdb.version_store._store import NativeVersionStore, VersionedItem, VersionedItemWithJoin, VersionQueryInput from arcticdb_ext.exceptions import ArcticException from arcticdb_ext.version_store import DataError, ConcatClause as _ConcatClause import pandas as pd import numpy as np import logging @@ -488,7 +488,7 @@ def __init__( self._lazy_dataframes = lazy_dataframes self._join = join - def collect(self) -> VersionedItem: + def collect(self) -> VersionedItemWithJoin: if not len(self._lazy_dataframes._lazy_dataframes): return [] else: @@ -1679,7 +1679,7 @@ def _read_batch_with_join( read_requests: List[ReadRequest], join: Any, query_builder: Optional[QueryBuilder] = None, - ) -> VersionedItem: + ) -> VersionedItemWithJoin: symbol_strings = [] as_ofs = [] date_ranges = []
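For illustration, a sketch of how the new VersionedItemWithJoin return type is consumed (assumes an already-populated library handle `lib` with symbols "sym0" and "sym1"; only fields defined in the dataclass above are used):

joined = concat(lib.read_batch(["sym0", "sym1"], lazy=True)).collect()

# One VersionedItem per input symbol, each with data=None; the joined
# frame is carried once, on the outer VersionedItemWithJoin.
for item in joined.versions:
    print(item.symbol, item.version, item.timestamp)
print(joined.data)

diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index e909f1296a..df77e1a8cf 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -22,7 +22,7 @@ @pytest.mark.parametrize("index", [None, pd.date_range("2025-01-01", periods=12)]) def 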
test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per_segment, index): lib = lmdb_library_factory(LibraryOptions(rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) - df_1 = pd.DataFrame( + df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), "col2": np.arange(100, 103, dtype=np.int64), @@ -30,7 +30,7 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per }, index=index[:3] if index is not None else None, ) - df_2 = pd.DataFrame( + df_1 = pd.DataFrame( { "col1": np.arange(4, dtype=np.int64), "col2": np.arange(200, 204, dtype=np.int64), @@ -38,7 +38,7 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per }, index=index[3:7] if index is not None else None, ) - df_3 = pd.DataFrame( + df_2 = pd.DataFrame( { "col1": np.arange(5, dtype=np.int64), "col2": np.arange(300, 305, dtype=np.int64), @@ -46,15 +46,19 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per }, index=index[7:] if index is not None else None, ) + lib.write("sym0", df_0) lib.write("sym1", df_1) lib.write("sym2", df_2) - lib.write("sym3", df_3) - received = concat(lib.read_batch(["sym1", "sym2", "sym3"], lazy=True)).collect().data - expected = pd.concat([df_1, df_2, df_3]) + received = concat(lib.read_batch(["sym0", "sym1", "sym2"], lazy=True)).collect() + expected = pd.concat([df_0, df_1, df_2]) if index is None: expected.index = pd.RangeIndex(len(expected)) - assert_frame_equal(expected, received) + assert_frame_equal(expected, received.data) + for idx, version in enumerate(received.versions): + assert version.symbol == f"sym{idx}" + assert version.version == 0 + assert version.data is None # TODO: Get working with column slicing @@ -64,14 +68,14 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per @pytest.mark.parametrize("columns", [["col1"], ["col2"], ["col3"], ["col1", "col2"], ["col1", "col3"], ["col2", "col3"]]) def test_symbol_concat_column_slicing(lmdb_library_factory, rows_per_segment, columns_per_segment, columns): lib = lmdb_library_factory(LibraryOptions(rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) - df_1 = pd.DataFrame( + df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), "col2": np.arange(100, 103, dtype=np.int64), "col3": np.arange(1000, 1003, dtype=np.int64), }, ) - df_2 = pd.DataFrame( + df_1 = pd.DataFrame( { "col0": np.arange(10, 14, dtype=np.int64), "col1": np.arange(4, dtype=np.int64), @@ -79,14 +83,14 @@ def test_symbol_concat_column_slicing(lmdb_library_factory, rows_per_segment, co "col3": np.arange(2000, 2004, dtype=np.int64), }, ) + lib.write("sym0", df_0) lib.write("sym1", df_1) - lib.write("sym2", df_2) + lazy_df_0 = lib.read("sym0", columns=columns, lazy=True) lazy_df_1 = lib.read("sym1", columns=columns, lazy=True) - lazy_df_2 = lib.read("sym2", columns=columns, lazy=True) - received = concat([lazy_df_1, lazy_df_2]).collect().data - expected = pd.concat([df_1.loc[:, columns], df_2.loc[:, columns]]) + received = concat([lazy_df_0, lazy_df_1]).collect().data + expected = pd.concat([df_0.loc[:, columns], df_1.loc[:, columns]]) expected.index = pd.RangeIndex(len(expected)) assert_frame_equal(expected, received) @@ -103,39 +107,39 @@ def test_symbol_concat_multiindex(lmdb_library_factory, rows_per_segment, column }, index=pd.MultiIndex.from_product([pd.date_range("2025-01-01", periods=4), [0, 1, 2]], names=["datetime", "level"]), ) - lib.write("sym1", df[:3]) - lib.write("sym2", df[3:7]) - 
lib.write("sym3", df[7:]) + lib.write("sym0", df[:3]) + lib.write("sym1", df[3:7]) + lib.write("sym2", df[7:]) - received = concat(lib.read_batch(["sym1", "sym2", "sym3"], lazy=True)).collect().data + received = concat(lib.read_batch(["sym0", "sym1", "sym2"], lazy=True)).collect().data assert_frame_equal(df, received) def test_symbol_concat_with_date_range(lmdb_library): lib = lmdb_library - df_1 = pd.DataFrame( + df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), }, index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3), ) - df_2 = pd.DataFrame( + df_1 = pd.DataFrame( { "col1": np.arange(4, dtype=np.int64), }, index=pd.date_range(pd.Timestamp(1000), freq="1000ns", periods=4), ) + lib.write("sym0", df_0) lib.write("sym1", df_1) - lib.write("sym2", df_2) - # Use date_range arg to trim last row from sym1 - lazy_df_1 = lib.read("sym1", date_range=(None, pd.Timestamp(1000)), lazy=True) - # Use date_range clause to trim first row from sym2 - lazy_df_2 = lib.read("sym2", lazy=True) - lazy_df_2 = lazy_df_2.date_range((pd.Timestamp(2000), None)) + # Use date_range arg to trim last row from sym0 + lazy_df_0 = lib.read("sym0", date_range=(None, pd.Timestamp(1000)), lazy=True) + # Use date_range clause to trim first row from sym1 + lazy_df_1 = lib.read("sym1", lazy=True) + lazy_df_1 = lazy_df_1.date_range((pd.Timestamp(2000), None)) - received = concat([lazy_df_1, lazy_df_2]).collect().data - expected = pd.concat([df_1[:2], df_2[1:]]) + received = concat([lazy_df_0, lazy_df_1]).collect().data + expected = pd.concat([df_0[:2], df_1[1:]]) assert_frame_equal(expected, received) @@ -143,7 +147,7 @@ def test_symbol_concat_with_date_range(lmdb_library): @pytest.mark.parametrize("columns_per_segment", [2, 100_000]) def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_per_segment): lib = lmdb_library_factory(LibraryOptions(rows_per_segment=rows_per_segment, columns_per_segment=columns_per_segment)) - df_1 = pd.DataFrame( + df_0 = pd.DataFrame( { "col1": np.arange(3, dtype=np.int64), "col2": np.arange(100, 103, dtype=np.int64), @@ -151,7 +155,7 @@ def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_p }, index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3), ) - df_2 = pd.DataFrame( + df_1 = pd.DataFrame( { "col1": np.arange(4, dtype=np.int64), "col2": np.arange(200, 204, dtype=np.int64), @@ -159,7 +163,7 @@ def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_p }, index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4), ) - df_3 = pd.DataFrame( + df_2 = pd.DataFrame( { "col1": np.arange(5, dtype=np.int64), "col2": np.arange(300, 305, dtype=np.int64), @@ -167,69 +171,69 @@ def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_p }, index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5), ) + lib.write("sym0", df_0) lib.write("sym1", df_1) lib.write("sym2", df_2) - lib.write("sym3", df_3) + lazy_df_0 = lib.read("sym0", lazy=True) lazy_df_1 = lib.read("sym1", lazy=True) - lazy_df_2 = lib.read("sym2", lazy=True) - lazy_df_2 = lazy_df_2.date_range((pd.Timestamp(pd.Timestamp(3000)), None)) - lazy_df_3 = lib.read("sym3", date_range=(None, pd.Timestamp(9000)), lazy=True) + lazy_df_1 = lazy_df_1.date_range((pd.Timestamp(pd.Timestamp(3000)), None)) + lazy_df_2 = lib.read("sym2", date_range=(None, pd.Timestamp(9000)), lazy=True) - lazy_df = concat([lazy_df_1, lazy_df_2, lazy_df_3]) + lazy_df = concat([lazy_df_0, lazy_df_1, lazy_df_2]) 
lazy_df.resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) received = lazy_df.collect().data received = received.reindex(columns=sorted(received.columns)) - expected = pd.concat([df_1, df_2[1:], df_3[:4]]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) + expected = pd.concat([df_0, df_1[1:], df_2[:4]]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"}) assert_frame_equal(expected, received) @pytest.mark.parametrize("index", [None, [pd.Timestamp(0)]]) def test_symbol_concat_symbols_with_different_columns(lmdb_library_factory, index): lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2)) - df_1 = pd.DataFrame({"col1": [0], "col3": [0]}, index=index) - df_2 = pd.DataFrame({"col2": [0], "col3": [0]}, index=index) - df_3 = pd.DataFrame({"col1": [0], "col4": [0]}, index=index) - df_4 = pd.DataFrame({"col1": [0], "col3": [0], "col5": [0], "col6": [0]}, index=index) - df_5 = pd.DataFrame({"col1": [0], "col3": [0], "col5": [0], "col7": [0]}, index=index) + df_0 = pd.DataFrame({"col1": [0], "col3": [0]}, index=index) + df_1 = pd.DataFrame({"col2": [0], "col3": [0]}, index=index) + df_2 = pd.DataFrame({"col1": [0], "col4": [0]}, index=index) + df_3 = pd.DataFrame({"col1": [0], "col3": [0], "col5": [0], "col6": [0]}, index=index) + df_4 = pd.DataFrame({"col1": [0], "col3": [0], "col5": [0], "col7": [0]}, index=index) + lib.write("sym0", df_0) lib.write("sym1", df_1) lib.write("sym2", df_2) lib.write("sym3", df_3) lib.write("sym4", df_4) - lib.write("sym5", df_5) # First column different with pytest.raises(SchemaException): - concat(lib.read_batch(["sym1", "sym2"], lazy=True)).collect() + concat(lib.read_batch(["sym0", "sym1"], lazy=True)).collect() # Second column different with pytest.raises(SchemaException): - concat(lib.read_batch(["sym1", "sym3"], lazy=True)).collect() + concat(lib.read_batch(["sym0", "sym2"], lazy=True)).collect() # First row slice with extra column slice with pytest.raises(SchemaException): - concat(lib.read_batch(["sym4", "sym1"], lazy=True)).collect() + concat(lib.read_batch(["sym3", "sym0"], lazy=True)).collect() # Second row slice with extra column slice with pytest.raises(SchemaException): - concat(lib.read_batch(["sym1", "sym4"], lazy=True)).collect() + concat(lib.read_batch(["sym0", "sym3"], lazy=True)).collect() # Row slices differ only in second column slice with pytest.raises(SchemaException): - concat(lib.read_batch(["sym4", "sym5"], lazy=True)).collect() + concat(lib.read_batch(["sym3", "sym4"], lazy=True)).collect() def test_symbol_concat_symbols_with_different_indexes(lmdb_library): lib = lmdb_library - df_1 = pd.DataFrame({"col": [0]}, index=pd.RangeIndex(1)) - df_2 = pd.DataFrame({"col": [0]}, index=[pd.Timestamp(0)]) + df_0 = pd.DataFrame({"col": [0]}, index=pd.RangeIndex(1)) + df_1 = pd.DataFrame({"col": [0]}, index=[pd.Timestamp(0)]) dt1 = pd.Timestamp(0) dt2 = pd.Timestamp(1) arr1 = [dt1, dt1, dt2, dt2] arr2 = [0, 1, 0, 1] - df_3 = pd.DataFrame({"col": [0, 0, 0, 0]}, index=pd.MultiIndex.from_arrays([arr1, arr2], names=["datetime", "level"])) + df_2 = pd.DataFrame({"col": [0, 0, 0, 0]}, index=pd.MultiIndex.from_arrays([arr1, arr2], names=["datetime", "level"])) - lib.write("range_index_sym", df_1) - lib.write("timestamp_index_sym", df_2) - lib.write("multiindex_sym", df_3) + lib.write("range_index_sym", df_0) + lib.write("timestamp_index_sym", df_1) + lib.write("multiindex_sym", df_2) with pytest.raises(SchemaException): concat(lib.read_batch(["range_index_sym", "timestamp_index_sym"], 
lazy=True)).collect() @@ -254,8 +258,8 @@ def test_symbol_concat_pickled_data(lmdb_library): lib = lmdb_library df = pd.DataFrame({"bytes": np.arange(10, dtype=np.uint64)}) pickled_data = {"hi", "there"} - lib.write("sym1", df) - lib.write_pickle("sym2", pickled_data) + lib.write("sym0", df) + lib.write_pickle("sym1", pickled_data) with pytest.raises(SchemaException): - concat(lib.read_batch(["sym1", "sym2"], lazy=True)).collect() + concat(lib.read_batch(["sym0", "sym1"], lazy=True)).collect() From 24b62e2246af87757959a27425c6a6a83575d19f Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Thu, 30 Jan 2025 12:22:13 +0000 Subject: [PATCH 33/36] Per-symbol metadatas returned to user --- cpp/arcticdb/entity/read_result.hpp | 15 +++++++++++---- cpp/arcticdb/python/adapt_read_dataframe.hpp | 13 ++++++++++++- cpp/arcticdb/python/python_utils.hpp | 2 +- cpp/arcticdb/version/local_versioned_engine.cpp | 16 ++++++++++------ cpp/arcticdb/version/version_core.hpp | 7 ++++++- cpp/arcticdb/version/version_store_api.cpp | 6 +++++- python/arcticdb/version_store/_store.py | 6 ++---- .../version_store/test_symbol_concatenation.py | 5 +++-- 8 files changed, 50 insertions(+), 20 deletions(-) diff --git a/cpp/arcticdb/entity/read_result.hpp b/cpp/arcticdb/entity/read_result.hpp index e05f2f3e41..33dfb64af7 100644 --- a/cpp/arcticdb/entity/read_result.hpp +++ b/cpp/arcticdb/entity/read_result.hpp @@ -24,7 +24,7 @@ struct ARCTICDB_VISIBILITY_HIDDEN ReadResult { const std::variant>& versioned_item, pipelines::PythonOutputFrame&& frame_data, const arcticdb::proto::descriptors::NormalizationMetadata& norm_meta, - const arcticdb::proto::descriptors::UserDefinedMetadata& user_meta, + const std::variant>& user_meta, const arcticdb::proto::descriptors::UserDefinedMetadata& multi_key_meta, std::vector&& multi_keys) : item(versioned_item), @@ -38,7 +38,7 @@ struct ARCTICDB_VISIBILITY_HIDDEN ReadResult { std::variant> item; pipelines::PythonOutputFrame frame_data; arcticdb::proto::descriptors::NormalizationMetadata norm_meta; - arcticdb::proto::descriptors::UserDefinedMetadata user_meta; + std::variant> user_meta; arcticdb::proto::descriptors::UserDefinedMetadata multi_key_meta; std::vector multi_keys; @@ -47,7 +47,8 @@ struct ARCTICDB_VISIBILITY_HIDDEN ReadResult { inline ReadResult create_python_read_result( const std::variant>& version, - FrameAndDescriptor&& fd) { + FrameAndDescriptor&& fd, + std::optional>&& user_meta = std::nullopt) { auto result = std::move(fd); // Very old (pre Nov-2020) PandasIndex protobuf messages had no "start" or "step" fields. 
If is_physically_stored // (renamed from is_not_range_index) was false, the index was always RangeIndex(num_rows, 1) @@ -73,7 +74,13 @@ inline ReadResult create_python_read_result( util::print_total_mem_usage(__FILE__, __LINE__, __FUNCTION__); const auto& desc_proto = result.desc_.proto(); + std::variant> metadata; + if (user_meta.has_value()) { + metadata = *user_meta; + } else { + metadata = desc_proto.user_meta(); + } return {version, std::move(python_frame), desc_proto.normalization(), - desc_proto.user_meta(), desc_proto.multi_key_meta(), std::move(result.keys_)}; + metadata, desc_proto.multi_key_meta(), std::move(result.keys_)}; } } //namespace arcticdb \ No newline at end of file diff --git a/cpp/arcticdb/python/adapt_read_dataframe.hpp b/cpp/arcticdb/python/adapt_read_dataframe.hpp index 65f59b7658..b6594c3bfa 100644 --- a/cpp/arcticdb/python/adapt_read_dataframe.hpp +++ b/cpp/arcticdb/python/adapt_read_dataframe.hpp @@ -14,7 +14,18 @@ namespace arcticdb { inline auto adapt_read_df = [](ReadResult && ret) -> py::tuple{ auto pynorm = python_util::pb_to_python(ret.norm_meta); - auto pyuser_meta = python_util::pb_to_python(ret.user_meta); + auto pyuser_meta = util::variant_match( + ret.user_meta, + [](const arcticdb::proto::descriptors::UserDefinedMetadata& metadata) -> py::object { + return python_util::pb_to_python(metadata); + }, + [](const std::vector& metadatas) -> py::object { + py::list py_metadatas; + for (const auto& metadata: metadatas) { + py_metadatas.append(python_util::pb_to_python(metadata)); + } + return py_metadatas; + }); auto multi_key_meta = python_util::pb_to_python(ret.multi_key_meta); return py::make_tuple(ret.item, std::move(ret.frame_data), pynorm, pyuser_meta, multi_key_meta, ret.multi_keys); }; diff --git a/cpp/arcticdb/python/python_utils.hpp b/cpp/arcticdb/python/python_utils.hpp index f24228b1cb..f9e803944c 100644 --- a/cpp/arcticdb/python/python_utils.hpp +++ b/cpp/arcticdb/python/python_utils.hpp @@ -252,7 +252,7 @@ inline py::list adapt_read_dfs(std::vector>& res, [&lst] (ReadResult& read_result) { auto pynorm = python_util::pb_to_python(read_result.norm_meta); - auto pyuser_meta = python_util::pb_to_python(read_result.user_meta); + auto pyuser_meta = python_util::pb_to_python(std::get(read_result.user_meta)); auto multi_key_meta = python_util::pb_to_python(read_result.multi_key_meta); lst.append(py::make_tuple(read_result.item, std::move(read_result.frame_data), pynorm, pyuser_meta, multi_key_meta, read_result.multi_keys)); diff --git a/cpp/arcticdb/version/local_versioned_engine.cpp b/cpp/arcticdb/version/local_versioned_engine.cpp index 335c2adbe8..c109631f88 100644 --- a/cpp/arcticdb/version/local_versioned_engine.cpp +++ b/cpp/arcticdb/version/local_versioned_engine.cpp @@ -1209,8 +1209,9 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( auto component_manager = std::make_shared(); auto pipeline_context = std::make_shared(); auto norm_meta_mtx = std::make_shared(); - // TODO: Generate this in a less hacky way + // TODO: Generate these in a less hacky way auto res_versioned_items = std::make_shared>(stream_ids.size()); + auto res_metadatas = std::make_shared>(stream_ids.size()); DecodePathData shared_data; for (auto&& [idx, opt_index_key_fut]: folly::enumerate(opt_index_key_futs)) { symbol_entities_futs.emplace_back( @@ -1221,7 +1222,8 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( &component_manager, pipeline_context, norm_meta_mtx, - res_versioned_items](auto&& opt_index_key) mutable { + 
res_versioned_items, + res_metadatas](auto&& opt_index_key) mutable { std::variant version_info; // TODO: Add support for symbols that only have incomplete segments internal::check(opt_index_key.has_value(), "batch_read_with_join_internal not supported with non-indexed data"); @@ -1232,8 +1234,9 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( // TODO: Construct this at the end of the pipeline, instead of reusing input data pipeline_context->norm_meta_ = std::make_unique(std::move(*index_key_seg.mutable_index_descriptor().mutable_proto().mutable_normalization())); } - // Shouldn't need mutex protecting as vector is presized and each element is only accessed once - (*res_versioned_items)[idx] = VersionedItem(*opt_index_key); + // Shouldn't need mutex protecting as vectors are presized and each element is only accessed once + res_versioned_items->at(idx) = VersionedItem(*opt_index_key); + res_metadatas->at(idx) = index_key_seg.mutable_index_descriptor().user_metadata(); version_info = VersionedItem(std::move(*opt_index_key)); return read_entity_ids_for_version(store, version_info, read_query, read_options, component_manager); }) @@ -1253,10 +1256,11 @@ MultiSymbolReadOutput LocalVersionedEngine::batch_read_with_join_internal( // TODO: Make constructing the output descriptor a standard end of pipeline operation pipeline_context->set_descriptor(descriptor_from_segments(slice_and_keys)); return prepare_output_frame(std::move(slice_and_keys), pipeline_context, store, ReadOptions{}, handler_data); - }).thenValueInline([&handler_data, pipeline_context, shared_data, res_versioned_items](SegmentInMemory&& frame) mutable { + }).thenValueInline([&handler_data, pipeline_context, shared_data, res_versioned_items, res_metadatas](SegmentInMemory&& frame) mutable { return reduce_and_fix_columns(pipeline_context, frame, ReadOptions{}, handler_data) - .thenValue([pipeline_context, frame, shared_data, res_versioned_items](auto&&) mutable { + .thenValue([pipeline_context, frame, shared_data, res_versioned_items, res_metadatas](auto&&) mutable { return MultiSymbolReadOutput{std::move(*res_versioned_items), + std::move(*res_metadatas), {frame, timeseries_descriptor_from_pipeline_context(pipeline_context, {}, pipeline_context->bucketize_dynamic_), {}, diff --git a/cpp/arcticdb/version/version_core.hpp b/cpp/arcticdb/version/version_core.hpp index 408d41acb7..296e56491b 100644 --- a/cpp/arcticdb/version/version_core.hpp +++ b/cpp/arcticdb/version/version_core.hpp @@ -55,13 +55,18 @@ struct ReadVersionOutput { struct MultiSymbolReadOutput { MultiSymbolReadOutput() = delete; - MultiSymbolReadOutput(std::vector&& versioned_items, FrameAndDescriptor&& frame_and_descriptor): + MultiSymbolReadOutput( + std::vector&& versioned_items, + std::vector&& metadatas, + FrameAndDescriptor&& frame_and_descriptor): versioned_items_(std::move(versioned_items)), + metadatas_(std::move(metadatas)), frame_and_descriptor_(std::move(frame_and_descriptor)) {} ARCTICDB_MOVE_ONLY_DEFAULT(MultiSymbolReadOutput) std::vector versioned_items_; + std::vector metadatas_; FrameAndDescriptor frame_and_descriptor_; }; diff --git a/cpp/arcticdb/version/version_store_api.cpp b/cpp/arcticdb/version/version_store_api.cpp index 283cfe7cba..6ed2ad7b96 100644 --- a/cpp/arcticdb/version/version_store_api.cpp +++ b/cpp/arcticdb/version/version_store_api.cpp @@ -795,7 +795,11 @@ ReadResult PythonVersionStore::batch_read_with_join( std::vector>&& clauses, std::any& handler_data) { auto versions_and_frame = 
batch_read_with_join_internal(stream_ids, version_queries, read_queries, read_options, std::move(clauses), handler_data); - return create_python_read_result(versions_and_frame.versioned_items_, std::move(versions_and_frame.frame_and_descriptor_)); + return create_python_read_result( + versions_and_frame.versioned_items_, + std::move(versions_and_frame.frame_and_descriptor_), + std::move(versions_and_frame.metadatas_) + ); } void PythonVersionStore::delete_snapshot(const SnapshotId& snap_name) { diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py index 0f488a1343..d1ef308c22 100644 --- a/python/arcticdb/version_store/_store.py +++ b/python/arcticdb/version_store/_store.py @@ -2103,8 +2103,6 @@ def _get_index_columns_from_descriptor(descriptor): def _adapt_read_res(self, read_result: ReadResult) -> Union[VersionedItem, VersionedItemWithJoin]: frame_data = FrameData.from_cpp(read_result.frame_data) - # TODO: Return metadata for all symbol/version pairs as well - meta = denormalize_user_metadata(read_result.udm, self._normalizer) data = self._normalizer.denormalize(frame_data, read_result.norm) if read_result.norm.HasField("custom"): data = self._custom_normalizer.denormalize(data, read_result.norm.custom) @@ -2118,7 +2116,7 @@ def _adapt_read_res(self, read_result: ReadResult) -> Union[VersionedItem, Versi library=self._library.library_path, data=None, version=read_result.version[idx].version, - metadata=meta, + metadata=denormalize_user_metadata(read_result.udm[idx], self._normalizer), host=self.env, timestamp=read_result.version[idx].timestamp, ) @@ -2133,7 +2131,7 @@ def _adapt_read_res(self, read_result: ReadResult) -> Union[VersionedItem, Versi library=self._library.library_path, data=data, version=read_result.version.version, - metadata=meta, + metadata=denormalize_user_metadata(read_result.udm, self._normalizer), host=self.env, timestamp=read_result.version.timestamp, ) diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py index df77e1a8cf..6fed202941 100644 --- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py +++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py @@ -46,9 +46,9 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per }, index=index[7:] if index is not None else None, ) - lib.write("sym0", df_0) + lib.write("sym0", df_0, metadata=0) lib.write("sym1", df_1) - lib.write("sym2", df_2) + lib.write("sym2", df_2, metadata=2) received = concat(lib.read_batch(["sym0", "sym1", "sym2"], lazy=True)).collect() expected = pd.concat([df_0, df_1, df_2]) @@ -59,6 +59,7 @@ def test_symbol_concat_basic(lmdb_library_factory, rows_per_segment, columns_per assert version.symbol == f"sym{idx}" assert version.version == 0 assert version.data is None + assert version.metadata == (None if idx == 1 else idx)
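A short sketch of the per-symbol metadata round-trip this patch enables (again assuming a pre-populated library handle `lib` and placeholder DataFrames `df_0`/`df_1`; the metadata values are arbitrary examples):

lib.write("sym0", df_0, metadata={"source": "vendor_a"})
lib.write("sym1", df_1)  # written without metadata

joined = concat(lib.read_batch(["sym0", "sym1"], lazy=True)).collect()
# Each entry carries its own denormalized metadata rather than one shared value.
assert joined.versions[0].metadata == {"source": "vendor_a"}
assert joined.versions[1].metadata is None  # no metadata written for sym1

From 98b261f6fb396d54789430fb0b1a143082485667 Mon Sep 17 00:00:00 2001 From: Alex Owens Date: Thu, 30 Jan 2025 16:12:36 +0000 Subject: [PATCH 34/36] Refactor to not separate ConcatClause from other clauses --- cpp/arcticdb/version/python_bindings.cpp | 37 ++++++++++++--------- python/arcticdb/version_store/_store.py | 4 +-- python/arcticdb/version_store/library.py | 14 ++++---- python/arcticdb/version_store/processing.py | 14 ++++++++ 4 files changed, 44 insertions(+), 25 deletions(-) diff --git a/cpp/arcticdb/version/python_bindings.cpp 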
b/cpp/arcticdb/version/python_bindings.cpp index f097678504..b8f4dd2317 100644 --- a/cpp/arcticdb/version/python_bindings.cpp +++ b/cpp/arcticdb/version/python_bindings.cpp @@ -769,27 +769,34 @@ void register_bindings(py::module &version, py::exception& version_queries, std::vector>& read_queries, const ReadOptions& read_options, - ClauseVariant join, - std::vector post_join_clauses + std::vector clauses ){ - auto handler_data = TypeHandlerRegistry::instance()->get_handler_data(); - post_join_clauses = plan_query(std::move(post_join_clauses)); + user_input::check(!clauses.empty(), "batch_read_with_join called with no clauses"); + clauses = plan_query(std::move(clauses)); std::vector> _clauses; - util::variant_match( - join, - [&](auto&& clause) { - user_input::check( - clause->clause_info().multi_symbol_, - "Single-symbol clause cannot be used to join multiple symbols together"); - _clauses.emplace_back(std::make_shared(*clause)); - } - ); - for (auto&& clause: post_join_clauses) { + bool first_clause{true}; + for (auto&& clause: clauses) { util::variant_match( clause, - [&](auto&& clause) {_clauses.emplace_back(std::make_shared(*clause));} + [&](auto&& clause) { + if (first_clause) { + user_input::check( + clause->clause_info().multi_symbol_, + "Single-symbol clause cannot be used to join multiple symbols together"); + _clauses.emplace_back(std::make_shared(*clause)); + first_clause = false; + } else { + // TODO: Add this check to ReadQuery.add_clauses + user_input::check( + !clause->clause_info().multi_symbol_, + "Multi-symbol clause cannot be used on a single symbol"); + _clauses.emplace_back(std::make_shared(*clause)); + } + } ); } + auto handler_data = TypeHandlerRegistry::instance()->get_handler_data(); return adapt_read_df(v.batch_read_with_join(stream_ids, version_queries, read_queries, read_options, std::move(_clauses), handler_data)); }, py::call_guard(), "Read a dataframe from the store") diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py index d1ef308c22..5c5167b09d 100644 --- a/python/arcticdb/version_store/_store.py +++ b/python/arcticdb/version_store/_store.py @@ -1073,7 +1073,7 @@ def _batch_read_to_versioned_items( return versioned_items def _batch_read_with_join( - self, symbols, as_ofs, date_ranges, row_ranges, columns, per_symbol_query_builders, join, query_builder + self, symbols, as_ofs, date_ranges, row_ranges, columns, per_symbol_query_builders, query_builder ): implement_read_index = True if columns: @@ -1083,7 +1083,7 @@ per_symbol_query_builders = copy.deepcopy(per_symbol_query_builders) read_queries = self._get_read_queries(len(symbols), date_ranges, row_ranges, columns, per_symbol_query_builders) read_options = self._get_read_options(iterate_snapshots_if_tombstoned=False) - return self._adapt_read_res(ReadResult(*self.version_store.batch_read_with_join(symbols, version_queries, read_queries, read_options, join, query_builder.clauses))) + return self._adapt_read_res(ReadResult(*self.version_store.batch_read_with_join(symbols, version_queries, read_queries, read_options, query_builder.clauses))) diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py index 854d53358a..7b6e4753b6 100644 --- a/python/arcticdb/version_store/library.py +++ b/python/arcticdb/version_store/library.py @@ -22,7 +22,7 
diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py
index d1ef308c22..5c5167b09d 100644
--- a/python/arcticdb/version_store/_store.py
+++ b/python/arcticdb/version_store/_store.py
@@ -1073,7 +1073,7 @@ def _batch_read_to_versioned_items(
         return versioned_items
 
     def _batch_read_with_join(
-        self, symbols, as_ofs, date_ranges, row_ranges, columns, per_symbol_query_builders, join, query_builder
+        self, symbols, as_ofs, date_ranges, row_ranges, columns, per_symbol_query_builders, query_builder
     ):
         implement_read_index = True
         if columns:
@@ -1083,7 +1083,7 @@ def _batch_read_with_join(
         per_symbol_query_builders = copy.deepcopy(per_symbol_query_builders)
         read_queries = self._get_read_queries(len(symbols), date_ranges, row_ranges, columns, per_symbol_query_builders)
         read_options = self._get_read_options(iterate_snapshots_if_tombstoned=False)
-        return self._adapt_read_res(ReadResult(*self.version_store.batch_read_with_join(symbols, version_queries, read_queries, read_options, join, query_builder.clauses)))
+        return self._adapt_read_res(ReadResult(*self.version_store.batch_read_with_join(symbols, version_queries, read_queries, read_options, query_builder.clauses)))
diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py
index 854d53358a..7b6e4753b6 100644
--- a/python/arcticdb/version_store/library.py
+++ b/python/arcticdb/version_store/library.py
@@ -22,7 +22,7 @@
 from arcticdb.version_store.processing import ExpressionNode, QueryBuilder
 from arcticdb.version_store._store import NativeVersionStore, VersionedItem, VersionedItemWithJoin, VersionQueryInput
 from arcticdb_ext.exceptions import ArcticException
-from arcticdb_ext.version_store import DataError, ConcatClause as _ConcatClause
+from arcticdb_ext.version_store import DataError
 import pandas as pd
 import numpy as np
 import logging
@@ -482,22 +482,22 @@ class LazyDataFrameAfterJoin(QueryBuilder):
     def __init__(
         self,
         lazy_dataframes: LazyDataFrameCollection,
-        join: Any,
+        join: QueryBuilder,
     ):
         super().__init__()
         self._lazy_dataframes = lazy_dataframes
-        self._join = join
+        self.then(join)
 
     def collect(self) -> VersionedItemWithJoin:
         if not len(self._lazy_dataframes._lazy_dataframes):
             return []
         else:
             lib = self._lazy_dataframes._lib
-            return lib._read_batch_with_join(self._lazy_dataframes._read_requests(), self._join, self)
+            return lib._read_batch_with_join(self._lazy_dataframes._read_requests(), self)
 
     def __str__(self) -> str:
         query_builder_repr = super().__str__()
-        return f"LazyDataFrameAfterJoin({self._join}({self._lazy_dataframes._lazy_dataframes}){' | ' if len(query_builder_repr) else ''}{query_builder_repr})"
+        return f"LazyDataFrameAfterJoin({self._lazy_dataframes._lazy_dataframes} | {query_builder_repr})"
 
     def __repr__(self) -> str:
         return self.__str__()
@@ -506,7 +506,7 @@ def __repr__(self) -> str:
 def concat(lazy_dataframes: Union[List[LazyDataFrame], LazyDataFrameCollection]) -> LazyDataFrameAfterJoin:
     if not isinstance(lazy_dataframes, LazyDataFrameCollection):
         lazy_dataframes = LazyDataFrameCollection(lazy_dataframes)
-    return LazyDataFrameAfterJoin(lazy_dataframes, _ConcatClause())
+    return LazyDataFrameAfterJoin(lazy_dataframes, QueryBuilder().concat())
@@ -1677,7 +1677,6 @@ def handle_symbol(s_):
     def _read_batch_with_join(
         self,
         read_requests: List[ReadRequest],
-        join: Any,
         query_builder: Optional[QueryBuilder] = None,
     ) -> VersionedItemWithJoin:
         symbol_strings = []
@@ -1702,7 +1701,6 @@ def _read_batch_with_join(
             row_ranges,
             columns,
             per_symbol_query_builders,
-            join,
             query_builder,
         )
diff --git a/python/arcticdb/version_store/processing.py b/python/arcticdb/version_store/processing.py
index f58944376f..993e3d9929 100644
--- a/python/arcticdb/version_store/processing.py
+++ b/python/arcticdb/version_store/processing.py
@@ -33,6 +33,7 @@
 from arcticdb_ext.version_store import ResampleBoundary as _ResampleBoundary
 from arcticdb_ext.version_store import RowRangeClause as _RowRangeClause
 from arcticdb_ext.version_store import DateRangeClause as _DateRangeClause
+from arcticdb_ext.version_store import ConcatClause as _ConcatClause
 from arcticdb_ext.version_store import RowRangeType as _RowRangeType
 from arcticdb_ext.version_store import ExpressionName as _ExpressionName
 from arcticdb_ext.version_store import ColumnName as _ColumnName
@@ -323,6 +324,12 @@ class PythonResampleClause:
     origin: Union[str, pd.Timestamp] = "epoch"
 
 
+# TODO: Test pickling of this
+@dataclass
+class PythonConcatClause:
+    pass
+
+
 class QueryBuilder:
     """
     Build a query to process read results with. Syntax is designed to be similar to Pandas:
@@ -904,6 +911,11 @@ def date_range(self, date_range: DateRangeInput):
         self._python_clauses = self._python_clauses + [PythonDateRangeClause(start.value, end.value)]
         return self
 
+    def concat(self):
+        self.clauses = self.clauses + [_ConcatClause()]
+        self._python_clauses = self._python_clauses + [PythonConcatClause()]
+        return self
+
     def __eq__(self, right):
         if not isinstance(right, QueryBuilder):
             return False
@@ -965,6 +977,8 @@ def __setstate__(self, state):
                 self.clauses = self.clauses + [_RowRangeClause(python_clause.row_range_type, python_clause.n)]
             elif isinstance(python_clause, PythonDateRangeClause):
                 self.clauses = self.clauses + [_DateRangeClause(python_clause.start, python_clause.end)]
+            elif isinstance(python_clause, PythonConcatClause):
+                self.clauses = self.clauses + [_ConcatClause()]
             else:
                 raise ArcticNativeException(
                     f"Unrecognised clause type {type(python_clause)} when unpickling QueryBuilder"
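PythonConcatClause exists purely so that a QueryBuilder containing a concat step survives pickling: __getstate__ keeps the picklable Python clause records, and the __setstate__ branch above rebuilds the C++ _ConcatClause from them. The round trip the TODO asks to be tested would look roughly like this (a sketch, assuming QueryBuilder equality is structural):

    import pickle
    from arcticdb.version_store.processing import QueryBuilder

    q = QueryBuilder().concat()
    q2 = pickle.loads(pickle.dumps(q))  # __setstate__ re-creates _ConcatClause()
    assert q == q2
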
From 7722ebbf246d128d708c3fecb0d2d5612fd8412b Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Thu, 30 Jan 2025 16:33:50 +0000
Subject: [PATCH 35/36] Add QueryBuilder syntax

---
 python/arcticdb/version_store/_store.py             |  8 +++-
 python/arcticdb/version_store/library.py            |  4 +-
 .../version_store/test_symbol_concatenation.py      | 43 +++++++++++++++++++
 3 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py
index 5c5167b09d..777a610b1c 100644
--- a/python/arcticdb/version_store/_store.py
+++ b/python/arcticdb/version_store/_store.py
@@ -1081,7 +1081,9 @@ def _batch_read_with_join(
         version_queries = self._get_version_queries(len(symbols), as_ofs, iterate_snapshots_if_tombstoned=False)
         # Take a copy as _get_read_queries can modify the input argument, which makes reusing the input counter-intuitive
         per_symbol_query_builders = copy.deepcopy(per_symbol_query_builders)
-        read_queries = self._get_read_queries(len(symbols), date_ranges, row_ranges, columns, per_symbol_query_builders)
+        # TODO: Less hacky way for date ranges (and most likely row ranges) as part of ReadRequests to work
+        force_ranges_to_queries = True
+        read_queries = self._get_read_queries(len(symbols), date_ranges, row_ranges, columns, per_symbol_query_builders, force_ranges_to_queries)
         read_options = self._get_read_options(iterate_snapshots_if_tombstoned=False)
         return self._adapt_read_res(ReadResult(*self.version_store.batch_read_with_join(symbols, version_queries, read_queries, read_options, query_builder.clauses)))
 
@@ -1621,6 +1623,7 @@ def _get_read_queries(
         row_ranges: Optional[List[Optional[Tuple[int, int]]]],
         columns: Optional[List[List[str]]],
         query_builder: Optional[Union[QueryBuilder, List[QueryBuilder]]],
+        force_ranges_to_queries: bool = False,
     ):
         read_queries = []
 
@@ -1667,6 +1670,9 @@ def _get_read_queries(
             if query_builder is not None:
                 query = copy.deepcopy(query_builder) if isinstance(query_builder, QueryBuilder) else query_builder[idx]
 
+            if query is None and force_ranges_to_queries:
+                query = QueryBuilder()
+
             read_query = self._get_read_query(
                 date_range=date_range,
                 row_range=row_range,
diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py
index 7b6e4753b6..c3378ea702 100644
--- a/python/arcticdb/version_store/library.py
+++ b/python/arcticdb/version_store/library.py
@@ -493,7 +493,7 @@ def collect(self) -> VersionedItemWithJoin:
             return []
         else:
             lib = self._lazy_dataframes._lib
-            return lib._read_batch_with_join(self._lazy_dataframes._read_requests(), self)
+            return lib.read_batch_with_join(self._lazy_dataframes._read_requests(), self)
 
     def __str__(self) -> str:
         query_builder_repr = super().__str__()
@@ -1674,7 +1674,7 @@ def handle_symbol(s_):
             iterate_snapshots_if_tombstoned=False,
         )
 
-    def _read_batch_with_join(
+    def read_batch_with_join(
         self,
         read_requests: List[ReadRequest],
         query_builder: Optional[QueryBuilder] = None,
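force_ranges_to_queries works around the joined read path only consuming clauses: without it, a date_range (or row_range) passed directly on a ReadRequest has no QueryBuilder to attach to and would be dropped, so an empty one is created for it. In effect, the two requests below should be handled identically by read_batch_with_join (a sketch; the symbol name is illustrative):

    import pandas as pd
    from arcticdb.version_store.library import ReadRequest
    from arcticdb.version_store.processing import QueryBuilder

    r1 = ReadRequest("sym", date_range=(pd.Timestamp(0), pd.Timestamp(9000)))
    r2 = ReadRequest(
        "sym",
        query_builder=QueryBuilder().date_range((pd.Timestamp(0), pd.Timestamp(9000))),
    )
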
diff --git a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
index 6fed202941..7ec3954f00 100644
--- a/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
+++ b/python/tests/unit/arcticdb/version_store/test_symbol_concatenation.py
@@ -191,6 +191,49 @@ def test_symbol_concat_complex(lmdb_library_factory, rows_per_segment, columns_p
     assert_frame_equal(expected, received)
 
 
+def test_symbol_concat_querybuilder_syntax(lmdb_library):
+    lib = lmdb_library
+    df_0 = pd.DataFrame(
+        {
+            "col1": np.arange(3, dtype=np.int64),
+            "col2": np.arange(100, 103, dtype=np.int64),
+            "col3": np.arange(1000, 1003, dtype=np.int64),
+        },
+        index=pd.date_range(pd.Timestamp(0), freq="1000ns", periods=3),
+    )
+    df_1 = pd.DataFrame(
+        {
+            "col1": np.arange(4, dtype=np.int64),
+            "col2": np.arange(200, 204, dtype=np.int64),
+            "col3": np.arange(2000, 2004, dtype=np.int64),
+        },
+        index=pd.date_range(pd.Timestamp(2000), freq="1000ns", periods=4),
+    )
+    df_2 = pd.DataFrame(
+        {
+            "col1": np.arange(5, dtype=np.int64),
+            "col2": np.arange(300, 305, dtype=np.int64),
+            "col3": np.arange(3000, 3005, dtype=np.int64),
+        },
+        index=pd.date_range(pd.Timestamp(6000), freq="1000ns", periods=5),
+    )
+    lib.write("sym0", df_0)
+    lib.write("sym1", df_1)
+    lib.write("sym2", df_2)
+
+    read_request_0 = ReadRequest("sym0")
+    qb1 = QueryBuilder().date_range((pd.Timestamp(3000), None))
+    read_request_1 = ReadRequest("sym1", query_builder=qb1)
+    read_request_2 = ReadRequest("sym2", date_range=(None, pd.Timestamp(9000)))
+
+    q = QueryBuilder().concat().resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"})
+    received = lib.read_batch_with_join([read_request_0, read_request_1, read_request_2], query_builder=q).data
+
+    received = received.reindex(columns=sorted(received.columns))
+    expected = pd.concat([df_0, df_1[1:], df_2[:4]]).resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"})
+    assert_frame_equal(expected, received)
+
+
 @pytest.mark.parametrize("index", [None, [pd.Timestamp(0)]])
 def test_symbol_concat_symbols_with_different_columns(lmdb_library_factory, index):
     lib = lmdb_library_factory(LibraryOptions(columns_per_segment=2))
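This test spells out explicitly what the lazy API builds internally: since collect() now routes through read_batch_with_join with the LazyDataFrameAfterJoin itself as the post-join QueryBuilder, the same pipeline is reachable via concat (a sketch; it ignores the per-symbol date filters used above, so the results would cover the full symbols):

    from arcticdb.version_store.library import concat

    joined = concat(lib.read_batch(["sym0", "sym1", "sym2"], lazy=True))
    joined = joined.resample("2000ns").agg({"col1": "sum", "col2": "mean", "col3": "min"})
    received = joined.collect().data
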
From 831b36443bf468ffa069911509dd9121a7b0b4ab Mon Sep 17 00:00:00 2001
From: Alex Owens
Date: Thu, 30 Jan 2025 17:03:43 +0000
Subject: [PATCH 36/36] Unpin threadpool core counts

---
 cpp/arcticdb/async/task_scheduler.hpp | 6 +++---
 cpp/arcticdb/processing/clause.hpp    | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/cpp/arcticdb/async/task_scheduler.hpp b/cpp/arcticdb/async/task_scheduler.hpp
index 38c86d6a66..087bd9975e 100644
--- a/cpp/arcticdb/async/task_scheduler.hpp
+++ b/cpp/arcticdb/async/task_scheduler.hpp
@@ -179,10 +179,10 @@ class TaskScheduler {
     using CPUSchedulerType = folly::FutureExecutor<folly::CPUThreadPoolExecutor>;
     using IOSchedulerType = folly::FutureExecutor<folly::IOThreadPoolExecutor>;
 
-    explicit TaskScheduler(ARCTICDB_UNUSED const std::optional<size_t>& cpu_thread_count = std::nullopt, ARCTICDB_UNUSED const std::optional<size_t>& io_thread_count = std::nullopt) :
+    explicit TaskScheduler(const std::optional<size_t>& cpu_thread_count = std::nullopt, const std::optional<size_t>& io_thread_count = std::nullopt) :
         cgroup_folder_("/sys/fs/cgroup"),
-        cpu_thread_count_(1),
-        io_thread_count_(1),
+        cpu_thread_count_(cpu_thread_count ? *cpu_thread_count : ConfigsMap::instance()->get_int("VersionStore.NumCPUThreads", get_default_num_cpus(cgroup_folder_))),
+        io_thread_count_(io_thread_count ? *io_thread_count : ConfigsMap::instance()->get_int("VersionStore.NumIOThreads", (int) (cpu_thread_count_ * 1.5))),
         cpu_exec_(cpu_thread_count_, std::make_shared<InstrumentedNamedFactory>("CPUPool")) ,
         io_exec_(io_thread_count_, std::make_shared<InstrumentedNamedFactory>("IOPool")){
         util::check(cpu_thread_count_ > 0 && io_thread_count_ > 0, "Zero IO or CPU threads: {} {}", io_thread_count_, cpu_thread_count_);
diff --git a/cpp/arcticdb/processing/clause.hpp b/cpp/arcticdb/processing/clause.hpp
index 004cbedc0f..51f69b2640 100644
--- a/cpp/arcticdb/processing/clause.hpp
+++ b/cpp/arcticdb/processing/clause.hpp
@@ -679,6 +679,8 @@ struct ConcatClause {
     ARCTICDB_MOVE_COPY_DEFAULT(ConcatClause)
 
     [[nodiscard]] std::vector<std::vector<size_t>> structure_for_processing(ARCTICDB_UNUSED std::vector<RangesAndKey>& ranges_and_keys) {
+        // TODO: This isn't a long-term requirement; for simplicity, we currently force every input symbol through its
+        // own processing pipeline, even if it has no clauses
         internal::raise<ErrorCode::E_ASSERTION_FAILURE>("ConcatClause should never be first in the pipeline");
     }
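With the hard-coded single-threaded pools removed, pool sizes come from the constructor arguments when given, otherwise from the VersionStore.NumCPUThreads and VersionStore.NumIOThreads config values, with the IO pool defaulting to 1.5x the CPU pool. From Python, the knobs can be set before the scheduler is first constructed; a sketch, assuming the set_config_int helper exported by arcticdb_ext:

    from arcticdb_ext import set_config_int

    set_config_int("VersionStore.NumCPUThreads", 4)
    set_config_int("VersionStore.NumIOThreads", 6)  # otherwise roughly 1.5x the CPU count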