From b869b2f579a21be7e87bf8b5112bbf83be1d8938 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Mon, 25 Nov 2024 21:16:01 +0800 Subject: [PATCH 01/24] new hash Signed-off-by: guo-shaoge --- dbms/src/Common/HashTable/Hash.h | 125 +++++++++++++++++++++++++++++ dbms/src/Interpreters/Aggregator.h | 20 ++--- 2 files changed, 135 insertions(+), 10 deletions(-) diff --git a/dbms/src/Common/HashTable/Hash.h b/dbms/src/Common/HashTable/Hash.h index b4f5d2c0a04..3f25f64bc74 100644 --- a/dbms/src/Common/HashTable/Hash.h +++ b/dbms/src/Common/HashTable/Hash.h @@ -416,3 +416,128 @@ struct IntHash32, void>> } } }; + +inline uint64_t umul128(uint64_t v, uint64_t kmul, uint64_t * high) +{ + DB::Int128 res = static_cast(v) * static_cast(kmul); + *high = static_cast(res >> 64); + return static_cast(res); +} + +template +inline void hash_combine(uint64_t & seed, const T & val) +{ + // from: https://github.com/HowardHinnant/hash_append/issues/7#issuecomment-629414712 + seed ^= std::hash{}(val) + 0x9e3779b97f4a7c15LLU + (seed << 12) + (seed >> 4); +} + +inline uint64_t hash_int128(uint64_t seed, const DB::Int128 & v) +{ + auto low = static_cast(v); + auto high = static_cast(v >> 64); + hash_combine(seed, low); + hash_combine(seed, high); + return seed; +} + +inline uint64_t hash_uint128(uint64_t seed, const DB::UInt128 & v) +{ + hash_combine(seed, v.low); + hash_combine(seed, v.high); + return seed; +} + +inline uint64_t hash_int256(uint64_t seed, const DB::Int256 & v) +{ + const auto & backend_value = v.backend(); + for (size_t i = 0; i < backend_value.size(); ++i) + { + hash_combine(seed, backend_value.limbs()[i]); + } + return seed; +} + +inline uint64_t hash_uint256(uint64_t seed, const DB::UInt256 & v) +{ + hash_combine(seed, v.a); + hash_combine(seed, v.b); + hash_combine(seed, v.c); + hash_combine(seed, v.d); + return seed; +} + +template +struct HashWithMixSeedHelper +{ + inline size_t operator()(size_t) const; +}; + +template <> +struct HashWithMixSeedHelper<4> +{ + inline 
size_t operator()(size_t v) const + { + // from: https://github.com/aappleby/smhasher/blob/0ff96f7835817a27d0487325b6c16033e2992eb5/src/MurmurHash3.cpp#L102 + static constexpr uint64_t kmul = 0xcc9e2d51UL; + uint64_t mul = v * kmul; + return static_cast(mul ^ (mul >> 32u)); + } +}; + +template <> +struct HashWithMixSeedHelper<8> +{ + inline size_t operator()(size_t v) const + { + // from: https://github.com/martinus/robin-hood-hashing/blob/b21730713f4b5296bec411917c46919f7b38b178/src/include/robin_hood.h#L735 + static constexpr uint64_t kmul = 0xde5fb9d2630458e9ULL; + uint64_t high = 0; + uint64_t low = umul128(v, kmul, &high); + return static_cast(high + low); + } +}; + +template +struct HashWithMixSeed +{ + inline size_t operator()(const T & v) const + { + return HashWithMixSeedHelper()(std::hash()(v)); + } +}; + +template <> +struct HashWithMixSeed +{ + inline size_t operator()(const DB::Int128 & v) const + { + return HashWithMixSeedHelper()(hash_int128(0, v)); + } +}; + +template <> +struct HashWithMixSeed +{ + inline size_t operator()(const DB::UInt128 & v) const + { + return HashWithMixSeedHelper()(hash_uint128(0, v)); + } +}; + +template <> +struct HashWithMixSeed +{ + inline size_t operator()(const DB::Int256 & v) const + { + return HashWithMixSeedHelper()(hash_int256(0, v)); + } +}; + +template <> +struct HashWithMixSeed +{ + inline size_t operator()(const DB::UInt256 & v) const + { + return HashWithMixSeedHelper()(hash_uint256(0, v)); + } +}; diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index 381bfba8462..9515782793a 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -77,27 +77,27 @@ using AggregatedDataWithoutKey = AggregateDataPtr; using AggregatedDataWithUInt8Key = FixedImplicitZeroHashMapWithCalculatedSize; using AggregatedDataWithUInt16Key = FixedImplicitZeroHashMap; -using AggregatedDataWithUInt32Key = HashMap>; -using AggregatedDataWithUInt64Key = HashMap>; +using 
AggregatedDataWithUInt32Key = HashMap>; +using AggregatedDataWithUInt64Key = HashMap>; using AggregatedDataWithShortStringKey = StringHashMap; using AggregatedDataWithStringKey = HashMapWithSavedHash; -using AggregatedDataWithInt256Key = HashMap>; +using AggregatedDataWithInt256Key = HashMap>; -using AggregatedDataWithKeys128 = HashMap>; -using AggregatedDataWithKeys256 = HashMap>; +using AggregatedDataWithKeys128 = HashMap>; +using AggregatedDataWithKeys256 = HashMap>; -using AggregatedDataWithUInt32KeyTwoLevel = TwoLevelHashMap>; -using AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithUInt32KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap>; -using AggregatedDataWithInt256KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithInt256KeyTwoLevel = TwoLevelHashMap>; using AggregatedDataWithShortStringKeyTwoLevel = TwoLevelStringHashMap; using AggregatedDataWithStringKeyTwoLevel = TwoLevelHashMapWithSavedHash; -using AggregatedDataWithKeys128TwoLevel = TwoLevelHashMap>; -using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap>; +using AggregatedDataWithKeys128TwoLevel = TwoLevelHashMap>; +using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap>; /** Variants with better hash function, using more than 32 bits for hash. 
* Using for merging phase of external aggregation, where number of keys may be far greater than 4 billion, From e8a2df81cb2bfc4a5eb3b2660f5b2aa4c5de4d97 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Tue, 26 Nov 2024 17:31:24 +0800 Subject: [PATCH 02/24] prefetch done Signed-off-by: guo-shaoge --- dbms/src/Common/ColumnsHashing.h | 19 ++++ dbms/src/Common/ColumnsHashingImpl.h | 62 ++++++++++--- dbms/src/Common/HashTable/FixedHashTable.h | 3 +- dbms/src/Common/HashTable/Hash.h | 28 +++--- dbms/src/Common/HashTable/HashTable.h | 11 +++ dbms/src/Common/HashTable/SmallTable.h | 1 + dbms/src/Common/HashTable/StringHashMap.h | 9 +- dbms/src/Common/HashTable/StringHashTable.h | 92 ++++++++++++------- dbms/src/Common/HashTable/TwoLevelHashTable.h | 13 +++ .../HashTable/TwoLevelStringHashTable.h | 53 +++++++++-- dbms/src/Interpreters/Aggregator.cpp | 63 +++++++++++-- dbms/src/Interpreters/Aggregator.h | 7 +- 12 files changed, 275 insertions(+), 86 deletions(-) diff --git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index 398d6605e60..e14a793567c 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -49,14 +49,17 @@ struct HashMethodOneNumber using Base = columns_hashing_impl::HashMethodBase; const FieldType * vec; + const size_t total_rows; /// If the keys of a fixed length then key_sizes contains their lengths, empty otherwise. 
HashMethodOneNumber(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators &) + : total_rows(key_columns[0]->size()) { vec = &static_cast *>(key_columns[0])->getData()[0]; } explicit HashMethodOneNumber(const IColumn * column) + : total_rows(column->size()) { vec = &static_cast *>(column)->getData()[0]; } @@ -82,6 +85,8 @@ struct HashMethodOneNumber } const FieldType * getKeyData() const { return vec; } + + size_t getTotalRows() const { return total_rows; } }; @@ -97,11 +102,13 @@ struct HashMethodString const IColumn::Offset * offsets; const UInt8 * chars; TiDB::TiDBCollatorPtr collator = nullptr; + const size_t total_rows; HashMethodString( const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators & collators) + : total_rows(key_columns[0]->size()) { const IColumn & column = *key_columns[0]; const auto & column_string = assert_cast(column); @@ -149,8 +156,10 @@ struct HashMethodStringBin const IColumn::Offset * offsets; const UInt8 * chars; + const size_t total_rows; HashMethodStringBin(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators &) + : total_rows(key_columns[0]->size()) { const IColumn & column = *key_columns[0]; const auto & column_string = assert_cast(column); @@ -346,10 +355,12 @@ struct HashMethodFastPathTwoKeysSerialized Key1Desc key_1_desc; Key2Desc key_2_desc; + const size_t total_rows; HashMethodFastPathTwoKeysSerialized(const ColumnRawPtrs & key_columns, const Sizes &, const TiDB::TiDBCollators &) : key_1_desc(key_columns[0]) , key_2_desc(key_columns[1]) + , total_rows(key_columns[0]->size()) {} ALWAYS_INLINE inline auto getKeyHolder(ssize_t row, Arena * pool, std::vector &) const @@ -384,11 +395,13 @@ struct HashMethodFixedString size_t n; const ColumnFixedString::Chars_t * chars; TiDB::TiDBCollatorPtr collator = nullptr; + const size_t total_rows; HashMethodFixedString( const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const 
TiDB::TiDBCollators & collators) + : total_rows(key_columns[0]->size()) { const IColumn & column = *key_columns[0]; const auto & column_string = assert_cast(column); @@ -442,6 +455,7 @@ struct HashMethodKeysFixed Sizes key_sizes; size_t keys_size; + const size_t total_rows; /// SSSE3 shuffle method can be used. Shuffle masks will be calculated and stored here. #if defined(__SSSE3__) && !defined(MEMORY_SANITIZER) @@ -467,6 +481,7 @@ struct HashMethodKeysFixed : Base(key_columns) , key_sizes(std::move(key_sizes_)) , keys_size(key_columns.size()) + , total_rows(key_columns[0]->size()) { if (usePreparedKeys(key_sizes)) { @@ -596,6 +611,7 @@ struct HashMethodSerialized ColumnRawPtrs key_columns; size_t keys_size; TiDB::TiDBCollators collators; + const size_t total_rows; HashMethodSerialized( const ColumnRawPtrs & key_columns_, @@ -604,6 +620,7 @@ struct HashMethodSerialized : key_columns(key_columns_) , keys_size(key_columns_.size()) , collators(collators_) + , total_rows(key_columns_[0]->size()) {} ALWAYS_INLINE inline SerializedKeyHolder getKeyHolder( @@ -631,10 +648,12 @@ struct HashMethodHashed ColumnRawPtrs key_columns; TiDB::TiDBCollators collators; + const size_t total_rows; HashMethodHashed(ColumnRawPtrs key_columns_, const Sizes &, const TiDB::TiDBCollators & collators_) : key_columns(std::move(key_columns_)) , collators(collators_) + , total_rows(key_columns[0]->size()) {} ALWAYS_INLINE inline Key getKeyHolder(size_t row, Arena *, std::vector & sort_key_containers) const diff --git a/dbms/src/Common/ColumnsHashingImpl.h b/dbms/src/Common/ColumnsHashingImpl.h index d4f4143015d..24574ed40a4 100644 --- a/dbms/src/Common/ColumnsHashingImpl.h +++ b/dbms/src/Common/ColumnsHashingImpl.h @@ -127,27 +127,53 @@ class HashMethodBase using FindResult = FindResultImpl; static constexpr bool has_mapped = !std::is_same::value; using Cache = LastElementCache; + static constexpr size_t prefetch_step = 16; - template + template ALWAYS_INLINE inline EmplaceResult emplaceKey( 
Data & data, size_t row, Arena & pool, - std::vector & sort_key_containers) + std::vector & sort_key_containers, + const std::vector & hashvals = {}) { auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); - return emplaceImpl(key_holder, data); + if constexpr (enable_prefetch) + { + const auto idx = row + prefetch_step; + if (idx < hashvals.size()) + data.prefetch(hashvals[idx]); + + return emplaceImpl(key_holder, data, hashvals[row]); + } + else + { + return emplaceImpl(key_holder, data, 0); + } } - template + template ALWAYS_INLINE inline FindResult findKey( Data & data, size_t row, Arena & pool, - std::vector & sort_key_containers) + std::vector & sort_key_containers, + const std::vector & hashvals = {}) { auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); - return findKeyImpl(keyHolderGetKey(key_holder), data); + if constexpr (enable_prefetch) + { + const auto idx = row + prefetch_step; + if (idx < hashvals.size()) + data.prefetch(hashvals[idx]); + + return findKeyImpl(keyHolderGetKey(key_holder), data, hashvals[row]); + } + else + { + return findKeyImpl(keyHolderGetKey(key_holder), data, 0); + } + } template @@ -155,9 +181,9 @@ class HashMethodBase const Data & data, size_t row, Arena & pool, - std::vector & sort_key_containers) + std::vector & sort_key_containers) const { - auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); + auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); return data.hash(keyHolderGetKey(key_holder)); } @@ -179,8 +205,8 @@ class HashMethodBase } } - template - ALWAYS_INLINE inline EmplaceResult emplaceImpl(KeyHolder & key_holder, Data & data) + template + ALWAYS_INLINE inline EmplaceResult emplaceImpl(KeyHolder & key_holder, Data & data, size_t hashval) { if constexpr (Cache::consecutive_keys_optimization) { @@ -195,7 +221,11 @@ class HashMethodBase typename Data::LookupResult it; bool inserted = false; - 
data.emplace(key_holder, it, inserted); + + if constexpr (enable_prefetch) + data.emplace(key_holder, it, inserted, hashval); + else + data.emplace(key_holder, it, inserted); [[maybe_unused]] Mapped * cached = nullptr; if constexpr (has_mapped) @@ -232,8 +262,8 @@ class HashMethodBase return EmplaceResult(inserted); } - template - ALWAYS_INLINE inline FindResult findKeyImpl(Key key, Data & data) + template + ALWAYS_INLINE inline FindResult findKeyImpl(Key key, Data & data, size_t hashval) { if constexpr (Cache::consecutive_keys_optimization) { @@ -246,7 +276,11 @@ class HashMethodBase } } - auto it = data.find(key); + typename Data::LookupResult it; + if constexpr (enable_prefetch) + it = data.find(key, hashval); + else + it = data.find(key); if constexpr (consecutive_keys_optimization) { diff --git a/dbms/src/Common/HashTable/FixedHashTable.h b/dbms/src/Common/HashTable/FixedHashTable.h index 259e90684fc..cfa562667dc 100644 --- a/dbms/src/Common/HashTable/FixedHashTable.h +++ b/dbms/src/Common/HashTable/FixedHashTable.h @@ -212,7 +212,6 @@ class FixedHashTable typename cell_type::CellExt cell; }; - public: using key_type = Key; using mapped_type = typename Cell::mapped_type; @@ -352,6 +351,8 @@ class FixedHashTable iterator end() { return iterator(this, buf ? buf + NUM_CELLS : buf); } + inline void prefetch(size_t) {} + /// The last parameter is unused but exists for compatibility with HashTable interface. 
void ALWAYS_INLINE emplace(const Key & x, LookupResult & it, bool & inserted, size_t /* hash */ = 0) { diff --git a/dbms/src/Common/HashTable/Hash.h b/dbms/src/Common/HashTable/Hash.h index 3f25f64bc74..883ec8ab6ff 100644 --- a/dbms/src/Common/HashTable/Hash.h +++ b/dbms/src/Common/HashTable/Hash.h @@ -469,13 +469,13 @@ inline uint64_t hash_uint256(uint64_t seed, const DB::UInt256 & v) template struct HashWithMixSeedHelper { - inline size_t operator()(size_t) const; + static inline size_t operator()(size_t); }; template <> struct HashWithMixSeedHelper<4> { - inline size_t operator()(size_t v) const + static inline size_t operator()(size_t v) { // from: https://github.com/aappleby/smhasher/blob/0ff96f7835817a27d0487325b6c16033e2992eb5/src/MurmurHash3.cpp#L102 static constexpr uint64_t kmul = 0xcc9e2d51UL; @@ -487,7 +487,7 @@ struct HashWithMixSeedHelper<4> template <> struct HashWithMixSeedHelper<8> { - inline size_t operator()(size_t v) const + static inline size_t operator()(size_t v) { // from: https://github.com/martinus/robin-hood-hashing/blob/b21730713f4b5296bec411917c46919f7b38b178/src/include/robin_hood.h#L735 static constexpr uint64_t kmul = 0xde5fb9d2630458e9ULL; @@ -500,44 +500,44 @@ struct HashWithMixSeedHelper<8> template struct HashWithMixSeed { - inline size_t operator()(const T & v) const + static size_t operator()(const T & v) { - return HashWithMixSeedHelper()(std::hash()(v)); + return HashWithMixSeedHelper::operator()(std::hash()(v)); } }; template <> struct HashWithMixSeed { - inline size_t operator()(const DB::Int128 & v) const + static size_t operator()(const DB::Int128 & v) { - return HashWithMixSeedHelper()(hash_int128(0, v)); + return HashWithMixSeedHelper::operator()(hash_int128(0, v)); } }; template <> struct HashWithMixSeed { - inline size_t operator()(const DB::UInt128 & v) const + static inline size_t operator()(const DB::UInt128 & v) { - return HashWithMixSeedHelper()(hash_uint128(0, v)); + return 
HashWithMixSeedHelper::operator()(hash_uint128(0, v)); } }; template <> struct HashWithMixSeed { - inline size_t operator()(const DB::Int256 & v) const + static inline size_t operator()(const DB::Int256 & v) { - return HashWithMixSeedHelper()(hash_int256(0, v)); + return HashWithMixSeedHelper::operator()(hash_int256(0, v)); } }; template <> struct HashWithMixSeed -{ - inline size_t operator()(const DB::UInt256 & v) const +{ + static inline size_t operator()(const DB::UInt256 & v) { - return HashWithMixSeedHelper()(hash_uint256(0, v)); + return HashWithMixSeedHelper::operator()(hash_uint256(0, v)); } }; diff --git a/dbms/src/Common/HashTable/HashTable.h b/dbms/src/Common/HashTable/HashTable.h index a4f0fe3be03..4f037f60019 100644 --- a/dbms/src/Common/HashTable/HashTable.h +++ b/dbms/src/Common/HashTable/HashTable.h @@ -851,6 +851,17 @@ class HashTable iterator end() { return iterator(this, buf ? buf + grower.bufSize() : buf); } + void ALWAYS_INLINE prefetch(size_t hashval) const + { + (void)hashval; +#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) + size_t place_value = grower.place(hashval); + __mm_prefetch((const char*)(&buf[place_value]), _MM_HINT_NTA); +#elif defined(__GNUC__) + size_t place_value = grower.place(hashval); + __builtin_prefetch(static_cast(&buf[place_value])); +#endif + } protected: const_iterator iteratorTo(const Cell * ptr) const { return const_iterator(this, ptr); } diff --git a/dbms/src/Common/HashTable/SmallTable.h b/dbms/src/Common/HashTable/SmallTable.h index fa40b479430..a032ae76cff 100644 --- a/dbms/src/Common/HashTable/SmallTable.h +++ b/dbms/src/Common/HashTable/SmallTable.h @@ -296,6 +296,7 @@ class SmallTable iterator ALWAYS_INLINE find(Key x) { return iteratorTo(findCell(x)); } const_iterator ALWAYS_INLINE find(Key x) const { return iteratorTo(findCell(x)); } + void ALWAYS_INLINE prefetch(size_t) {} void write(DB::WriteBuffer & wb) const { diff --git a/dbms/src/Common/HashTable/StringHashMap.h 
b/dbms/src/Common/HashTable/StringHashMap.h index 6f7e668e1d9..cad653907fa 100644 --- a/dbms/src/Common/HashTable/StringHashMap.h +++ b/dbms/src/Common/HashTable/StringHashMap.h @@ -90,29 +90,30 @@ struct StringHashMapCell template struct StringHashMapSubMaps { + using Hash = StringHashTableHash; using T0 = StringHashTableEmpty>; using T1 = HashMapTable< StringKey8, StringHashMapCell, - StringHashTableHash, + Hash, StringHashTableGrower<>, Allocator>; using T2 = HashMapTable< StringKey16, StringHashMapCell, - StringHashTableHash, + Hash, StringHashTableGrower<>, Allocator>; using T3 = HashMapTable< StringKey24, StringHashMapCell, - StringHashTableHash, + Hash, StringHashTableGrower<>, Allocator>; using Ts = HashMapTable< StringRef, StringHashMapCell, - StringHashTableHash, + Hash, StringHashTableGrower<>, Allocator>; }; diff --git a/dbms/src/Common/HashTable/StringHashTable.h b/dbms/src/Common/HashTable/StringHashTable.h index aa4825f171a..e11972d0795 100644 --- a/dbms/src/Common/HashTable/StringHashTable.h +++ b/dbms/src/Common/HashTable/StringHashTable.h @@ -20,7 +20,6 @@ #include #include - using StringKey8 = UInt64; using StringKey16 = DB::UInt128; struct StringKey24 @@ -48,45 +47,38 @@ inline StringRef ALWAYS_INLINE toStringRef(const StringKey24 & n) return {reinterpret_cast(&n), 24ul - (__builtin_clzll(n.c) >> 3)}; } -struct StringHashTableHash +inline size_t hash_string_key_24(uint64_t seed, const StringKey24 & v) { -#if defined(__SSE4_2__) - size_t ALWAYS_INLINE operator()(StringKey8 key) const - { - size_t res = -1ULL; - res = _mm_crc32_u64(res, key); - return res; - } - size_t ALWAYS_INLINE operator()(const StringKey16 & key) const - { - size_t res = -1ULL; - res = _mm_crc32_u64(res, key.low); - res = _mm_crc32_u64(res, key.high); - return res; - } - size_t ALWAYS_INLINE operator()(const StringKey24 & key) const + hash_combine(seed, v.a); + hash_combine(seed, v.b); + hash_combine(seed, v.c); + return seed; +} + +template <> +struct HashWithMixSeed +{ + 
static inline size_t operator()(const StringKey24 & v) { - size_t res = -1ULL; - res = _mm_crc32_u64(res, key.a); - res = _mm_crc32_u64(res, key.b); - res = _mm_crc32_u64(res, key.c); - return res; + return HashWithMixSeedHelper::operator()(hash_string_key_24(0, v)); } -#else - size_t ALWAYS_INLINE operator()(StringKey8 key) const +}; + +struct StringHashTableHash +{ + static size_t ALWAYS_INLINE operator()(StringKey8 key) { - return CityHash_v1_0_2::CityHash64(reinterpret_cast(&key), 8); + return HashWithMixSeed::operator()(key); } - size_t ALWAYS_INLINE operator()(const StringKey16 & key) const + static size_t ALWAYS_INLINE operator()(const StringKey16 & key) { - return CityHash_v1_0_2::CityHash64(reinterpret_cast(&key), 16); + return HashWithMixSeed::operator()(key); } - size_t ALWAYS_INLINE operator()(const StringKey24 & key) const + static size_t ALWAYS_INLINE operator()(const StringKey24 & key) { - return CityHash_v1_0_2::CityHash64(reinterpret_cast(&key), 24); + return HashWithMixSeed::operator()(key); } -#endif - size_t ALWAYS_INLINE operator()(StringRef key) const { return StringRefHash()(key); } + static size_t ALWAYS_INLINE operator()(const StringRef & key) { return StringRefHash()(key); } }; template @@ -150,6 +142,8 @@ struct StringHashTableEmpty //-V730 return hasZero() ? zeroValue() : nullptr; } + void ALWAYS_INLINE prefetch(size_t) {} + void write(DB::WriteBuffer & wb) const { zeroValue()->write(wb); } void writeText(DB::WriteBuffer & wb) const { zeroValue()->writeText(wb); } void read(DB::ReadBuffer & rb) { zeroValue()->read(rb); } @@ -157,6 +151,7 @@ struct StringHashTableEmpty //-V730 size_t size() const { return hasZero() ? 
1 : 0; } bool empty() const { return !hasZero(); } size_t getBufferSizeInBytes() const { return sizeof(Cell); } + size_t getBufferSizeInCells() const { return 1; } void setResizeCallback(const ResizeCallback &) {} size_t getCollisions() const { return 0; } }; @@ -364,6 +359,13 @@ class StringHashTable : private boost::noncopyable this->dispatch(*this, key_holder, EmplaceCallable(it, inserted)); } + // TODO del + template + void ALWAYS_INLINE emplace(KeyHolder &&, LookupResult &, bool &, size_t) + { + RUNTIME_CHECK_MSG(false, "shouldn't reach here, you should use submap::emplace instead"); + } + struct FindCallable { // find() doesn't need any key memory management, so we don't work with @@ -380,12 +382,35 @@ class StringHashTable : private boost::noncopyable } }; + // We will not prefetch StringHashTable directly, instead caller should call specific submap's prefetch. + // Because StringHashTable doesn't know which submap to prefetch. + void prefetch(size_t) const + { + RUNTIME_CHECK_MSG(false, "shouldn't reach here, you should use submap::prefetch instead"); + } + LookupResult ALWAYS_INLINE find(const Key & x) { return dispatch(*this, x, FindCallable{}); } ConstLookupResult ALWAYS_INLINE find(const Key & x) const { return dispatch(*this, x, FindCallable{}); } + // TODO del + LookupResult ALWAYS_INLINE find(const Key &, size_t) + { + RUNTIME_CHECK_MSG(false, "shouldn't reach here, you should use submap::find instead"); + } + ConstLookupResult ALWAYS_INLINE find(const Key &, size_t) const + { + RUNTIME_CHECK_MSG(false, "shouldn't reach here, you should use submap::find instead"); + } + bool ALWAYS_INLINE has(const Key & x, size_t = 0) const { return dispatch(*this, x, FindCallable{}) != nullptr; } + template + size_t ALWAYS_INLINE hash(const HashKeyType & key) const + { + return SubMaps::Hash::operator()(key); + } + void write(DB::WriteBuffer & wb) const { m0.write(wb); @@ -434,6 +459,11 @@ class StringHashTable : private boost::noncopyable bool empty() const { 
return m0.empty() && m1.empty() && m2.empty() && m3.empty() && ms.empty(); } + size_t getBufferSizeInCells() const + { + return m0.getBufferSizeInCells() + m1.getBufferSizeInCells() + m2.getBufferSizeInCells() + + m3.getBufferSizeInCells() + ms.getBufferSizeInCells(); + } size_t getBufferSizeInBytes() const { return m0.getBufferSizeInBytes() + m1.getBufferSizeInBytes() + m2.getBufferSizeInBytes() diff --git a/dbms/src/Common/HashTable/TwoLevelHashTable.h b/dbms/src/Common/HashTable/TwoLevelHashTable.h index 6778cd4a3e8..01c14dd07c2 100644 --- a/dbms/src/Common/HashTable/TwoLevelHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelHashTable.h @@ -285,6 +285,12 @@ class TwoLevelHashTable : private boost::noncopyable impls[buck].emplace(key_holder, it, inserted, hash_value); } + void ALWAYS_INLINE prefetch(size_t hashval) const + { + size_t buck = getBucketFromHash(hashval); + impls[buck].prefetch(hashval); + } + LookupResult ALWAYS_INLINE find(Key x, size_t hash_value) { size_t buck = getBucketFromHash(hash_value); @@ -352,6 +358,13 @@ class TwoLevelHashTable : private boost::noncopyable return true; } + size_t getBufferSizeInCells() const + { + size_t res = 0; + for (const auto & impl : impls) + res += impl.getBufferSizeInCells(); + return res; + } size_t getBufferSizeInBytes() const { size_t res = 0; diff --git a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h index 5bdb24a3d13..5608d0fd0f8 100644 --- a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h @@ -30,8 +30,20 @@ class TwoLevelStringHashTable : private boost::noncopyable static constexpr size_t NUM_BUCKETS = 1ULL << BITS_FOR_BUCKET; static constexpr size_t MAX_BUCKET = NUM_BUCKETS - 1; + template + size_t ALWAYS_INLINE hash(const HashKeyType & key) const + { + return SubMaps::Hash::operator()(key); + } + + // Same reason as StringHashTable::prefetch. 
+ void prefetch(size_t) const + { + RUNTIME_CHECK_MSG(false, "shouldn't reach here, you should use submap::prefetch instead"); + } + // TODO: currently hashing contains redundant computations when doing distributed or external aggregations - size_t hash(const Key & x) const + size_t hashStringRef(const Key & x) const { return const_cast(*this).dispatch(*this, x, [&](const auto &, const auto &, size_t hash) { return hash; @@ -44,7 +56,7 @@ class TwoLevelStringHashTable : private boost::noncopyable impl.setResizeCallback(resize_callback); } - size_t operator()(const Key & x) const { return hash(x); } + size_t operator()(const Key & x) const { return hashStringRef(x); } /// NOTE Bad for hash tables with more than 2^32 cells. static size_t getBucketFromHash(size_t hash_value) { return (hash_value >> (32 - BITS_FOR_BUCKET)) & MAX_BUCKET; } @@ -104,7 +116,6 @@ class TwoLevelStringHashTable : private boost::noncopyable #endif dispatch(Self & self, KeyHolder && key_holder, Func && func) { - StringHashTableHash hash; const StringRef & x = keyHolderGetKey(key_holder); const size_t sz = x.size; if (sz == 0) @@ -117,7 +128,7 @@ class TwoLevelStringHashTable : private boost::noncopyable { // Strings with trailing zeros are not representable as fixed-size // string keys. Put them to the generic table. 
- auto res = hash(x); + auto res = SubMaps::Hash::operator()(x); auto buck = getBucketFromHash(res); return func(self.impls[buck].ms, std::forward(key_holder), res); } @@ -154,7 +165,7 @@ class TwoLevelStringHashTable : private boost::noncopyable else n[0] <<= s; } - auto res = hash(k8); + auto res = SubMaps::Hash::operator()(k8); auto buck = getBucketFromHash(res); keyHolderDiscardKey(key_holder); return func(self.impls[buck].m1, k8, res); @@ -168,7 +179,7 @@ class TwoLevelStringHashTable : private boost::noncopyable n[1] >>= s; else n[1] <<= s; - auto res = hash(k16); + auto res = SubMaps::Hash::operator()(k16); auto buck = getBucketFromHash(res); keyHolderDiscardKey(key_holder); return func(self.impls[buck].m2, k16, res); @@ -182,14 +193,14 @@ class TwoLevelStringHashTable : private boost::noncopyable n[2] >>= s; else n[2] <<= s; - auto res = hash(k24); + auto res = SubMaps::Hash::operator()(k24); auto buck = getBucketFromHash(res); keyHolderDiscardKey(key_holder); return func(self.impls[buck].m3, k24, res); } default: { - auto res = hash(x); + auto res = SubMaps::Hash::operator()(x); auto buck = getBucketFromHash(res); return func(self.impls[buck].ms, std::forward(key_holder), res); } @@ -202,12 +213,27 @@ class TwoLevelStringHashTable : private boost::noncopyable dispatch(*this, key_holder, typename Impl::EmplaceCallable{it, inserted}); } - LookupResult ALWAYS_INLINE find(const Key x) { return dispatch(*this, x, typename Impl::FindCallable{}); } + template + void ALWAYS_INLINE emplace(KeyHolder &&, LookupResult &, bool &, size_t) + { + RUNTIME_CHECK_MSG(false, "shouldn't reach here, you should use submap::emplace instead"); + } + + LookupResult ALWAYS_INLINE find(const Key & x) { return dispatch(*this, x, typename Impl::FindCallable{}); } - ConstLookupResult ALWAYS_INLINE find(const Key x) const + ConstLookupResult ALWAYS_INLINE find(const Key & x) const { return dispatch(*this, x, typename Impl::FindCallable{}); } + LookupResult ALWAYS_INLINE find(const Key 
&, size_t) + { + RUNTIME_CHECK_MSG(false, "shouldn't reach here, you should use submap::find instead"); + } + + ConstLookupResult ALWAYS_INLINE find(const Key &, size_t) const + { + RUNTIME_CHECK_MSG(false, "shouldn't reach here, you should use submap::find instead"); + } void write(DB::WriteBuffer & wb) const { @@ -259,6 +285,13 @@ class TwoLevelStringHashTable : private boost::noncopyable return true; } + size_t getBufferSizeInCells() const + { + size_t res = 0; + for (const auto & impl : impls) + res = impl.getBufferSizeInCells(); + return res; + } size_t getBufferSizeInBytes() const { size_t res = 0; diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index f25c22717e8..180799bd7ed 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -665,23 +665,43 @@ void NO_INLINE Aggregator::executeImpl( { typename Method::State state(agg_process_info.key_columns, key_sizes, collators); - executeImplBatch(method, state, aggregates_pool, agg_process_info); + if (method.data.getBufferSizeInCells() < 8192) + executeImplBatch(method, state, aggregates_pool, agg_process_info); + else + executeImplBatch(method, state, aggregates_pool, agg_process_info); +} + +template +std::vector getHashVals(size_t start_row, size_t end_row, const Data & data, const State & state, + std::vector & sort_key_containers, Arena * pool) +{ + std::vector hashvals(state.total_rows, 0); + for (size_t i = start_row; i < end_row; ++i) + { + hashvals[i] = state.getHash(data, i, *pool, sort_key_containers); + } + return hashvals; } -template +template std::optional::ResultType> Aggregator::emplaceOrFindKey( Method & method, typename Method::State & state, size_t index, Arena & aggregates_pool, - std::vector & sort_key_containers) const + std::vector & sort_key_containers, + const std::vector & hashvals) const { try { if constexpr (only_lookup) - return state.findKey(method.data, index, aggregates_pool, sort_key_containers); + { + 
return state.template findKey(method.data, index, aggregates_pool, sort_key_containers, hashvals); + } else - return state.emplaceKey(method.data, index, aggregates_pool, sort_key_containers); + { + return state.template emplaceKey(method.data, index, aggregates_pool, sort_key_containers, hashvals); + } } catch (ResizeException &) { @@ -689,7 +709,7 @@ std::optional::Res } } -template +template ALWAYS_INLINE void Aggregator::executeImplBatch( Method & method, typename Method::State & state, @@ -712,14 +732,28 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( { /// For all rows. AggregateDataPtr place = aggregates_pool->alloc(0); + std::vector hashvals; + if constexpr (enable_prefetch) + { + hashvals = getHashVals( + agg_process_info.start_row, + agg_process_info.end_row, + method.data, + state, + sort_key_containers, + aggregates_pool); + + } + for (size_t i = 0; i < agg_size; ++i) { - auto emplace_result_hold = emplaceOrFindKey( + auto emplace_result_hold = emplaceOrFindKey( method, state, agg_process_info.start_row, *aggregates_pool, - sort_key_containers); + sort_key_containers, + hashvals); if likely (emplace_result_hold.has_value()) { if constexpr (collect_hit_rate) @@ -784,13 +818,24 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( std::unique_ptr places(new AggregateDataPtr[agg_size]); std::optional processed_rows; + std::vector hashvals; + if constexpr (enable_prefetch) + { + hashvals = getHashVals( + agg_process_info.start_row, + agg_process_info.end_row, + method.data, + state, + sort_key_containers, + aggregates_pool); + } for (size_t i = agg_process_info.start_row; i < agg_process_info.start_row + agg_size; ++i) { AggregateDataPtr aggregate_data = nullptr; auto emplace_result_holder - = emplaceOrFindKey(method, state, i, *aggregates_pool, sort_key_containers); + = emplaceOrFindKey(method, state, i, *aggregates_pool, sort_key_containers, hashvals); if unlikely (!emplace_result_holder.has_value()) { LOG_INFO(log, "HashTable resize throw 
ResizeException since the data is already marked for spill"); diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index 9515782793a..0f1365694ac 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -1454,20 +1454,21 @@ class Aggregator AggProcessInfo & agg_process_info, TiDB::TiDBCollators & collators) const; - template + template void executeImplBatch( Method & method, typename Method::State & state, Arena * aggregates_pool, AggProcessInfo & agg_process_info) const; - template + template std::optional::ResultType> emplaceOrFindKey( Method & method, typename Method::State & state, size_t index, Arena & aggregates_pool, - std::vector & sort_key_containers) const; + std::vector & sort_key_containers, + const std::vector & hashvals) const; /// For case when there are no keys (all aggregate into one row). static void executeWithoutKeyImpl(AggregatedDataWithoutKey & res, AggProcessInfo & agg_process_info, Arena * arena); From b3141662d11d44633e2fa9ac8dbf040b674718b5 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Wed, 27 Nov 2024 10:57:36 +0800 Subject: [PATCH 03/24] executeImplBatchStringHashMap done Signed-off-by: guo-shaoge --- dbms/src/Common/ColumnsHashingImpl.h | 54 +++- dbms/src/Common/HashTable/FixedHashTable.h | 2 + dbms/src/Common/HashTable/HashTable.h | 3 + dbms/src/Common/HashTable/SmallTable.h | 3 + dbms/src/Common/HashTable/StringHashTable.h | 164 +++++++++++- dbms/src/Common/HashTable/TwoLevelHashTable.h | 3 + .../HashTable/TwoLevelStringHashTable.h | 66 +++++ dbms/src/Interpreters/Aggregator.cpp | 237 ++++++++++++++++-- dbms/src/Interpreters/Aggregator.h | 17 ++ libs/libcommon/include/common/StringRef.h | 2 +- 10 files changed, 524 insertions(+), 27 deletions(-) diff --git a/dbms/src/Common/ColumnsHashingImpl.h b/dbms/src/Common/ColumnsHashingImpl.h index 24574ed40a4..0c8d0bc1a49 100644 --- a/dbms/src/Common/ColumnsHashingImpl.h +++ b/dbms/src/Common/ColumnsHashingImpl.h @@ -16,6 
+16,7 @@ #include #include +#include #include #include #include @@ -144,11 +145,11 @@ class HashMethodBase if (idx < hashvals.size()) data.prefetch(hashvals[idx]); - return emplaceImpl(key_holder, data, hashvals[row]); + return emplaceImpl(key_holder, data, hashvals[row]); } else { - return emplaceImpl(key_holder, data, 0); + return emplaceImpl(key_holder, data, 0); } } @@ -167,15 +168,52 @@ class HashMethodBase if (idx < hashvals.size()) data.prefetch(hashvals[idx]); - return findKeyImpl(keyHolderGetKey(key_holder), data, hashvals[row]); + return findKeyImpl(keyHolderGetKey(key_holder), data, hashvals[row]); } else { - return findKeyImpl(keyHolderGetKey(key_holder), data, 0); + return findKeyImpl(keyHolderGetKey(key_holder), data, 0); } } + template + ALWAYS_INLINE inline EmplaceResult emplaceStringKey( + Data & data, + size_t idx, + const std::vector & datas, + const std::vector & hashvals) + { + auto & submap = typename StringHashTableSubMapSelector>::getSubMap(data); + if constexpr (enable_prefetch) + { + const auto prefetch_idx = idx + prefetch_step; + if (prefetch_idx < hashvals.size()) + submap.prefetch(hashvals[prefetch_idx]); + } + + return emplaceImpl(datas[idx], submap, hashvals[idx]); + } + + // TODO Macro with emplaceStringKey + template + ALWAYS_INLINE inline FindResult findStringKey( + Data & data, + size_t idx, + const std::vector & datas, + const std::vector & hashvals) + { + auto & submap = typename StringHashTableSubMapSelector>::getSubMap(data); + if constexpr (enable_prefetch) + { + const auto prefetch_idx = idx + prefetch_step; + if (prefetch_idx < hashvals.size()) + submap.prefetch(hashvals[prefetch_idx]); + } + + return findKeyImpl(datas[idx], submap, hashvals[idx]); + } + template ALWAYS_INLINE inline size_t getHash( const Data & data, @@ -205,7 +243,7 @@ class HashMethodBase } } - template + template ALWAYS_INLINE inline EmplaceResult emplaceImpl(KeyHolder & key_holder, Data & data, size_t hashval) { if constexpr 
(Cache::consecutive_keys_optimization) @@ -222,7 +260,7 @@ class HashMethodBase typename Data::LookupResult it; bool inserted = false; - if constexpr (enable_prefetch) + if constexpr (use_hashval) data.emplace(key_holder, it, inserted, hashval); else data.emplace(key_holder, it, inserted); @@ -262,7 +300,7 @@ class HashMethodBase return EmplaceResult(inserted); } - template + template ALWAYS_INLINE inline FindResult findKeyImpl(Key key, Data & data, size_t hashval) { if constexpr (Cache::consecutive_keys_optimization) @@ -277,7 +315,7 @@ class HashMethodBase } typename Data::LookupResult it; - if constexpr (enable_prefetch) + if constexpr (use_hashval) it = data.find(key, hashval); else it = data.find(key); diff --git a/dbms/src/Common/HashTable/FixedHashTable.h b/dbms/src/Common/HashTable/FixedHashTable.h index cfa562667dc..8b0b721aa8c 100644 --- a/dbms/src/Common/HashTable/FixedHashTable.h +++ b/dbms/src/Common/HashTable/FixedHashTable.h @@ -221,6 +221,8 @@ class FixedHashTable using LookupResult = Cell *; using ConstLookupResult = const Cell *; + static constexpr bool is_string_hash_map = false; + static constexpr bool is_two_level = false; size_t hash(const Key & x) const { return x; } diff --git a/dbms/src/Common/HashTable/HashTable.h b/dbms/src/Common/HashTable/HashTable.h index 4f037f60019..12ebc49756c 100644 --- a/dbms/src/Common/HashTable/HashTable.h +++ b/dbms/src/Common/HashTable/HashTable.h @@ -402,6 +402,9 @@ class HashTable using Grower = GrowerType; using Allocator = AllocatorType; + static constexpr bool is_string_hash_map = false; + static constexpr bool is_two_level = false; + protected: friend class const_iterator; friend class iterator; diff --git a/dbms/src/Common/HashTable/SmallTable.h b/dbms/src/Common/HashTable/SmallTable.h index a032ae76cff..1292a4205da 100644 --- a/dbms/src/Common/HashTable/SmallTable.h +++ b/dbms/src/Common/HashTable/SmallTable.h @@ -85,6 +85,9 @@ class SmallTable using value_type = typename Cell::value_type; using 
cell_type = Cell; + static constexpr bool is_string_hash_map = false; + static constexpr bool is_two_level = false; + class Reader final : private Cell::State { public: diff --git a/dbms/src/Common/HashTable/StringHashTable.h b/dbms/src/Common/HashTable/StringHashTable.h index e11972d0795..f906b043a9e 100644 --- a/dbms/src/Common/HashTable/StringHashTable.h +++ b/dbms/src/Common/HashTable/StringHashTable.h @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -66,19 +67,24 @@ struct HashWithMixSeed struct StringHashTableHash { + using StringKey8Hasher = HashWithMixSeed; + using StringKey16Hasher = HashWithMixSeed; + using StringKey24Hasher = HashWithMixSeed; + using StringRefHasher = StringRefHash; + static size_t ALWAYS_INLINE operator()(StringKey8 key) { - return HashWithMixSeed::operator()(key); + return StringKey8Hasher::operator()(key); } static size_t ALWAYS_INLINE operator()(const StringKey16 & key) { - return HashWithMixSeed::operator()(key); + return StringKey16Hasher::operator()(key); } static size_t ALWAYS_INLINE operator()(const StringKey24 & key) { - return HashWithMixSeed::operator()(key); + return StringKey24Hasher::operator()(key); } - static size_t ALWAYS_INLINE operator()(const StringRef & key) { return StringRefHash()(key); } + static size_t ALWAYS_INLINE operator()(const StringRef & key) { return StringRefHasher::operator()(key); } }; template @@ -185,6 +191,92 @@ struct StringHashTableLookupResult friend bool operator!=(const std::nullptr_t &, const StringHashTableLookupResult & b) { return b.mapped_ptr; } }; + template + static auto +#if defined(ADDRESS_SANITIZER) || defined(THREAD_SANITIZER) + NO_INLINE NO_SANITIZE_ADDRESS NO_SANITIZE_THREAD +#else + ALWAYS_INLINE +#endif + dispatchStringHashTable(size_t row, KeyHolder && key_holder, Func0 && func0, Func8 && func8, Func16 && func16, Func24 && func24, FuncStr && func_str) + { + const StringRef & x = keyHolderGetKey(key_holder); + const size_t sz = x.size; + if (sz == 0) + { + 
return func0(x, row); + } + + if (x.data[sz - 1] == 0) + { + // Strings with trailing zeros are not representable as fixed-size + // string keys. Put them to the generic table. + return func_str(key_holder, row); + } + + const char * p = x.data; + // pending bits that needs to be shifted out + const char s = (-sz & 7) * 8; + union + { + StringKey8 k8; + StringKey16 k16; + StringKey24 k24; + UInt64 n[3]; + }; + switch ((sz - 1) >> 3) + { + case 0: // 1..8 bytes + { + // first half page + if ((reinterpret_cast(p) & 2048) == 0) + { + memcpy(&n[0], p, 8); + if constexpr (DB::isLittleEndian()) + n[0] &= (-1ULL >> s); + else + n[0] &= (-1ULL << s); + } + else + { + const char * lp = x.data + x.size - 8; + memcpy(&n[0], lp, 8); + if constexpr (DB::isLittleEndian()) + n[0] >>= s; + else + n[0] <<= s; + } + return func8(k8, row); + } + case 1: // 9..16 bytes + { + memcpy(&n[0], p, 8); + const char * lp = x.data + x.size - 8; + memcpy(&n[1], lp, 8); + if constexpr (DB::isLittleEndian()) + n[1] >>= s; + else + n[1] <<= s; + return func16(k16, row); + } + case 2: // 17..24 bytes + { + memcpy(&n[0], p, 16); + const char * lp = x.data + x.size - 8; + memcpy(&n[2], lp, 8); + if constexpr (DB::isLittleEndian()) + n[2] >>= s; + else + n[2] <<= s; + return func24(k24, row); + } + default: // >= 25 bytes + { + return func_str(key_holder, row); + } + } + } + template class StringHashTable : private boost::noncopyable { @@ -221,6 +313,9 @@ class StringHashTable : private boost::noncopyable using LookupResult = StringHashTableLookupResult; using ConstLookupResult = StringHashTableLookupResult; + static constexpr bool is_string_hash_map = true; + static constexpr bool is_two_level = false; + StringHashTable() = default; explicit StringHashTable(size_t reserve_for_num_elements) @@ -488,3 +583,64 @@ class StringHashTable : private boost::noncopyable ms.clearAndShrink(); } }; + +template +struct StringHashTableSubMapSelector; + +template +struct StringHashTableSubMapSelector<0, false, Data> 
+{ + struct Hash + { + static ALWAYS_INLINE size_t operator()(const StringRef & ) { return 0; } + }; + + typename Data::T0 & getSubMap(size_t, Data & data) + { + return data.m0; + } +}; + +template +struct StringHashTableSubMapSelector<1, false, Data> +{ + using Hash = StringHashTableHash::StringKey8Hasher; + + typename Data::T1 & getSubMap(size_t, Data & data) + { + return data.m1; + } +}; + +template +struct StringHashTableSubMapSelector<2, false, Data> +{ + using Hash = StringHashTableHash::StringKey16Hasher; + + typename Data::T2 & getSubMap(size_t, Data & data) + { + return data.m2; + } +}; + +template +struct StringHashTableSubMapSelector<3, false, Data> +{ + using Hash = StringHashTableHash::StringKey24Hasher; + + typename Data::T3 & getSubMap(size_t, Data & data) + { + return data.m3; + } +}; + +template +struct StringHashTableSubMapSelector<4, false, Data> +{ + using Hash = StringHashTableHash::StringRefHasher; + + typename Data::Ts & getSubMap(size_t, Data & data) + { + return data.ms; + } +}; diff --git a/dbms/src/Common/HashTable/TwoLevelHashTable.h b/dbms/src/Common/HashTable/TwoLevelHashTable.h index 01c14dd07c2..75a5402363d 100644 --- a/dbms/src/Common/HashTable/TwoLevelHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelHashTable.h @@ -60,6 +60,9 @@ class TwoLevelHashTable : private boost::noncopyable static constexpr size_t NUM_BUCKETS = 1ULL << BITS_FOR_BUCKET; static constexpr size_t MAX_BUCKET = NUM_BUCKETS - 1; + static constexpr bool is_string_hash_map = false; + static constexpr bool is_two_level = true; + size_t hash(const Key & x) const { return Hash::operator()(x); } /// NOTE Bad for hash tables with more than 2^32 cells. 
diff --git a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h index 5608d0fd0f8..d217e0c0260 100644 --- a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h @@ -30,6 +30,9 @@ class TwoLevelStringHashTable : private boost::noncopyable static constexpr size_t NUM_BUCKETS = 1ULL << BITS_FOR_BUCKET; static constexpr size_t MAX_BUCKET = NUM_BUCKETS - 1; + static constexpr bool is_string_hash_map = true; + static constexpr bool is_two_level = true; + template size_t ALWAYS_INLINE hash(const HashKeyType & key) const { @@ -301,3 +304,66 @@ class TwoLevelStringHashTable : private boost::noncopyable return res; } }; + +template +struct StringHashTableSubMapSelector<0, true, Data> +{ + struct Hash + { + static ALWAYS_INLINE size_t operator()(const StringRef & ) { return 0; } + }; + + typename Data::T0 & getSubMap(size_t hashval, Data & data) + { + const auto bucket = Data::getBucketFromHash(hashval); + return data.impls[bucket].m0; + } +}; + +template +struct StringHashTableSubMapSelector<1, true, Data> +{ + using Hash = StringHashTableHash::StringKey8Hasher; + + typename Data::T1 & getSubMap(size_t hashval, Data & data) + { + const auto bucket = Data::getBucketFromHash(hashval); + return data.impls[bucket].m1; + } +}; + +template +struct StringHashTableSubMapSelector<2, true, Data> +{ + using Hash = StringHashTableHash::StringKey16Hasher; + + typename Data::T2 & getSubMap(size_t hashval, Data & data) + { + const auto bucket = Data::getBucketFromHash(hashval); + return data.impls[bucket].m2; + } +}; + +template +struct StringHashTableSubMapSelector<3, true, Data> +{ + using Hash = StringHashTableHash::StringKey24Hasher; + + typename Data::T3 & getSubMap(size_t hashval, Data & data) + { + const auto bucket = Data::getBucketFromHash(hashval); + return data.impls[bucket].m3; + } +}; + +template +struct StringHashTableSubMapSelector<4, true, Data> +{ + using Hash = 
StringHashTableHash::StringRefHasher; + + typename Data::Ts & getSubMap(size_t hashval, Data & data) + { + const auto bucket = Data::getBucketFromHash(hashval); + return data.impls[bucket].ms; + } +}; diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 180799bd7ed..54cf52c673d 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -695,13 +695,9 @@ std::optional::Res try { if constexpr (only_lookup) - { return state.template findKey(method.data, index, aggregates_pool, sort_key_containers, hashvals); - } else - { return state.template emplaceKey(method.data, index, aggregates_pool, sort_key_containers, hashvals); - } } catch (ResizeException &) { @@ -709,6 +705,73 @@ std::optional::Res } } +// StringKeyType can be StringRef/StringKey8/StringKey16/StringKey24/ArenaKeyHolder. +// return true when resize exception happens. +template +bool Aggregator::emplaceOrFindStringKey( + typename Method::Data & data, + typename Method::State & state, + const std::vector & key_infos, + const std::vector & key_datas, + Arena & aggregates_pool, + std::vector & places, + AggProcessInfo & agg_process_info) const +{ + RUNTIME_CHECK(key_infos.size() == key_datas.size()); + + using Hash = typename StringHashTableSubMapSelector>::Hash; + std::vector hashvals(key_infos.size(), 0); + for (size_t i = 0; i < key_infos.size(); ++i) + { + hashvals[i] = Hash::operator()(keyHolderGetKey(key_datas[0])); + } + + AggregateDataPtr agg_state = nullptr; + for (size_t i = 0; i < key_infos.size(); ++i) + { + try + { + if constexpr (only_lookup) + { + auto find_result = state.template findStringKey(data, i, key_datas, hashvals); + if (find_result.isFound()) + { + agg_state = find_result.getMapped(); + } + else + { + agg_process_info.not_found_rows.push_back(key_infos[i]); + } + } + else + { + auto emplace_result = state.template emplaceStringKey(data, i, key_datas, hashvals); + if (emplace_result.isInserted()) + { + 
emplace_result.setMapped(nullptr); + + agg_state = aggregates_pool.alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(agg_state); + + emplace_result.setMapped(agg_state); + } + else + { + agg_state = emplace_result.getMapped(); + } + places.push_back(agg_state); + } + } + catch (ResizeException &) + { + // agg_process_info.set + // TODO handle exception + return true; + } + } + return false; +} + template ALWAYS_INLINE void Aggregator::executeImplBatch( Method & method, @@ -721,10 +784,10 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( std::vector sort_key_containers; sort_key_containers.resize(params.keys_size, ""); - size_t agg_size = agg_process_info.end_row - agg_process_info.start_row; + size_t rows = agg_process_info.end_row - agg_process_info.start_row; fiu_do_on(FailPoints::force_agg_on_partial_block, { - if (agg_size > 0 && agg_process_info.start_row == 0) - agg_size = std::max(agg_size / 2, 1); + if (rows > 0 && agg_process_info.start_row == 0) + rows = std::max(rows / 2, 1); }); /// Optimization for special case when there are no aggregate functions. @@ -745,7 +808,7 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( } - for (size_t i = 0; i < agg_size; ++i) + for (size_t i = 0; i < rows; ++i) { auto emplace_result_hold = emplaceOrFindKey( method, @@ -789,7 +852,7 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( { inst->batch_that->addBatchLookupTable8( agg_process_info.start_row, - agg_size, + rows, reinterpret_cast(method.data.data()), inst->state_offset, [&](AggregateDataPtr & aggregate_data) { @@ -801,12 +864,12 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( inst->batch_arguments, aggregates_pool); } - agg_process_info.start_row += agg_size; + agg_process_info.start_row += rows; // For key8, assume all rows are hit. No need to do state switch for auto pass through hashagg. // Because HashMap of key8 is basically a vector of size 256. 
if constexpr (collect_hit_rate) - agg_process_info.hit_row_cnt = agg_size; + agg_process_info.hit_row_cnt = rows; // Because all rows are hit, so state will not switch to Selective. if constexpr (only_lookup) @@ -815,8 +878,7 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( } /// Generic case. - - std::unique_ptr places(new AggregateDataPtr[agg_size]); + std::unique_ptr places(new AggregateDataPtr[rows]); std::optional processed_rows; std::vector hashvals; if constexpr (enable_prefetch) @@ -830,7 +892,7 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( aggregates_pool); } - for (size_t i = agg_process_info.start_row; i < agg_process_info.start_row + agg_size; ++i) + for (size_t i = agg_process_info.start_row; i < agg_process_info.start_row + rows; ++i) { AggregateDataPtr aggregate_data = nullptr; @@ -899,6 +961,153 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( } } +// Emplace key into StringHashMap/TwoLevelStringHashMap is seperated from other situations, +// because it's easy to implement prefetch submap directly. +// TODO not support resize execption +template +ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( + Method & method, + typename Method::State & state, + Arena * aggregates_pool, + AggProcessInfo & agg_process_info) const +{ + // collect_hit_rate and only_lookup cannot be true at the same time. 
+ static_assert(!(collect_hit_rate && only_lookup)); + static_assert(Method::Data::isStringHashMap); + + std::vector sort_key_containers; + sort_key_containers.resize(params.keys_size, ""); + + const size_t rows = agg_process_info.end_row = agg_process_info.start_row; + RUNTIME_CHECK_MSG(rows == state.total_rows, "executeImplBatchStringHashMap only handle resize exception for each Block instead of row"); + const size_t reserve_size = rows / 4; + + std::vector key0_infos; + std::vector key0_datas; + key0_infos.reserve(reserve_size); + key0_datas.reserve(reserve_size); + + std::vector key8_infos; + std::vector key8_datas; + key8_infos.reserve(reserve_size); + key8_datas.reserve(reserve_size); + + std::vector key16_infos; + std::vector key16_datas; + key16_infos.reserve(reserve_size); + key16_datas.reserve(reserve_size); + + std::vector key24_infos; + std::vector key24_datas; + key24_infos.reserve(reserve_size); + key24_datas.reserve(reserve_size); + + std::vector key_str_infos; + std::vector key_str_datas; + key_str_infos.reserve(reserve_size); + key_str_datas.reserve(reserve_size); + + auto dispatch_callback_key0 = [&key0_infos, &key0_datas](const StringRef & key, size_t row) { + key0_infos.push_back(row); + key0_datas.push_back(key); + }; + auto dispatch_callback_key8 = [&key8_infos, &key8_datas](const StringKey8 & key, size_t row) { + key8_infos.push_back(row); + key8_datas.push_back(key); + }; + auto dispatch_callback_key16 = [&key16_infos, &key16_datas](const StringKey16 & key, size_t row) { + key16_infos.push_back(row); + key16_datas.push_back(key); + }; + auto dispatch_callback_key24 = [&key24_infos, &key24_datas](const StringKey24 & key, size_t row) { + key24_infos.push_back(row); + key24_datas.push_back(key); + }; + // Argument type is ArenaKeyHolder instead of StringRef, + // because it will only be persisted when insert into HashTable. 
+ auto dispatch_callback_key_str = [&key_str_infos, &key_str_datas](const ArenaKeyHolder & key, size_t row) { + key_str_infos.push_back(row); + key_str_datas.push_back(key); + }; + for (size_t i = 0; i < rows; ++i) + { + auto key_holder = state.getKeyHolder(i, aggregates_pool, sort_key_containers); + dispatchStringHashTable(key_holder, + dispatch_callback_key0, + dispatch_callback_key8, + dispatch_callback_key16, + dispatch_callback_key24, + dispatch_callback_key_str); + } + + std::vector key0_places; + key0_places.reserve(key0_infos.size()); + + std::vector key8_places; + key8_places.reserve(key8_infos.size()); + + std::vector key16_places; + key16_places.reserve(key16_infos.size()); + + std::vector key24_places; + key24_places.reserve(key24_infos.size()); + + std::vector key_str_places; + key_str_places.reserve(key_str_infos.size()); + + if (!key0_infos.empty()) + { + emplaceOrFindStringKey<0, false>(method.data, state, key0_infos, key0_datas, aggregates_pool, key0_places, agg_process_info); + } + +#define M(INDEX, INFO, DATA, PLACES) \ + if (!(INFO).empty()) \ + { \ + if constexpr (enable_prefetch) \ + emplaceOrFindStringKey(method.data, state, INFO, DATA, aggregates_pool, PLACES, agg_process_info); \ + else \ + emplaceOrFindStringKey(method.data, state, INFO, DATA, aggregates_pool, PLACES, agg_process_info); \ + } + + M(1, key8_infos, key8_datas, key8_places) + M(2, key16_infos, key16_datas, key16_places) + M(3, key24_infos, key24_datas, key24_places) + M(4, key_str_infos, key_str_datas, key_str_places) +#undef M + + RUNTIME_CHECK(rows == key0_places.size() + key8_places.size() + key16_places.size() + key24_places.size() + key_str_places.size()); + + std::vector places(rows, nullptr); + +#define M(INFO, PLACES) \ + for (size_t i = 0; i < (INFO).size(); ++i) \ + { \ + const auto row = (INFO)[i]; \ + places[row] = (PLACES)[i]; \ + } + + M(key0_infos, key0_places) + M(key8_infos, key8_places) + M(key16_infos, key16_places) + M(key24_infos, key24_places) + 
M(key_str_infos, key_str_places) +#undef M + + + for (AggregateFunctionInstruction * inst = agg_process_info.aggregate_functions_instructions.data(); inst->that; + ++inst) + { + inst->batch_that->addBatch( + agg_process_info.start_row, + rows, + &places[0], + inst->state_offset, + inst->batch_arguments, + aggregates_pool); + } + agg_process_info.start_row = rows; +} + void NO_INLINE Aggregator::executeWithoutKeyImpl(AggregatedDataWithoutKey & res, AggProcessInfo & agg_process_info, Arena * arena) { diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index 0f1365694ac..6cbbde71b41 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -1461,6 +1461,13 @@ class Aggregator Arena * aggregates_pool, AggProcessInfo & agg_process_info) const; + template + void executeImplBatchStringHashMap( + Method & method, + typename Method::State & state, + Arena * aggregates_pool, + AggProcessInfo & agg_process_info) const; + template std::optional::ResultType> emplaceOrFindKey( Method & method, @@ -1470,6 +1477,16 @@ class Aggregator std::vector & sort_key_containers, const std::vector & hashvals) const; + template + bool emplaceOrFindStringKey( + typename Method::Data & data, + typename Method::State & state, + const std::vector & key_infos, + const std::vector & key_datas, + Arena & aggregates_pool, + std::vector & places, + AggProcessInfo & agg_process_info) const; + /// For case when there are no keys (all aggregate into one row). 
static void executeWithoutKeyImpl(AggregatedDataWithoutKey & res, AggProcessInfo & agg_process_info, Arena * arena); diff --git a/libs/libcommon/include/common/StringRef.h b/libs/libcommon/include/common/StringRef.h index a87b54a7670..bf1ab026a49 100644 --- a/libs/libcommon/include/common/StringRef.h +++ b/libs/libcommon/include/common/StringRef.h @@ -180,7 +180,7 @@ inline size_t hashLessThan8(const char * data, size_t size) struct CRC32Hash { - size_t operator()(StringRef x) const + static size_t operator()(const StringRef & x) { const char * pos = x.data; size_t size = x.size; From ec6e89231d74bee245cecd6b46f1254bc45b28fe Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Wed, 27 Nov 2024 15:21:31 +0800 Subject: [PATCH 04/24] handle resize exception done Signed-off-by: guo-shaoge --- .../AggregateFunctionGroupUniqArray.h | 6 +- .../src/AggregateFunctions/KeyHolderHelpers.h | 2 +- dbms/src/Common/ColumnsHashing.h | 6 +- dbms/src/Common/ColumnsHashingImpl.h | 32 +- dbms/src/Common/HashTable/Hash.h | 40 +- dbms/src/Common/HashTable/HashTable.h | 2 +- .../src/Common/HashTable/HashTableKeyHolder.h | 8 +- dbms/src/Common/HashTable/StringHashMap.h | 31 +- dbms/src/Common/HashTable/StringHashTable.h | 190 +++++----- .../HashTable/TwoLevelStringHashTable.h | 12 +- dbms/src/Interpreters/Aggregator.cpp | 356 +++++++++++------- dbms/src/Interpreters/Aggregator.h | 52 ++- 12 files changed, 414 insertions(+), 323 deletions(-) diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h index 06dd57edf66..d3cbea74195 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h @@ -182,18 +182,18 @@ class AggregateFunctionGroupUniqArrayGeneric { // We have to copy the keys to our arena. 
assert(arena != nullptr); - cur_set.emplace(ArenaKeyHolder{rhs_elem.getValue(), *arena}, it, inserted); + cur_set.emplace(ArenaKeyHolder{rhs_elem.getValue(), arena}, it, inserted); } } void insertResultInto(ConstAggregateDataPtr __restrict place, IColumn & to, Arena *) const override { - ColumnArray & arr_to = assert_cast(to); + auto & arr_to = assert_cast(to); ColumnArray::Offsets & offsets_to = arr_to.getOffsets(); IColumn & data_to = arr_to.getData(); auto & set = this->data(place).value; - offsets_to.push_back((offsets_to.size() == 0 ? 0 : offsets_to.back()) + set.size()); + offsets_to.push_back((offsets_to.empty() ? 0 : offsets_to.back()) + set.size()); for (auto & elem : set) deserializeAndInsert(elem.getValue(), data_to); diff --git a/dbms/src/AggregateFunctions/KeyHolderHelpers.h b/dbms/src/AggregateFunctions/KeyHolderHelpers.h index 6677866f0d3..b8a4ee0def3 100644 --- a/dbms/src/AggregateFunctions/KeyHolderHelpers.h +++ b/dbms/src/AggregateFunctions/KeyHolderHelpers.h @@ -24,7 +24,7 @@ inline auto getKeyHolder(const IColumn & column, size_t row_num, Arena & arena) { if constexpr (is_plain_column) { - return ArenaKeyHolder{column.getDataAt(row_num), arena}; + return ArenaKeyHolder{column.getDataAt(row_num), &arena}; } else { diff --git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index e14a793567c..aabe0733f8c 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -135,7 +135,7 @@ struct HashMethodString { if (likely(collator)) key = collator->sortKey(key.data, key.size, sort_key_containers[0]); - return ArenaKeyHolder{key, *pool}; + return ArenaKeyHolder{key, pool}; } else { @@ -172,7 +172,7 @@ struct HashMethodStringBin auto last_offset = row == 0 ? 
0 : offsets[row - 1]; StringRef key(chars + last_offset, offsets[row] - last_offset - 1); key = BinCollatorSortKey(key.data, key.size); - return ArenaKeyHolder{key, *pool}; + return ArenaKeyHolder{key, pool}; } protected: @@ -425,7 +425,7 @@ struct HashMethodFixedString if constexpr (place_string_to_arena) { - return ArenaKeyHolder{key, *pool}; + return ArenaKeyHolder{key, pool}; } else { diff --git a/dbms/src/Common/ColumnsHashingImpl.h b/dbms/src/Common/ColumnsHashingImpl.h index 0c8d0bc1a49..aa583f1a722 100644 --- a/dbms/src/Common/ColumnsHashingImpl.h +++ b/dbms/src/Common/ColumnsHashingImpl.h @@ -16,8 +16,8 @@ #include #include -#include #include +#include #include #include #include @@ -174,17 +174,19 @@ class HashMethodBase { return findKeyImpl(keyHolderGetKey(key_holder), data, 0); } - } + // TODO emplaceStringKey merge with emplaceKey? template ALWAYS_INLINE inline EmplaceResult emplaceStringKey( - Data & data, - size_t idx, - const std::vector & datas, - const std::vector & hashvals) + Data & data, + size_t idx, + std::vector & datas, // TODO const + const std::vector & hashvals) { - auto & submap = typename StringHashTableSubMapSelector>::getSubMap(data); + auto & submap = StringHashTableSubMapSelector>::getSubMap( + hashvals[idx], + data); if constexpr (enable_prefetch) { const auto prefetch_idx = idx + prefetch_step; @@ -198,12 +200,14 @@ class HashMethodBase // TODO Macro with emplaceStringKey template ALWAYS_INLINE inline FindResult findStringKey( - Data & data, - size_t idx, - const std::vector & datas, - const std::vector & hashvals) + Data & data, + size_t idx, + std::vector & datas, // TODO const + const std::vector & hashvals) { - auto & submap = typename StringHashTableSubMapSelector>::getSubMap(data); + auto & submap = StringHashTableSubMapSelector>::getSubMap( + hashvals[idx], + data); if constexpr (enable_prefetch) { const auto prefetch_idx = idx + prefetch_step; @@ -211,7 +215,7 @@ class HashMethodBase 
submap.prefetch(hashvals[prefetch_idx]); } - return findKeyImpl(datas[idx], submap, hashvals[idx]); + return findKeyImpl(keyHolderGetKey(datas[idx]), submap, hashvals[idx]); } template @@ -301,7 +305,7 @@ class HashMethodBase } template - ALWAYS_INLINE inline FindResult findKeyImpl(Key key, Data & data, size_t hashval) + ALWAYS_INLINE inline FindResult findKeyImpl(Key & key, Data & data, size_t hashval) { if constexpr (Cache::consecutive_keys_optimization) { diff --git a/dbms/src/Common/HashTable/Hash.h b/dbms/src/Common/HashTable/Hash.h index 883ec8ab6ff..207919a347e 100644 --- a/dbms/src/Common/HashTable/Hash.h +++ b/dbms/src/Common/HashTable/Hash.h @@ -130,8 +130,8 @@ inline DB::UInt64 wideIntHashCRC32(const T & x, DB::UInt64 updated_value) return updated_value; } static_assert( - DB::IsDecimal< - T> || is_boost_number_v || std::is_same_v || std::is_same_v || std::is_same_v); + DB::IsDecimal || is_boost_number_v || std::is_same_v || std::is_same_v + || std::is_same_v); __builtin_unreachable(); } @@ -244,8 +244,8 @@ inline size_t defaultHash64(const std::enable_if_t, T> & key return boost::multiprecision::hash_value(key); } static_assert( - is_boost_number_v< - T> || std::is_same_v || std::is_same_v || std::is_same_v); + is_boost_number_v || std::is_same_v || std::is_same_v + || std::is_same_v); __builtin_unreachable(); } @@ -297,20 +297,26 @@ inline size_t hashCRC32(const std::enable_if_t, T> & key) template struct HashCRC32; -#define DEFINE_HASH(T) \ - template <> \ - struct HashCRC32 \ - { \ - static_assert(is_fit_register); \ - size_t operator()(T key) const { return hashCRC32(key); } \ +#define DEFINE_HASH(T) \ + template <> \ + struct HashCRC32 \ + { \ + static_assert(is_fit_register); \ + size_t operator()(T key) const \ + { \ + return hashCRC32(key); \ + } \ }; -#define DEFINE_HASH_WIDE(T) \ - template <> \ - struct HashCRC32 \ - { \ - static_assert(!is_fit_register); \ - size_t operator()(const T & key) const { return hashCRC32(key); } \ +#define 
DEFINE_HASH_WIDE(T) \ + template <> \ + struct HashCRC32 \ + { \ + static_assert(!is_fit_register); \ + size_t operator()(const T & key) const \ + { \ + return hashCRC32(key); \ + } \ }; DEFINE_HASH(DB::UInt8) @@ -535,7 +541,7 @@ struct HashWithMixSeed template <> struct HashWithMixSeed -{ +{ static inline size_t operator()(const DB::UInt256 & v) { return HashWithMixSeedHelper::operator()(hash_uint256(0, v)); diff --git a/dbms/src/Common/HashTable/HashTable.h b/dbms/src/Common/HashTable/HashTable.h index 12ebc49756c..f8d44e8c406 100644 --- a/dbms/src/Common/HashTable/HashTable.h +++ b/dbms/src/Common/HashTable/HashTable.h @@ -859,7 +859,7 @@ class HashTable (void)hashval; #if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) size_t place_value = grower.place(hashval); - __mm_prefetch((const char*)(&buf[place_value]), _MM_HINT_NTA); + __mm_prefetch((const char *)(&buf[place_value]), _MM_HINT_NTA); #elif defined(__GNUC__) size_t place_value = grower.place(hashval); __builtin_prefetch(static_cast(&buf[place_value])); diff --git a/dbms/src/Common/HashTable/HashTableKeyHolder.h b/dbms/src/Common/HashTable/HashTableKeyHolder.h index 01b06dce87d..dd8a4b53376 100644 --- a/dbms/src/Common/HashTable/HashTableKeyHolder.h +++ b/dbms/src/Common/HashTable/HashTableKeyHolder.h @@ -91,8 +91,8 @@ namespace DB */ struct ArenaKeyHolder { - StringRef key; - Arena & pool; + StringRef key{}; + Arena * pool = nullptr; }; } // namespace DB @@ -111,14 +111,14 @@ inline void ALWAYS_INLINE keyHolderPersistKey(DB::ArenaKeyHolder & holder) { // Hash table shouldn't ask us to persist a zero key assert(holder.key.size > 0); - holder.key.data = holder.pool.insert(holder.key.data, holder.key.size); + holder.key.data = holder.pool->insert(holder.key.data, holder.key.size); } inline void ALWAYS_INLINE keyHolderPersistKey(DB::ArenaKeyHolder && holder) { // Hash table shouldn't ask us to persist a zero key assert(holder.key.size > 0); - holder.key.data = holder.pool.insert(holder.key.data, 
holder.key.size); + holder.key.data = holder.pool->insert(holder.key.data, holder.key.size); } inline void ALWAYS_INLINE keyHolderDiscardKey(DB::ArenaKeyHolder &) {} diff --git a/dbms/src/Common/HashTable/StringHashMap.h b/dbms/src/Common/HashTable/StringHashMap.h index cad653907fa..a070f0ef0a9 100644 --- a/dbms/src/Common/HashTable/StringHashMap.h +++ b/dbms/src/Common/HashTable/StringHashMap.h @@ -92,30 +92,13 @@ struct StringHashMapSubMaps { using Hash = StringHashTableHash; using T0 = StringHashTableEmpty>; - using T1 = HashMapTable< - StringKey8, - StringHashMapCell, - Hash, - StringHashTableGrower<>, - Allocator>; - using T2 = HashMapTable< - StringKey16, - StringHashMapCell, - Hash, - StringHashTableGrower<>, - Allocator>; - using T3 = HashMapTable< - StringKey24, - StringHashMapCell, - Hash, - StringHashTableGrower<>, - Allocator>; - using Ts = HashMapTable< - StringRef, - StringHashMapCell, - Hash, - StringHashTableGrower<>, - Allocator>; + using T1 + = HashMapTable, Hash, StringHashTableGrower<>, Allocator>; + using T2 + = HashMapTable, Hash, StringHashTableGrower<>, Allocator>; + using T3 + = HashMapTable, Hash, StringHashTableGrower<>, Allocator>; + using Ts = HashMapTable, Hash, StringHashTableGrower<>, Allocator>; }; template diff --git a/dbms/src/Common/HashTable/StringHashTable.h b/dbms/src/Common/HashTable/StringHashTable.h index f906b043a9e..a511ce47671 100644 --- a/dbms/src/Common/HashTable/StringHashTable.h +++ b/dbms/src/Common/HashTable/StringHashTable.h @@ -72,18 +72,9 @@ struct StringHashTableHash using StringKey24Hasher = HashWithMixSeed; using StringRefHasher = StringRefHash; - static size_t ALWAYS_INLINE operator()(StringKey8 key) - { - return StringKey8Hasher::operator()(key); - } - static size_t ALWAYS_INLINE operator()(const StringKey16 & key) - { - return StringKey16Hasher::operator()(key); - } - static size_t ALWAYS_INLINE operator()(const StringKey24 & key) - { - return StringKey24Hasher::operator()(key); - } + static size_t 
ALWAYS_INLINE operator()(StringKey8 key) { return StringKey8Hasher::operator()(key); } + static size_t ALWAYS_INLINE operator()(const StringKey16 & key) { return StringKey16Hasher::operator()(key); } + static size_t ALWAYS_INLINE operator()(const StringKey24 & key) { return StringKey24Hasher::operator()(key); } static size_t ALWAYS_INLINE operator()(const StringRef & key) { return StringRefHasher::operator()(key); } }; @@ -191,97 +182,106 @@ struct StringHashTableLookupResult friend bool operator!=(const std::nullptr_t &, const StringHashTableLookupResult & b) { return b.mapped_ptr; } }; - template - static auto +template +static auto #if defined(ADDRESS_SANITIZER) || defined(THREAD_SANITIZER) - NO_INLINE NO_SANITIZE_ADDRESS NO_SANITIZE_THREAD + NO_INLINE NO_SANITIZE_ADDRESS NO_SANITIZE_THREAD #else - ALWAYS_INLINE + ALWAYS_INLINE #endif - dispatchStringHashTable(size_t row, KeyHolder && key_holder, Func0 && func0, Func8 && func8, Func16 && func16, Func24 && func24, FuncStr && func_str) + dispatchStringHashTable( + size_t row, + KeyHolder && key_holder, + Func0 && func0, + Func8 && func8, + Func16 && func16, + Func24 && func24, + FuncStr && func_str) +{ + const StringRef & x = keyHolderGetKey(key_holder); + const size_t sz = x.size; + if (sz == 0) { - const StringRef & x = keyHolderGetKey(key_holder); - const size_t sz = x.size; - if (sz == 0) - { - return func0(x, row); - } + return func0(x, row); + } - if (x.data[sz - 1] == 0) - { - // Strings with trailing zeros are not representable as fixed-size - // string keys. Put them to the generic table. - return func_str(key_holder, row); - } + if (x.data[sz - 1] == 0) + { + // Strings with trailing zeros are not representable as fixed-size + // string keys. Put them to the generic table. 
+ return func_str(key_holder, row); + } - const char * p = x.data; - // pending bits that needs to be shifted out - const char s = (-sz & 7) * 8; - union - { - StringKey8 k8; - StringKey16 k16; - StringKey24 k24; - UInt64 n[3]; - }; - switch ((sz - 1) >> 3) - { - case 0: // 1..8 bytes - { - // first half page - if ((reinterpret_cast(p) & 2048) == 0) - { - memcpy(&n[0], p, 8); - if constexpr (DB::isLittleEndian()) - n[0] &= (-1ULL >> s); - else - n[0] &= (-1ULL << s); - } - else - { - const char * lp = x.data + x.size - 8; - memcpy(&n[0], lp, 8); - if constexpr (DB::isLittleEndian()) - n[0] >>= s; - else - n[0] <<= s; - } - return func8(k8, row); - } - case 1: // 9..16 bytes + const char * p = x.data; + // pending bits that needs to be shifted out + const char s = (-sz & 7) * 8; + union + { + StringKey8 k8; + StringKey16 k16; + StringKey24 k24; + UInt64 n[3]; + }; + switch ((sz - 1) >> 3) + { + case 0: // 1..8 bytes + { + // first half page + if ((reinterpret_cast(p) & 2048) == 0) { memcpy(&n[0], p, 8); - const char * lp = x.data + x.size - 8; - memcpy(&n[1], lp, 8); if constexpr (DB::isLittleEndian()) - n[1] >>= s; + n[0] &= (-1ULL >> s); else - n[1] <<= s; - return func16(k16, row); + n[0] &= (-1ULL << s); } - case 2: // 17..24 bytes + else { - memcpy(&n[0], p, 16); const char * lp = x.data + x.size - 8; - memcpy(&n[2], lp, 8); + memcpy(&n[0], lp, 8); if constexpr (DB::isLittleEndian()) - n[2] >>= s; + n[0] >>= s; else - n[2] <<= s; - return func24(k24, row); - } - default: // >= 25 bytes - { - return func_str(key_holder, row); - } + n[0] <<= s; } + return func8(k8, row); + } + case 1: // 9..16 bytes + { + memcpy(&n[0], p, 8); + const char * lp = x.data + x.size - 8; + memcpy(&n[1], lp, 8); + if constexpr (DB::isLittleEndian()) + n[1] >>= s; + else + n[1] <<= s; + return func16(k16, row); + } + case 2: // 17..24 bytes + { + memcpy(&n[0], p, 16); + const char * lp = x.data + x.size - 8; + memcpy(&n[2], lp, 8); + if constexpr (DB::isLittleEndian()) + n[2] >>= s; + 
else + n[2] <<= s; + return func24(k24, row); + } + default: // >= 25 bytes + { + return func_str(key_holder, row); } + } +} template class StringHashTable : private boost::noncopyable { protected: static constexpr size_t NUM_MAPS = 5; + using Self = StringHashTable; + // Map for storing empty string using T0 = typename SubMaps::T0; @@ -292,10 +292,11 @@ class StringHashTable : private boost::noncopyable // Long strings are stored as StringRef along with saved hash using Ts = typename SubMaps::Ts; - using Self = StringHashTable; template friend class TwoLevelStringHashTable; + template + friend struct StringHashTableSubMapSelector; T0 m0; T1 m1; @@ -592,13 +593,10 @@ struct StringHashTableSubMapSelector<0, false, Data> { struct Hash { - static ALWAYS_INLINE size_t operator()(const StringRef & ) { return 0; } + static ALWAYS_INLINE size_t operator()(const StringRef &) { return 0; } }; - typename Data::T0 & getSubMap(size_t, Data & data) - { - return data.m0; - } + static typename Data::T0 & getSubMap(size_t, Data & data) { return data.m0; } }; template @@ -606,10 +604,7 @@ struct StringHashTableSubMapSelector<1, false, Data> { using Hash = StringHashTableHash::StringKey8Hasher; - typename Data::T1 & getSubMap(size_t, Data & data) - { - return data.m1; - } + static typename Data::T1 & getSubMap(size_t, Data & data) { return data.m1; } }; template @@ -617,10 +612,7 @@ struct StringHashTableSubMapSelector<2, false, Data> { using Hash = StringHashTableHash::StringKey16Hasher; - typename Data::T2 & getSubMap(size_t, Data & data) - { - return data.m2; - } + static typename Data::T2 & getSubMap(size_t, Data & data) { return data.m2; } }; template @@ -628,10 +620,7 @@ struct StringHashTableSubMapSelector<3, false, Data> { using Hash = StringHashTableHash::StringKey24Hasher; - typename Data::T3 & getSubMap(size_t, Data & data) - { - return data.m3; - } + static typename Data::T3 & getSubMap(size_t, Data & data) { return data.m3; } }; template @@ -639,8 +628,5 @@ struct 
StringHashTableSubMapSelector<4, false, Data> { using Hash = StringHashTableHash::StringRefHasher; - typename Data::Ts & getSubMap(size_t, Data & data) - { - return data.ms; - } + static typename Data::Ts & getSubMap(size_t, Data & data) { return data.ms; } }; diff --git a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h index d217e0c0260..e7ea1bb8fce 100644 --- a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h @@ -310,10 +310,10 @@ struct StringHashTableSubMapSelector<0, true, Data> { struct Hash { - static ALWAYS_INLINE size_t operator()(const StringRef & ) { return 0; } + static ALWAYS_INLINE size_t operator()(const StringRef &) { return 0; } }; - typename Data::T0 & getSubMap(size_t hashval, Data & data) + static typename Data::Impl::T0 & getSubMap(size_t hashval, Data & data) { const auto bucket = Data::getBucketFromHash(hashval); return data.impls[bucket].m0; @@ -325,7 +325,7 @@ struct StringHashTableSubMapSelector<1, true, Data> { using Hash = StringHashTableHash::StringKey8Hasher; - typename Data::T1 & getSubMap(size_t hashval, Data & data) + static typename Data::Impl::T1 & getSubMap(size_t hashval, Data & data) { const auto bucket = Data::getBucketFromHash(hashval); return data.impls[bucket].m1; @@ -337,7 +337,7 @@ struct StringHashTableSubMapSelector<2, true, Data> { using Hash = StringHashTableHash::StringKey16Hasher; - typename Data::T2 & getSubMap(size_t hashval, Data & data) + static typename Data::Impl::T2 & getSubMap(size_t hashval, Data & data) { const auto bucket = Data::getBucketFromHash(hashval); return data.impls[bucket].m2; @@ -349,7 +349,7 @@ struct StringHashTableSubMapSelector<3, true, Data> { using Hash = StringHashTableHash::StringKey24Hasher; - typename Data::T3 & getSubMap(size_t hashval, Data & data) + static typename Data::Impl::T3 & getSubMap(size_t hashval, Data & data) { const auto bucket = 
Data::getBucketFromHash(hashval); return data.impls[bucket].m3; @@ -361,7 +361,7 @@ struct StringHashTableSubMapSelector<4, true, Data> { using Hash = StringHashTableHash::StringRefHasher; - typename Data::Ts & getSubMap(size_t hashval, Data & data) + static typename Data::Impl::Ts & getSubMap(size_t hashval, Data & data) { const auto bucket = Data::getBucketFromHash(hashval); return data.impls[bucket].ms; diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 54cf52c673d..4faec37ce9d 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -666,14 +666,37 @@ void NO_INLINE Aggregator::executeImpl( typename Method::State state(agg_process_info.key_columns, key_sizes, collators); if (method.data.getBufferSizeInCells() < 8192) - executeImplBatch(method, state, aggregates_pool, agg_process_info); + { + if constexpr (Method::Data::is_string_hash_map) + executeImplBatchStringHashMap( + method, + state, + aggregates_pool, + agg_process_info); + else + executeImplBatch(method, state, aggregates_pool, agg_process_info); + } else - executeImplBatch(method, state, aggregates_pool, agg_process_info); + { + if constexpr (Method::Data::is_string_hash_map) + executeImplBatchStringHashMap( + method, + state, + aggregates_pool, + agg_process_info); + else + executeImplBatch(method, state, aggregates_pool, agg_process_info); + } } template -std::vector getHashVals(size_t start_row, size_t end_row, const Data & data, const State & state, - std::vector & sort_key_containers, Arena * pool) +std::vector getHashVals( + size_t start_row, + size_t end_row, + const Data & data, + const State & state, + std::vector & sort_key_containers, + Arena * pool) { std::vector hashvals(state.total_rows, 0); for (size_t i = start_row; i < end_row; ++i) @@ -695,9 +718,15 @@ std::optional::Res try { if constexpr (only_lookup) - return state.template findKey(method.data, index, aggregates_pool, sort_key_containers, hashvals); 
+ return state + .template findKey(method.data, index, aggregates_pool, sort_key_containers, hashvals); else - return state.template emplaceKey(method.data, index, aggregates_pool, sort_key_containers, hashvals); + return state.template emplaceKey( + method.data, + index, + aggregates_pool, + sort_key_containers, + hashvals); } catch (ResizeException &) { @@ -707,19 +736,27 @@ std::optional::Res // StringKeyType can be StringRef/StringKey8/StringKey16/StringKey24/ArenaKeyHolder. // return true when resize exception happens. -template -bool Aggregator::emplaceOrFindStringKey( - typename Method::Data & data, - typename Method::State & state, - const std::vector & key_infos, - const std::vector & key_datas, - Arena & aggregates_pool, - std::vector & places, - AggProcessInfo & agg_process_info) const +template < + size_t SubMapIndex, + bool collect_hit_rate, + bool only_lookup, + bool enable_prefetch, + typename Data, + typename State, + typename StringKeyType> +size_t Aggregator::emplaceOrFindStringKey( + Data & data, + State & state, + const std::vector & key_infos, + std::vector & key_datas, // TODO const + Arena & aggregates_pool, + std::vector & places, + AggProcessInfo & agg_process_info) const { + static_assert(!(collect_hit_rate && only_lookup)); RUNTIME_CHECK(key_infos.size() == key_datas.size()); - using Hash = typename StringHashTableSubMapSelector>::Hash; + using Hash = typename StringHashTableSubMapSelector>::Hash; std::vector hashvals(key_infos.size(), 0); for (size_t i = 0; i < key_infos.size(); ++i) { @@ -733,7 +770,8 @@ bool Aggregator::emplaceOrFindStringKey( { if constexpr (only_lookup) { - auto find_result = state.template findStringKey(data, i, key_datas, hashvals); + auto find_result + = state.template findStringKey(data, i, key_datas, hashvals); if (find_result.isFound()) { agg_state = find_result.getMapped(); @@ -745,7 +783,8 @@ bool Aggregator::emplaceOrFindStringKey( } else { - auto emplace_result = state.template emplaceStringKey(data, i, 
key_datas, hashvals); + auto emplace_result + = state.template emplaceStringKey(data, i, key_datas, hashvals); if (emplace_result.isInserted()) { emplace_result.setMapped(nullptr); @@ -758,18 +797,19 @@ bool Aggregator::emplaceOrFindStringKey( else { agg_state = emplace_result.getMapped(); + + if constexpr (collect_hit_rate) + ++agg_process_info.hit_row_cnt; } - places.push_back(agg_state); + places[i] = agg_state; } } catch (ResizeException &) { - // agg_process_info.set - // TODO handle exception - return true; + return i; } } - return false; + return key_infos.size(); } template @@ -799,13 +839,12 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( if constexpr (enable_prefetch) { hashvals = getHashVals( - agg_process_info.start_row, - agg_process_info.end_row, - method.data, - state, - sort_key_containers, - aggregates_pool); - + agg_process_info.start_row, + agg_process_info.end_row, + method.data, + state, + sort_key_containers, + aggregates_pool); } for (size_t i = 0; i < rows; ++i) @@ -884,20 +923,25 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( if constexpr (enable_prefetch) { hashvals = getHashVals( - agg_process_info.start_row, - agg_process_info.end_row, - method.data, - state, - sort_key_containers, - aggregates_pool); + agg_process_info.start_row, + agg_process_info.end_row, + method.data, + state, + sort_key_containers, + aggregates_pool); } for (size_t i = agg_process_info.start_row; i < agg_process_info.start_row + rows; ++i) { AggregateDataPtr aggregate_data = nullptr; - auto emplace_result_holder - = emplaceOrFindKey(method, state, i, *aggregates_pool, sort_key_containers, hashvals); + auto emplace_result_holder = emplaceOrFindKey( + method, + state, + i, + *aggregates_pool, + sort_key_containers, + hashvals); if unlikely (!emplace_result_holder.has_value()) { LOG_INFO(log, "HashTable resize throw ResizeException since the data is already marked for spill"); @@ -961,129 +1005,174 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( } } 
+#define M(SUBMAPINDEX) \ + template \ + void setupExceptionRecoveryInfoForStringHashTable( \ + Aggregator::AggProcessInfo & agg_process_info, \ + size_t row, \ + const std::vector & key_infos, \ + const std::vector & key_datas, \ + std::integral_constant) \ + { \ + agg_process_info.submap_m##SUBMAPINDEX##_infos \ + = std::vector(key_infos.begin() + row, key_infos.end()); \ + agg_process_info.submap_m##SUBMAPINDEX##_datas \ + = std::vector(key_datas.begin() + row, key_datas.end()); \ + } + +M(0) +M(1) +M(2) +M(3) +M(4) + +#undef M + // Emplace key into StringHashMap/TwoLevelStringHashMap is seperated from other situations, // because it's easy to implement prefetch submap directly. -// TODO not support resize execption template ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( - Method & method, - typename Method::State & state, - Arena * aggregates_pool, - AggProcessInfo & agg_process_info) const + Method & method, + typename Method::State & state, + Arena * aggregates_pool, + AggProcessInfo & agg_process_info) const { // collect_hit_rate and only_lookup cannot be true at the same time. 
static_assert(!(collect_hit_rate && only_lookup)); - static_assert(Method::Data::isStringHashMap); + static_assert(Method::Data::is_string_hash_map); + +#define M(SUBMAPINDEX) \ + RUNTIME_CHECK( \ + agg_process_info.submap_m##SUBMAPINDEX##_infos.size() \ + == agg_process_info.submap_m##SUBMAPINDEX##_datas.size()); + + M(0) + M(1) + M(2) + M(3) + M(4) +#undef M std::vector sort_key_containers; sort_key_containers.resize(params.keys_size, ""); - const size_t rows = agg_process_info.end_row = agg_process_info.start_row; - RUNTIME_CHECK_MSG(rows == state.total_rows, "executeImplBatchStringHashMap only handle resize exception for each Block instead of row"); - const size_t reserve_size = rows / 4; - - std::vector key0_infos; - std::vector key0_datas; - key0_infos.reserve(reserve_size); - key0_datas.reserve(reserve_size); - - std::vector key8_infos; - std::vector key8_datas; - key8_infos.reserve(reserve_size); - key8_datas.reserve(reserve_size); - - std::vector key16_infos; - std::vector key16_datas; - key16_infos.reserve(reserve_size); - key16_datas.reserve(reserve_size); - - std::vector key24_infos; - std::vector key24_datas; - key24_infos.reserve(reserve_size); - key24_datas.reserve(reserve_size); - - std::vector key_str_infos; - std::vector key_str_datas; - key_str_infos.reserve(reserve_size); - key_str_datas.reserve(reserve_size); - - auto dispatch_callback_key0 = [&key0_infos, &key0_datas](const StringRef & key, size_t row) { - key0_infos.push_back(row); - key0_datas.push_back(key); - }; - auto dispatch_callback_key8 = [&key8_infos, &key8_datas](const StringKey8 & key, size_t row) { - key8_infos.push_back(row); - key8_datas.push_back(key); - }; - auto dispatch_callback_key16 = [&key16_infos, &key16_datas](const StringKey16 & key, size_t row) { - key16_infos.push_back(row); - key16_datas.push_back(key); - }; - auto dispatch_callback_key24 = [&key24_infos, &key24_datas](const StringKey24 & key, size_t row) { - key24_infos.push_back(row); - key24_datas.push_back(key); 
- }; - // Argument type is ArenaKeyHolder instead of StringRef, - // because it will only be persisted when insert into HashTable. - auto dispatch_callback_key_str = [&key_str_infos, &key_str_datas](const ArenaKeyHolder & key, size_t row) { - key_str_infos.push_back(row); - key_str_datas.push_back(key); - }; - for (size_t i = 0; i < rows; ++i) +#define M(INFO, DATA, KEYTYPE) \ + std::vector(INFO); \ + std::vector(DATA); + + M(key0_infos, key0_datas, StringRef) + M(key8_infos, key8_datas, StringKey8) + M(key16_infos, key16_datas, StringKey16) + M(key24_infos, key24_datas, StringKey24) + M(key_str_infos, key_str_datas, ArenaKeyHolder) +#undef M + + const size_t rows = agg_process_info.end_row - agg_process_info.start_row; + + if likely (agg_process_info.allBlockDataHandled()) { - auto key_holder = state.getKeyHolder(i, aggregates_pool, sort_key_containers); - dispatchStringHashTable(key_holder, + // No resize exception happens, so this is a new Block. + RUNTIME_CHECK(agg_process_info.start_row == 0); + RUNTIME_CHECK_MSG( + rows == state.total_rows, + "executeImplBatchStringHashMap only handle resize exception for each Block instead of row"); + const size_t reserve_size = rows / 4; + +#define M(INFO, DATA, SUBMAPINDEX, KEYTYPE) \ + (INFO).reserve(reserve_size); \ + (DATA).reserve(reserve_size); \ + auto dispatch_callback_key##SUBMAPINDEX = [&INFO, &DATA](const KEYTYPE & key, size_t row) { \ + (INFO).push_back(row); \ + (DATA).push_back(key); \ + }; + + M(key0_infos, key0_datas, 0, StringRef) + M(key8_infos, key8_datas, 8, StringKey8) + M(key16_infos, key16_datas, 16, StringKey16) + M(key24_infos, key24_datas, 24, StringKey24) + // Argument type is ArenaKeyHolder instead of StringRef, + // because it will only be persisted when insert into HashTable. 
+ M(key_str_infos, key_str_datas, str, ArenaKeyHolder) +#undef M + + for (size_t i = 0; i < rows; ++i) + { + auto key_holder = state.getKeyHolder(i, aggregates_pool, sort_key_containers); + dispatchStringHashTable( + i, + key_holder, dispatch_callback_key0, dispatch_callback_key8, dispatch_callback_key16, dispatch_callback_key24, - dispatch_callback_key_str); + dispatch_callback_keystr); + } } - - std::vector key0_places; - key0_places.reserve(key0_infos.size()); - - std::vector key8_places; - key8_places.reserve(key8_infos.size()); - - std::vector key16_places; - key16_places.reserve(key16_infos.size()); - - std::vector key24_places; - key24_places.reserve(key24_infos.size()); - - std::vector key_str_places; - key_str_places.reserve(key_str_infos.size()); - - if (!key0_infos.empty()) + else { - emplaceOrFindStringKey<0, false>(method.data, state, key0_infos, key0_datas, aggregates_pool, key0_places, agg_process_info); - } +#define M(INFO, DATA, SUBMAPINDEX) \ + (INFO) = agg_process_info.submap_m##SUBMAPINDEX##_infos; \ + (DATA) = agg_process_info.submap_m##SUBMAPINDEX##_datas; -#define M(INDEX, INFO, DATA, PLACES) \ - if (!(INFO).empty()) \ - { \ - if constexpr (enable_prefetch) \ - emplaceOrFindStringKey(method.data, state, INFO, DATA, aggregates_pool, PLACES, agg_process_info); \ - else \ - emplaceOrFindStringKey(method.data, state, INFO, DATA, aggregates_pool, PLACES, agg_process_info); \ + M(key0_infos, key0_datas, 0) + M(key8_infos, key8_datas, 1) + M(key16_infos, key16_datas, 2) + M(key24_infos, key24_datas, 3) + M(key_str_infos, key_str_datas, 4) +#undef M } + std::vector key0_places(key0_infos.size(), nullptr); + std::vector key8_places(key8_infos.size(), nullptr); + std::vector key16_places(key16_infos.size(), nullptr); + std::vector key24_places(key24_infos.size(), nullptr); + std::vector key_str_places(key_str_infos.size(), nullptr); + + bool got_resize_exception = false; + size_t emplaced_index = 0; + +#define M(INDEX, INFO, DATA, PLACES) \ + if 
unlikely (got_resize_exception) \ + { \ + emplaced_index = 0; \ + } \ + else if (!(INFO).empty()) \ + { \ + emplaced_index = emplaceOrFindStringKey( \ + method.data, \ + state, \ + (INFO), \ + (DATA), \ + *aggregates_pool, \ + (PLACES), \ + agg_process_info); \ + if unlikely (emplaced_index != (INFO).size()) \ + got_resize_exception = true; \ + } \ + setupExceptionRecoveryInfoForStringHashTable( \ + agg_process_info, \ + emplaced_index, \ + INFO, \ + DATA, \ + std::integral_constant{}); + + M(0, key0_infos, key0_datas, key0_places) M(1, key8_infos, key8_datas, key8_places) M(2, key16_infos, key16_datas, key16_places) M(3, key24_infos, key24_datas, key24_places) M(4, key_str_infos, key_str_datas, key_str_places) #undef M - RUNTIME_CHECK(rows == key0_places.size() + key8_places.size() + key16_places.size() + key24_places.size() + key_str_places.size()); + RUNTIME_CHECK( + rows + == key0_places.size() + key8_places.size() + key16_places.size() + key24_places.size() + key_str_places.size()); std::vector places(rows, nullptr); - -#define M(INFO, PLACES) \ +#define M(INFO, PLACES) \ for (size_t i = 0; i < (INFO).size(); ++i) \ - { \ - const auto row = (INFO)[i]; \ - places[row] = (PLACES)[i]; \ + { \ + const auto row = (INFO)[i]; \ + places[row] = (PLACES)[i]; \ } M(key0_infos, key0_places) @@ -1093,7 +1182,6 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( M(key_str_infos, key_str_places) #undef M - for (AggregateFunctionInstruction * inst = agg_process_info.aggregate_functions_instructions.data(); inst->that; ++inst) { @@ -1105,7 +1193,8 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( inst->batch_arguments, aggregates_pool); } - agg_process_info.start_row = rows; + // For StringHashTable, start_row is meanless, instead submap_mx_infos/submap_mx_datas are used. + agg_process_info.start_row = got_resize_exception ? 
0 : rows; } void NO_INLINE @@ -1130,7 +1219,6 @@ Aggregator::executeWithoutKeyImpl(AggregatedDataWithoutKey & res, AggProcessInfo agg_process_info.start_row += agg_size; } - void Aggregator::prepareAggregateInstructions( Columns columns, AggregateColumns & aggregate_columns, diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index 6cbbde71b41..7142077c8ea 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -1319,11 +1319,28 @@ class Aggregator size_t hit_row_cnt = 0; std::vector not_found_rows; + // For StringHashTable resize exception. + std::vector submap_m0_infos{}; + std::vector submap_m1_infos{}; + std::vector submap_m2_infos{}; + std::vector submap_m3_infos{}; + std::vector submap_m4_infos{}; + + std::vector submap_m0_datas{}; + std::vector submap_m1_datas{}; + std::vector submap_m2_datas{}; + std::vector submap_m3_datas{}; + std::vector submap_m4_datas{}; + void prepareForAgg(); bool allBlockDataHandled() const { assert(start_row <= end_row); - return start_row == end_row || aggregator->isCancelled(); + // submap_mx_infos.size() and submap_mx_datas.size() are always equal. + // So only need to check submap_m0_infos is enough. 
+ return (start_row == end_row && !submap_m0_infos.empty() && !submap_m1_infos.empty() + && !submap_m3_infos.empty() && !submap_m4_infos.empty()) + || aggregator->isCancelled(); } void resetBlock(const Block & block_) { @@ -1463,10 +1480,10 @@ class Aggregator template void executeImplBatchStringHashMap( - Method & method, - typename Method::State & state, - Arena * aggregates_pool, - AggProcessInfo & agg_process_info) const; + Method & method, + typename Method::State & state, + Arena * aggregates_pool, + AggProcessInfo & agg_process_info) const; template std::optional::ResultType> emplaceOrFindKey( @@ -1477,15 +1494,22 @@ class Aggregator std::vector & sort_key_containers, const std::vector & hashvals) const; - template - bool emplaceOrFindStringKey( - typename Method::Data & data, - typename Method::State & state, - const std::vector & key_infos, - const std::vector & key_datas, - Arena & aggregates_pool, - std::vector & places, - AggProcessInfo & agg_process_info) const; + template < + size_t SubMapIndex, + bool collect_hit_rate, + bool only_lookup, + bool enable_prefetch, + typename Data, + typename State, + typename StringKeyType> + size_t emplaceOrFindStringKey( + Data & data, + State & state, + const std::vector & key_infos, + std::vector & key_datas, + Arena & aggregates_pool, + std::vector & places, + AggProcessInfo & agg_process_info) const; /// For case when there are no keys (all aggregate into one row). 
static void executeWithoutKeyImpl(AggregatedDataWithoutKey & res, AggProcessInfo & agg_process_info, Arena * arena); From 9dc702dac5e4d8baba90e127607d5e015562fac3 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Wed, 27 Nov 2024 22:37:37 +0800 Subject: [PATCH 05/24] tmp save Signed-off-by: guo-shaoge --- dbms/src/Common/Arena.h | 5 ++ dbms/src/Common/ColumnsHashing.h | 80 +++++++++++++++++++------ dbms/src/Interpreters/Aggregator.cpp | 88 ++++++++++++++++++---------- dbms/src/Interpreters/Aggregator.h | 1 + 4 files changed, 126 insertions(+), 48 deletions(-) diff --git a/dbms/src/Common/Arena.h b/dbms/src/Common/Arena.h index b9999f6b179..eb86e1c283c 100644 --- a/dbms/src/Common/Arena.h +++ b/dbms/src/Common/Arena.h @@ -212,5 +212,10 @@ class Arena : private boost::noncopyable using ArenaPtr = std::shared_ptr; using Arenas = std::vector; +size_t alignOf16(size_t l) +{ + return (l + 15) & ~15; +} + } // namespace DB diff --git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index aabe0733f8c..0d8e8d60ef7 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -52,7 +52,7 @@ struct HashMethodOneNumber const size_t total_rows; /// If the keys of a fixed length then key_sizes contains their lengths, empty otherwise. 
- HashMethodOneNumber(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators &) + HashMethodOneNumber(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators &, Arena *) : total_rows(key_columns[0]->size()) { vec = &static_cast *>(key_columns[0])->getData()[0]; @@ -107,7 +107,8 @@ struct HashMethodString HashMethodString( const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, - const TiDB::TiDBCollators & collators) + const TiDB::TiDBCollators & collators, + Arena *) : total_rows(key_columns[0]->size()) { const IColumn & column = *key_columns[0]; @@ -158,7 +159,7 @@ struct HashMethodStringBin const UInt8 * chars; const size_t total_rows; - HashMethodStringBin(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators &) + HashMethodStringBin(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators &, Arena *) : total_rows(key_columns[0]->size()) { const IColumn & column = *key_columns[0]; @@ -344,6 +345,43 @@ struct KeyDescStringBinPadding : KeyDescStringBin } }; +void serializeColumnToBuffer(Arena * pool, + const ColumnRawPtrs & key_columns, + PaddedPODArray & pos, + PaddedPODArray & sizes) +{ + RUNTIME_CHECK(!key_columns.empty()); + RUNTIME_CHECK(pos.empty() && sizes.empty()); + + const auto rows = key_columns[0]->size(); + pos.resize(rows, nullptr); + sizes.resize(rows, 0); + + for (const auto * col_ptr : key_columns) + col_ptr->countSerializeByteSize(sizes); + + std::vector aligned_sizes; + aligned_sizes.reserve(sizes.size()); + + size_t total_byte_size = 0; + for (auto size : sizes) + { + auto aligned = alignOf16(size); + total_byte_size += aligned; + aligned_sizes.push_back(aligned); + } + + auto * buffer = pool->alloc(total_byte_size); + for (size_t i = 0; i < aligned_sizes.size(); ++i) + { + pos[i] = buffer; + buffer += aligned_sizes[i]; + } + + for (const auto * col_ptr : key_columns) + col_ptr->serializeToPos(pos, 
0, rows, col_ptr->isColumnNullable()); +} + /// For the case when there are 2 keys. template struct HashMethodFastPathTwoKeysSerialized @@ -356,12 +394,16 @@ struct HashMethodFastPathTwoKeysSerialized Key1Desc key_1_desc; Key2Desc key_2_desc; const size_t total_rows; + PaddedPODArray pos; + PaddedPODArray sizes; - HashMethodFastPathTwoKeysSerialized(const ColumnRawPtrs & key_columns, const Sizes &, const TiDB::TiDBCollators &) + HashMethodFastPathTwoKeysSerialized(const ColumnRawPtrs & key_columns, const Sizes &, const TiDB::TiDBCollators &, Arena * pool) : key_1_desc(key_columns[0]) , key_2_desc(key_columns[1]) , total_rows(key_columns[0]->size()) - {} + { + serializeColumnToBuffer(pool, key_columns, pos, sizes); + } ALWAYS_INLINE inline auto getKeyHolder(ssize_t row, Arena * pool, std::vector &) const { @@ -400,7 +442,8 @@ struct HashMethodFixedString HashMethodFixedString( const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, - const TiDB::TiDBCollators & collators) + const TiDB::TiDBCollators & collators, + Arena *) : total_rows(key_columns[0]->size()) { const IColumn & column = *key_columns[0]; @@ -477,7 +520,7 @@ struct HashMethodKeysFixed return true; } - HashMethodKeysFixed(const ColumnRawPtrs & key_columns, const Sizes & key_sizes_, const TiDB::TiDBCollators &) + HashMethodKeysFixed(const ColumnRawPtrs & key_columns, const Sizes & key_sizes_, const TiDB::TiDBCollators &, Arena *) : Base(key_columns) , key_sizes(std::move(key_sizes_)) , keys_size(key_columns.size()) @@ -612,25 +655,28 @@ struct HashMethodSerialized size_t keys_size; TiDB::TiDBCollators collators; const size_t total_rows; + PaddedPODArray pos; + PaddedPODArray sizes; HashMethodSerialized( const ColumnRawPtrs & key_columns_, const Sizes & /*key_sizes*/, - const TiDB::TiDBCollators & collators_) + const TiDB::TiDBCollators & collators_, + Arena * pool) : key_columns(key_columns_) , keys_size(key_columns_.size()) , collators(collators_) , total_rows(key_columns_[0]->size()) - {} + { + 
serializeColumnToBuffer(pool, key_columns_, pos, sizes); + } - ALWAYS_INLINE inline SerializedKeyHolder getKeyHolder( - size_t row, - Arena * pool, - std::vector & sort_key_containers) const + ALWAYS_INLINE inline StringRef getKeyHolder( + size_t row, + Arena *, + std::vector &) const { - return SerializedKeyHolder{ - serializeKeysToPoolContiguous(row, keys_size, key_columns, collators, sort_key_containers, *pool), - *pool}; + return StringRef(pos[row], sizes[row]); } protected: @@ -650,7 +696,7 @@ struct HashMethodHashed TiDB::TiDBCollators collators; const size_t total_rows; - HashMethodHashed(ColumnRawPtrs key_columns_, const Sizes &, const TiDB::TiDBCollators & collators_) + HashMethodHashed(ColumnRawPtrs key_columns_, const Sizes &, const TiDB::TiDBCollators & collators_, Arena *) : key_columns(std::move(key_columns_)) , collators(collators_) , total_rows(key_columns[0]->size()) diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 4faec37ce9d..1738121b665 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -741,6 +741,7 @@ template < bool collect_hit_rate, bool only_lookup, bool enable_prefetch, + bool zero_agg_func_size, typename Data, typename State, typename StringKeyType> @@ -763,7 +764,8 @@ size_t Aggregator::emplaceOrFindStringKey( hashvals[i] = Hash::operator()(keyHolderGetKey(key_datas[0])); } - AggregateDataPtr agg_state = nullptr; + // alloc 0 bytes is useful when agg func size is zero. 
+ AggregateDataPtr agg_state = aggregates_pool.alloc(0); for (size_t i = 0; i < key_infos.size(); ++i) { try @@ -787,21 +789,31 @@ size_t Aggregator::emplaceOrFindStringKey( = state.template emplaceStringKey(data, i, key_datas, hashvals); if (emplace_result.isInserted()) { - emplace_result.setMapped(nullptr); + if constexpr (zero_agg_func_size) + { + emplace_result.setMapped(agg_state); + } + else + { + emplace_result.setMapped(nullptr); - agg_state = aggregates_pool.alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(agg_state); + agg_state + = aggregates_pool.alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(agg_state); - emplace_result.setMapped(agg_state); + emplace_result.setMapped(agg_state); + } } else { - agg_state = emplace_result.getMapped(); + if constexpr (!zero_agg_func_size) + agg_state = emplace_result.getMapped(); if constexpr (collect_hit_rate) ++agg_process_info.hit_row_cnt; } - places[i] = agg_state; + if constexpr (!zero_agg_func_size) + places[i] = agg_state; } } catch (ResizeException &) @@ -1130,30 +1142,41 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( bool got_resize_exception = false; size_t emplaced_index = 0; - -#define M(INDEX, INFO, DATA, PLACES) \ - if unlikely (got_resize_exception) \ - { \ - emplaced_index = 0; \ - } \ - else if (!(INFO).empty()) \ - { \ - emplaced_index = emplaceOrFindStringKey( \ - method.data, \ - state, \ - (INFO), \ - (DATA), \ - *aggregates_pool, \ - (PLACES), \ - agg_process_info); \ - if unlikely (emplaced_index != (INFO).size()) \ - got_resize_exception = true; \ - } \ - setupExceptionRecoveryInfoForStringHashTable( \ - agg_process_info, \ - emplaced_index, \ - INFO, \ - DATA, \ + bool zero_agg_func_size = (params.aggregates_size == 0); + +#define M(INDEX, INFO, DATA, PLACES) \ + if unlikely (got_resize_exception) \ + { \ + emplaced_index = 0; \ + } \ + else if (!(INFO).empty()) \ + { \ + if 
(zero_agg_func_size) \ + emplaced_index = emplaceOrFindStringKey( \ + method.data, \ + state, \ + (INFO), \ + (DATA), \ + *aggregates_pool, \ + (PLACES), \ + agg_process_info); \ + else \ + emplaced_index = emplaceOrFindStringKey( \ + method.data, \ + state, \ + (INFO), \ + (DATA), \ + *aggregates_pool, \ + (PLACES), \ + agg_process_info); \ + if unlikely (emplaced_index != (INFO).size()) \ + got_resize_exception = true; \ + } \ + setupExceptionRecoveryInfoForStringHashTable( \ + agg_process_info, \ + emplaced_index, \ + INFO, \ + DATA, \ std::integral_constant{}); M(0, key0_infos, key0_datas, key0_places) @@ -1163,6 +1186,9 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( M(4, key_str_infos, key_str_datas, key_str_places) #undef M + if (zero_agg_func_size) + return; + RUNTIME_CHECK( rows == key0_places.size() + key8_places.size() + key16_places.size() + key24_places.size() + key_str_places.size()); diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index 7142077c8ea..cc2dcd2a408 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -1499,6 +1499,7 @@ class Aggregator bool collect_hit_rate, bool only_lookup, bool enable_prefetch, + bool zero_agg_func_size, typename Data, typename State, typename StringKeyType> From ce1f76754e7a454d380c98c9330273481d170556 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Thu, 28 Nov 2024 16:29:05 +0800 Subject: [PATCH 06/24] revert Serialized Key changes Signed-off-by: guo-shaoge --- dbms/src/Common/Arena.h | 5 -- dbms/src/Common/ColumnsHashing.h | 80 +++++++------------------------- 2 files changed, 17 insertions(+), 68 deletions(-) diff --git a/dbms/src/Common/Arena.h b/dbms/src/Common/Arena.h index eb86e1c283c..b9999f6b179 100644 --- a/dbms/src/Common/Arena.h +++ b/dbms/src/Common/Arena.h @@ -212,10 +212,5 @@ class Arena : private boost::noncopyable using ArenaPtr = std::shared_ptr; using Arenas = std::vector; -size_t alignOf16(size_t l) -{ - 
return (l + 15) & ~15; -} - } // namespace DB diff --git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index 0d8e8d60ef7..aabe0733f8c 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -52,7 +52,7 @@ struct HashMethodOneNumber const size_t total_rows; /// If the keys of a fixed length then key_sizes contains their lengths, empty otherwise. - HashMethodOneNumber(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators &, Arena *) + HashMethodOneNumber(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators &) : total_rows(key_columns[0]->size()) { vec = &static_cast *>(key_columns[0])->getData()[0]; @@ -107,8 +107,7 @@ struct HashMethodString HashMethodString( const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, - const TiDB::TiDBCollators & collators, - Arena *) + const TiDB::TiDBCollators & collators) : total_rows(key_columns[0]->size()) { const IColumn & column = *key_columns[0]; @@ -159,7 +158,7 @@ struct HashMethodStringBin const UInt8 * chars; const size_t total_rows; - HashMethodStringBin(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators &, Arena *) + HashMethodStringBin(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators &) : total_rows(key_columns[0]->size()) { const IColumn & column = *key_columns[0]; @@ -345,43 +344,6 @@ struct KeyDescStringBinPadding : KeyDescStringBin } }; -void serializeColumnToBuffer(Arena * pool, - const ColumnRawPtrs & key_columns, - PaddedPODArray & pos, - PaddedPODArray & sizes) -{ - RUNTIME_CHECK(!key_columns.empty()); - RUNTIME_CHECK(pos.empty() && sizes.empty()); - - const auto rows = key_columns[0]->size(); - pos.resize(rows, nullptr); - sizes.resize(rows, 0); - - for (const auto * col_ptr : key_columns) - col_ptr->countSerializeByteSize(sizes); - - std::vector aligned_sizes; - aligned_sizes.reserve(sizes.size()); 
- - size_t total_byte_size = 0; - for (auto size : sizes) - { - auto aligned = alignOf16(size); - total_byte_size += aligned; - aligned_sizes.push_back(aligned); - } - - auto * buffer = pool->alloc(total_byte_size); - for (size_t i = 0; i < aligned_sizes.size(); ++i) - { - pos[i] = buffer; - buffer += aligned_sizes[i]; - } - - for (const auto * col_ptr : key_columns) - col_ptr->serializeToPos(pos, 0, rows, col_ptr->isColumnNullable()); -} - /// For the case when there are 2 keys. template struct HashMethodFastPathTwoKeysSerialized @@ -394,16 +356,12 @@ struct HashMethodFastPathTwoKeysSerialized Key1Desc key_1_desc; Key2Desc key_2_desc; const size_t total_rows; - PaddedPODArray pos; - PaddedPODArray sizes; - HashMethodFastPathTwoKeysSerialized(const ColumnRawPtrs & key_columns, const Sizes &, const TiDB::TiDBCollators &, Arena * pool) + HashMethodFastPathTwoKeysSerialized(const ColumnRawPtrs & key_columns, const Sizes &, const TiDB::TiDBCollators &) : key_1_desc(key_columns[0]) , key_2_desc(key_columns[1]) , total_rows(key_columns[0]->size()) - { - serializeColumnToBuffer(pool, key_columns, pos, sizes); - } + {} ALWAYS_INLINE inline auto getKeyHolder(ssize_t row, Arena * pool, std::vector &) const { @@ -442,8 +400,7 @@ struct HashMethodFixedString HashMethodFixedString( const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, - const TiDB::TiDBCollators & collators, - Arena *) + const TiDB::TiDBCollators & collators) : total_rows(key_columns[0]->size()) { const IColumn & column = *key_columns[0]; @@ -520,7 +477,7 @@ struct HashMethodKeysFixed return true; } - HashMethodKeysFixed(const ColumnRawPtrs & key_columns, const Sizes & key_sizes_, const TiDB::TiDBCollators &, Arena *) + HashMethodKeysFixed(const ColumnRawPtrs & key_columns, const Sizes & key_sizes_, const TiDB::TiDBCollators &) : Base(key_columns) , key_sizes(std::move(key_sizes_)) , keys_size(key_columns.size()) @@ -655,28 +612,25 @@ struct HashMethodSerialized size_t keys_size; TiDB::TiDBCollators 
collators; const size_t total_rows; - PaddedPODArray pos; - PaddedPODArray sizes; HashMethodSerialized( const ColumnRawPtrs & key_columns_, const Sizes & /*key_sizes*/, - const TiDB::TiDBCollators & collators_, - Arena * pool) + const TiDB::TiDBCollators & collators_) : key_columns(key_columns_) , keys_size(key_columns_.size()) , collators(collators_) , total_rows(key_columns_[0]->size()) - { - serializeColumnToBuffer(pool, key_columns_, pos, sizes); - } + {} - ALWAYS_INLINE inline StringRef getKeyHolder( - size_t row, - Arena *, - std::vector &) const + ALWAYS_INLINE inline SerializedKeyHolder getKeyHolder( + size_t row, + Arena * pool, + std::vector & sort_key_containers) const { - return StringRef(pos[row], sizes[row]); + return SerializedKeyHolder{ + serializeKeysToPoolContiguous(row, keys_size, key_columns, collators, sort_key_containers, *pool), + *pool}; } protected: @@ -696,7 +650,7 @@ struct HashMethodHashed TiDB::TiDBCollators collators; const size_t total_rows; - HashMethodHashed(ColumnRawPtrs key_columns_, const Sizes &, const TiDB::TiDBCollators & collators_, Arena *) + HashMethodHashed(ColumnRawPtrs key_columns_, const Sizes &, const TiDB::TiDBCollators & collators_) : key_columns(std::move(key_columns_)) , collators(collators_) , total_rows(key_columns[0]->size()) From 8ac8bebe1280ba49a570fb20cef9da78c4d0454f Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Thu, 28 Nov 2024 17:50:30 +0800 Subject: [PATCH 07/24] refine Signed-off-by: guo-shaoge --- dbms/src/Common/ColumnsHashingImpl.h | 38 ++++++++-------- dbms/src/Common/HashTable/StringHashTable.h | 44 +++---------------- .../HashTable/TwoLevelStringHashTable.h | 31 +------------ dbms/src/Interpreters/Aggregator.h | 9 ++-- 4 files changed, 31 insertions(+), 91 deletions(-) diff --git a/dbms/src/Common/ColumnsHashingImpl.h b/dbms/src/Common/ColumnsHashingImpl.h index aa583f1a722..ffaffdcd758 100644 --- a/dbms/src/Common/ColumnsHashingImpl.h +++ b/dbms/src/Common/ColumnsHashingImpl.h @@ -130,6 +130,14 
@@ class HashMethodBase using Cache = LastElementCache; static constexpr size_t prefetch_step = 16; + template + static ALWAYS_INLINE inline void prefetch(Map & map, size_t idx, const std::vector & hashvals) + { + const auto prefetch_idx = idx + prefetch_step; + if likely (prefetch_idx < hashvals.size()) + map.prefetch(hashvals[prefetch_idx]); + } + template ALWAYS_INLINE inline EmplaceResult emplaceKey( Data & data, @@ -141,10 +149,8 @@ class HashMethodBase auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); if constexpr (enable_prefetch) { - const auto idx = row + prefetch_step; - if (idx < hashvals.size()) - data.prefetch(hashvals[idx]); - + assert(hashvals.size() == static_cast(*this).total_rows); + prefetch(data, row, hashvals); return emplaceImpl(key_holder, data, hashvals[row]); } else @@ -164,10 +170,8 @@ class HashMethodBase auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); if constexpr (enable_prefetch) { - const auto idx = row + prefetch_step; - if (idx < hashvals.size()) - data.prefetch(hashvals[idx]); - + assert(hashvals.size() == static_cast(*this).total_rows); + prefetch(data, row, hashvals); return findKeyImpl(keyHolderGetKey(key_holder), data, hashvals[row]); } else @@ -176,7 +180,6 @@ class HashMethodBase } } - // TODO emplaceStringKey merge with emplaceKey? 
template ALWAYS_INLINE inline EmplaceResult emplaceStringKey( Data & data, @@ -184,20 +187,17 @@ class HashMethodBase std::vector & datas, // TODO const const std::vector & hashvals) { + assert(hashvals.size() == static_cast(*this).total_rows); + auto & submap = StringHashTableSubMapSelector>::getSubMap( hashvals[idx], data); if constexpr (enable_prefetch) - { - const auto prefetch_idx = idx + prefetch_step; - if (prefetch_idx < hashvals.size()) - submap.prefetch(hashvals[prefetch_idx]); - } + prefetch(submap, idx, hashvals); return emplaceImpl(datas[idx], submap, hashvals[idx]); } - // TODO Macro with emplaceStringKey template ALWAYS_INLINE inline FindResult findStringKey( Data & data, @@ -205,15 +205,13 @@ class HashMethodBase std::vector & datas, // TODO const const std::vector & hashvals) { + assert(hashvals.size() == static_cast(*this).total_rows); + auto & submap = StringHashTableSubMapSelector>::getSubMap( hashvals[idx], data); if constexpr (enable_prefetch) - { - const auto prefetch_idx = idx + prefetch_step; - if (prefetch_idx < hashvals.size()) - submap.prefetch(hashvals[prefetch_idx]); - } + prefetch(submap, idx, hashvals); return findKeyImpl(keyHolderGetKey(datas[idx]), submap, hashvals[idx]); } diff --git a/dbms/src/Common/HashTable/StringHashTable.h b/dbms/src/Common/HashTable/StringHashTable.h index a511ce47671..ef668864120 100644 --- a/dbms/src/Common/HashTable/StringHashTable.h +++ b/dbms/src/Common/HashTable/StringHashTable.h @@ -139,8 +139,7 @@ struct StringHashTableEmpty //-V730 return hasZero() ? 
zeroValue() : nullptr; } - void ALWAYS_INLINE prefetch(size_t) {} - + ALWAYS_INLINE inline void prefetch() {} void write(DB::WriteBuffer & wb) const { zeroValue()->write(wb); } void writeText(DB::WriteBuffer & wb) const { zeroValue()->writeText(wb); } void read(DB::ReadBuffer & rb) { zeroValue()->read(rb); } @@ -348,7 +347,6 @@ class StringHashTable : private boost::noncopyable #endif dispatch(Self & self, KeyHolder && key_holder, Func && func) { - StringHashTableHash hash; const StringRef & x = keyHolderGetKey(key_holder); const size_t sz = x.size; if (sz == 0) @@ -361,7 +359,7 @@ class StringHashTable : private boost::noncopyable { // Strings with trailing zeros are not representable as fixed-size // string keys. Put them to the generic table. - return func(self.ms, std::forward(key_holder), hash(x)); + return func(self.ms, std::forward(key_holder), StringHashTableHash::operator()(x)); } const char * p = x.data; @@ -397,7 +395,7 @@ class StringHashTable : private boost::noncopyable n[0] <<= s; } keyHolderDiscardKey(key_holder); - return func(self.m1, k8, hash(k8)); + return func(self.m1, k8, StringHashTableHash::operator()(k8)); } case 1: // 9..16 bytes { @@ -409,7 +407,7 @@ class StringHashTable : private boost::noncopyable else n[1] <<= s; keyHolderDiscardKey(key_holder); - return func(self.m2, k16, hash(k16)); + return func(self.m2, k16, StringHashTableHash::operator()(k16)); } case 2: // 17..24 bytes { @@ -421,11 +419,11 @@ class StringHashTable : private boost::noncopyable else n[2] <<= s; keyHolderDiscardKey(key_holder); - return func(self.m3, k24, hash(k24)); + return func(self.m3, k24, StringHashTableHash::operator()(k24)); } default: // >= 25 bytes { - return func(self.ms, std::forward(key_holder), hash(x)); + return func(self.ms, std::forward(key_holder), StringHashTableHash::operator()(x)); } } } @@ -455,13 +453,6 @@ class StringHashTable : private boost::noncopyable this->dispatch(*this, key_holder, EmplaceCallable(it, inserted)); } - // TODO del - 
template - void ALWAYS_INLINE emplace(KeyHolder &&, LookupResult &, bool &, size_t) - { - RUNTIME_CHECK_MSG(false, "shouldn't reach here, you should use submap::emplace instead"); - } - struct FindCallable { // find() doesn't need any key memory management, so we don't work with @@ -478,35 +469,12 @@ class StringHashTable : private boost::noncopyable } }; - // We will not prefetch StringHashTable directly, instead caller should call specific submap's prefetch. - // Because StringHashTable doesn't know which submap to prefetch. - void prefetch(size_t) const - { - RUNTIME_CHECK_MSG(false, "shouldn't reach here, you should use submap::prefetch instead"); - } - LookupResult ALWAYS_INLINE find(const Key & x) { return dispatch(*this, x, FindCallable{}); } ConstLookupResult ALWAYS_INLINE find(const Key & x) const { return dispatch(*this, x, FindCallable{}); } - // TODO del - LookupResult ALWAYS_INLINE find(const Key &, size_t) - { - RUNTIME_CHECK_MSG(false, "shouldn't reach here, you should use submap::find instead"); - } - ConstLookupResult ALWAYS_INLINE find(const Key &, size_t) const - { - RUNTIME_CHECK_MSG(false, "shouldn't reach here, you should use submap::find instead"); - } - bool ALWAYS_INLINE has(const Key & x, size_t = 0) const { return dispatch(*this, x, FindCallable{}) != nullptr; } - template - size_t ALWAYS_INLINE hash(const HashKeyType & key) const - { - return SubMaps::Hash::operator()(key); - } - void write(DB::WriteBuffer & wb) const { m0.write(wb); diff --git a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h index e7ea1bb8fce..5ea460769ab 100644 --- a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h @@ -33,20 +33,8 @@ class TwoLevelStringHashTable : private boost::noncopyable static constexpr bool is_string_hash_map = true; static constexpr bool is_two_level = true; - template - size_t ALWAYS_INLINE hash(const HashKeyType & key) const 
- { - return SubMaps::Hash::operator()(key); - } - - // Same reason as StringHashTable::prefetch. - void prefetch(size_t) const - { - RUNTIME_CHECK_MSG(false, "shouldn't reach here, you should use submap::prefetch instead"); - } - // TODO: currently hashing contains redundant computations when doing distributed or external aggregations - size_t hashStringRef(const Key & x) const + size_t hash(const Key & x) const { return const_cast(*this).dispatch(*this, x, [&](const auto &, const auto &, size_t hash) { return hash; @@ -59,7 +47,7 @@ class TwoLevelStringHashTable : private boost::noncopyable impl.setResizeCallback(resize_callback); } - size_t operator()(const Key & x) const { return hashStringRef(x); } + size_t operator()(const Key & x) const { return hash(x); } /// NOTE Bad for hash tables with more than 2^32 cells. static size_t getBucketFromHash(size_t hash_value) { return (hash_value >> (32 - BITS_FOR_BUCKET)) & MAX_BUCKET; } @@ -216,27 +204,12 @@ class TwoLevelStringHashTable : private boost::noncopyable dispatch(*this, key_holder, typename Impl::EmplaceCallable{it, inserted}); } - template - void ALWAYS_INLINE emplace(KeyHolder &&, LookupResult &, bool &, size_t) - { - RUNTIME_CHECK_MSG(false, "shouldn't reach here, you should use submap::emplace instead"); - } - LookupResult ALWAYS_INLINE find(const Key & x) { return dispatch(*this, x, typename Impl::FindCallable{}); } ConstLookupResult ALWAYS_INLINE find(const Key & x) const { return dispatch(*this, x, typename Impl::FindCallable{}); } - LookupResult ALWAYS_INLINE find(const Key &, size_t) - { - RUNTIME_CHECK_MSG(false, "shouldn't reach here, you should use submap::find instead"); - } - - ConstLookupResult ALWAYS_INLINE find(const Key &, size_t) const - { - RUNTIME_CHECK_MSG(false, "shouldn't reach here, you should use submap::find instead"); - } void write(DB::WriteBuffer & wb) const { diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index cc2dcd2a408..53bc989dcbc 
100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -1319,7 +1319,8 @@ class Aggregator size_t hit_row_cnt = 0; std::vector not_found_rows; - // For StringHashTable resize exception. + // For StringHashTable, when resize exception happens, the process will be interrupted. + // So we need these infos to continue. std::vector submap_m0_infos{}; std::vector submap_m1_infos{}; std::vector submap_m2_infos{}; @@ -1337,9 +1338,9 @@ class Aggregator { assert(start_row <= end_row); // submap_mx_infos.size() and submap_mx_datas.size() are always equal. - // So only need to check submap_m0_infos is enough. - return (start_row == end_row && !submap_m0_infos.empty() && !submap_m1_infos.empty() - && !submap_m3_infos.empty() && !submap_m4_infos.empty()) + // So only need to check submap_mx_infos is enough. + return (start_row == end_row && submap_m0_infos.empty() && submap_m1_infos.empty() + && submap_m3_infos.empty() && submap_m4_infos.empty()) || aggregator->isCancelled(); } void resetBlock(const Block & block_) From 0053ce8a82f19cd8772ce5f00cea160b20934250 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Thu, 28 Nov 2024 18:03:50 +0800 Subject: [PATCH 08/24] refine Signed-off-by: guo-shaoge --- dbms/src/Common/HashTable/StringHashTable.h | 2 +- dbms/src/Interpreters/Aggregator.cpp | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/dbms/src/Common/HashTable/StringHashTable.h b/dbms/src/Common/HashTable/StringHashTable.h index ef668864120..fde0f810ae6 100644 --- a/dbms/src/Common/HashTable/StringHashTable.h +++ b/dbms/src/Common/HashTable/StringHashTable.h @@ -139,7 +139,7 @@ struct StringHashTableEmpty //-V730 return hasZero() ? 
zeroValue() : nullptr; } - ALWAYS_INLINE inline void prefetch() {} + ALWAYS_INLINE inline void prefetch(size_t) {} void write(DB::WriteBuffer & wb) const { zeroValue()->write(wb); } void writeText(DB::WriteBuffer & wb) const { zeroValue()->writeText(wb); } void read(DB::ReadBuffer & rb) { zeroValue()->read(rb); } diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 1738121b665..d8210fc9a9c 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -1090,13 +1090,14 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( "executeImplBatchStringHashMap only handle resize exception for each Block instead of row"); const size_t reserve_size = rows / 4; -#define M(INFO, DATA, SUBMAPINDEX, KEYTYPE) \ - (INFO).reserve(reserve_size); \ - (DATA).reserve(reserve_size); \ - auto dispatch_callback_key##SUBMAPINDEX = [&INFO, &DATA](const KEYTYPE & key, size_t row) { \ - (INFO).push_back(row); \ - (DATA).push_back(key); \ - }; +#define M(INFO, DATA, SUBMAPINDEX, KEYTYPE) \ + (INFO).reserve(reserve_size); \ + (DATA).reserve(reserve_size); \ + auto dispatch_callback_key##SUBMAPINDEX \ + = [&INFO, &DATA](const KEYTYPE & key, size_t row) { /* NOLINT(bugprone-macro-parentheses) */ \ + (INFO).push_back(row); \ + (DATA).push_back(key); \ + }; M(key0_infos, key0_datas, 0, StringRef) M(key8_infos, key8_datas, 8, StringKey8) From fcf8ed2b49ebc8e7d3c8a0c44e035bc4477c5663 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Fri, 29 Nov 2024 17:18:00 +0800 Subject: [PATCH 09/24] fix unit test Signed-off-by: guo-shaoge --- dbms/src/Common/ColumnsHashingImpl.h | 6 ++++-- dbms/src/Interpreters/Aggregator.cpp | 30 ++++++++++++---------------- dbms/src/Interpreters/Aggregator.h | 9 ++++++--- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/dbms/src/Common/ColumnsHashingImpl.h b/dbms/src/Common/ColumnsHashingImpl.h index ffaffdcd758..1f4e3dbaedf 100644 --- a/dbms/src/Common/ColumnsHashingImpl.h +++ 
b/dbms/src/Common/ColumnsHashingImpl.h @@ -187,7 +187,9 @@ class HashMethodBase std::vector & datas, // TODO const const std::vector & hashvals) { - assert(hashvals.size() == static_cast(*this).total_rows); + // For spill, hashvals.size() will be le to total_rows. + // Because only remaining rows that didn't insert into HashMap will be handled here. + assert(hashvals.size() <= static_cast(*this).total_rows); auto & submap = StringHashTableSubMapSelector>::getSubMap( hashvals[idx], @@ -205,7 +207,7 @@ class HashMethodBase std::vector & datas, // TODO const const std::vector & hashvals) { - assert(hashvals.size() == static_cast(*this).total_rows); + assert(hashvals.size() <= static_cast(*this).total_rows); auto & submap = StringHashTableSubMapSelector>::getSubMap( hashvals[idx], diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index d8210fc9a9c..041f77fc155 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -760,9 +760,7 @@ size_t Aggregator::emplaceOrFindStringKey( using Hash = typename StringHashTableSubMapSelector>::Hash; std::vector hashvals(key_infos.size(), 0); for (size_t i = 0; i < key_infos.size(); ++i) - { - hashvals[i] = Hash::operator()(keyHolderGetKey(key_datas[0])); - } + hashvals[i] = Hash::operator()(keyHolderGetKey(key_datas[i])); // alloc 0 bytes is useful when agg func size is zero. AggregateDataPtr agg_state = aggregates_pool.alloc(0); @@ -1080,14 +1078,12 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( #undef M const size_t rows = agg_process_info.end_row - agg_process_info.start_row; + // If no resize exception happens, so this is a new Block. + // If resize exception happens, start_row also set as zero. + RUNTIME_CHECK(agg_process_info.start_row == 0); - if likely (agg_process_info.allBlockDataHandled()) + if likely (agg_process_info.stringHashTableRecoveryInfoEmpty()) { - // No resize exception happens, so this is a new Block. 
- RUNTIME_CHECK(agg_process_info.start_row == 0); - RUNTIME_CHECK_MSG( - rows == state.total_rows, - "executeImplBatchStringHashMap only handle resize exception for each Block instead of row"); const size_t reserve_size = rows / 4; #define M(INFO, DATA, SUBMAPINDEX, KEYTYPE) \ @@ -1146,11 +1142,7 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( bool zero_agg_func_size = (params.aggregates_size == 0); #define M(INDEX, INFO, DATA, PLACES) \ - if unlikely (got_resize_exception) \ - { \ - emplaced_index = 0; \ - } \ - else if (!(INFO).empty()) \ + if (!(INFO).empty()) \ { \ if (zero_agg_func_size) \ emplaced_index = emplaceOrFindStringKey( \ @@ -1173,11 +1165,15 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( if unlikely (emplaced_index != (INFO).size()) \ got_resize_exception = true; \ } \ + else \ + { \ + emplaced_index = 0; \ + } \ setupExceptionRecoveryInfoForStringHashTable( \ agg_process_info, \ emplaced_index, \ - INFO, \ - DATA, \ + (INFO), \ + (DATA), \ std::integral_constant{}); M(0, key0_infos, key0_datas, key0_places) @@ -1221,7 +1217,7 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( aggregates_pool); } // For StringHashTable, start_row is meanless, instead submap_mx_infos/submap_mx_datas are used. - agg_process_info.start_row = got_resize_exception ? 0 : rows; + agg_process_info.start_row = got_resize_exception ? 0 : agg_process_info.end_row; } void NO_INLINE diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index 53bc989dcbc..f5217058ff8 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -1339,9 +1339,12 @@ class Aggregator assert(start_row <= end_row); // submap_mx_infos.size() and submap_mx_datas.size() are always equal. // So only need to check submap_mx_infos is enough. 
- return (start_row == end_row && submap_m0_infos.empty() && submap_m1_infos.empty() - && submap_m3_infos.empty() && submap_m4_infos.empty()) - || aggregator->isCancelled(); + return (start_row == end_row && stringHashTableRecoveryInfoEmpty()) || aggregator->isCancelled(); + } + bool stringHashTableRecoveryInfoEmpty() const + { + return submap_m0_infos.empty() && submap_m1_infos.empty() && + submap_m3_infos.empty() && submap_m4_infos.empty(); } void resetBlock(const Block & block_) { From ae7b969f3f83b9e6dd88f99b0bb478926895be56 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Mon, 2 Dec 2024 15:13:55 +0800 Subject: [PATCH 10/24] refine Signed-off-by: guo-shaoge --- dbms/src/Common/ColumnsHashing.h | 90 ++++++++-------- dbms/src/Common/HashTable/StringHashTable.h | 2 + dbms/src/Interpreters/Aggregator.cpp | 45 ++++---- dbms/src/Interpreters/Aggregator.h | 13 +-- dbms/src/Interpreters/JoinPartition.cpp | 28 ++--- dbms/src/Interpreters/SetVariants.h | 4 +- dbms/src/TiDB/Collation/Collator.cpp | 112 ++++++++++++++++++++ dbms/src/TiDB/Collation/Collator.h | 10 ++ 8 files changed, 221 insertions(+), 83 deletions(-) diff --git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index aabe0733f8c..94526714250 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -91,12 +91,11 @@ struct HashMethodOneNumber /// For the case when there is one string key. 
-template +template struct HashMethodString - : public columns_hashing_impl:: - HashMethodBase, Value, Mapped, use_cache> + : public columns_hashing_impl::HashMethodBase, Value, Mapped, use_cache> { - using Self = HashMethodString; + using Self = HashMethodString; using Base = columns_hashing_impl::HashMethodBase; const IColumn::Offset * offsets; @@ -115,36 +114,40 @@ struct HashMethodString offsets = column_string.getOffsets().data(); chars = column_string.getChars().data(); if (!collators.empty()) - { - if constexpr (!place_string_to_arena) - throw Exception("String with collator must be placed on arena.", ErrorCodes::LOGICAL_ERROR); collator = collators[0]; - } } - ALWAYS_INLINE inline auto getKeyHolder( + ALWAYS_INLINE inline ArenaKeyHolder getKeyHolder( ssize_t row, [[maybe_unused]] Arena * pool, - std::vector & sort_key_containers) const + [[maybe_unused]] std::vector & sort_key_containers) const { - auto last_offset = row == 0 ? 0 : offsets[row - 1]; - // Remove last zero byte. - StringRef key(chars + last_offset, offsets[row] - last_offset - 1); + auto key = getKey(row); + if (likely(collator)) + key = collator->sortKey(key.data, key.size, sort_key_containers[0]); - if constexpr (place_string_to_arena) - { - if (likely(collator)) - key = collator->sortKey(key.data, key.size, sort_key_containers[0]); - return ArenaKeyHolder{key, pool}; - } - else - { - return key; - } + return ArenaKeyHolder{key, pool}; + } + + ALWAYS_INLINE inline ArenaKeyHolder getKeyHolder(ssize_t row, Arena * pool, Arena * sort_key_pool) const + { + auto key = getKey(row); + if (likely(collator)) + key = collator->sortKey(key.data, key.size, *sort_key_pool); + + return ArenaKeyHolder{key, pool}; } protected: friend class columns_hashing_impl::HashMethodBase; + +private: + ALWAYS_INLINE inline StringRef getKey(size_t row) const + { + auto last_offset = row == 0 ? 0 : offsets[row - 1]; + // Remove last zero byte. 
+ return StringRef(chars + last_offset, offsets[row] - last_offset - 1); + } }; template @@ -168,6 +171,11 @@ struct HashMethodStringBin } ALWAYS_INLINE inline auto getKeyHolder(ssize_t row, Arena * pool, std::vector &) const + { + return getKeyHolder(row, pool, nullptr); + } + + ALWAYS_INLINE inline auto getKeyHolder(ssize_t row, Arena * pool, Arena *) const { auto last_offset = row == 0 ? 0 : offsets[row - 1]; StringRef key(chars + last_offset, offsets[row] - last_offset - 1); @@ -381,15 +389,12 @@ struct HashMethodFastPathTwoKeysSerialized /// For the case when there is one fixed-length string key. -template +template struct HashMethodFixedString - : public columns_hashing_impl::HashMethodBase< - HashMethodFixedString, - Value, - Mapped, - use_cache> + : public columns_hashing_impl:: + HashMethodBase, Value, Mapped, use_cache> { - using Self = HashMethodFixedString; + using Self = HashMethodFixedString; using Base = columns_hashing_impl::HashMethodBase; size_t n; @@ -411,26 +416,25 @@ struct HashMethodFixedString collator = collators[0]; } - ALWAYS_INLINE inline auto getKeyHolder( + ALWAYS_INLINE inline ArenaKeyHolder getKeyHolder( size_t row, - [[maybe_unused]] Arena * pool, + Arena * pool, std::vector & sort_key_containers) const { StringRef key(&(*chars)[row * n], n); - if (collator) - { key = collator->sortKeyFastPath(key.data, key.size, sort_key_containers[0]); - } - if constexpr (place_string_to_arena) - { - return ArenaKeyHolder{key, pool}; - } - else - { - return key; - } + return ArenaKeyHolder{key, pool}; + } + + ALWAYS_INLINE inline ArenaKeyHolder getKeyHolder(size_t row, Arena * pool, Arena * sort_key_pool) const + { + StringRef key(&(*chars)[row * n], n); + if (collator) + key = collator->sortKeyFastPath(key.data, key.size, *sort_key_pool); + + return ArenaKeyHolder{key, pool}; } protected: diff --git a/dbms/src/Common/HashTable/StringHashTable.h b/dbms/src/Common/HashTable/StringHashTable.h index fde0f810ae6..a43f35fdbbf 100644 --- 
a/dbms/src/Common/HashTable/StringHashTable.h +++ b/dbms/src/Common/HashTable/StringHashTable.h @@ -16,7 +16,9 @@ #include #include +#include #include +#include #include #include diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 041f77fc155..543885b6248 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -755,7 +755,7 @@ size_t Aggregator::emplaceOrFindStringKey( AggProcessInfo & agg_process_info) const { static_assert(!(collect_hit_rate && only_lookup)); - RUNTIME_CHECK(key_infos.size() == key_datas.size()); + assert(key_infos.size() == key_datas.size()); using Hash = typename StringHashTableSubMapSelector>::Hash; std::vector hashvals(key_infos.size(), 0); @@ -1017,7 +1017,7 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( #define M(SUBMAPINDEX) \ template \ - void setupExceptionRecoveryInfoForStringHashTable( \ + ALWAYS_INLINE inline void setupExceptionRecoveryInfoForStringHashTable( \ Aggregator::AggProcessInfo & agg_process_info, \ size_t row, \ const std::vector & key_infos, \ @@ -1038,8 +1038,10 @@ M(4) #undef M -// Emplace key into StringHashMap/TwoLevelStringHashMap is seperated from other situations, -// because it's easy to implement prefetch submap directly. +// In this function, we will prefetch/empalce each specifix submap directly instead of accessing StringHashMap interface, +// which is good for performance. +// NOTE: this function is column-wise, which means sort key buffer cannot be reused. +// This buffer will not be release until this block is processed done. 
template ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( Method & method, @@ -1063,8 +1065,9 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( M(4) #undef M + const size_t rows = agg_process_info.end_row - agg_process_info.start_row; + auto sort_key_pool = std::make_unique(); std::vector sort_key_containers; - sort_key_containers.resize(params.keys_size, ""); #define M(INFO, DATA, KEYTYPE) \ std::vector(INFO); \ @@ -1077,13 +1080,15 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( M(key_str_infos, key_str_datas, ArenaKeyHolder) #undef M - const size_t rows = agg_process_info.end_row - agg_process_info.start_row; // If no resize exception happens, so this is a new Block. // If resize exception happens, start_row also set as zero. RUNTIME_CHECK(agg_process_info.start_row == 0); if likely (agg_process_info.stringHashTableRecoveryInfoEmpty()) { + // sort_key_pool should already been reset by AggProcessInfo::restBlock() + RUNTIME_CHECK(!agg_process_info.sort_key_pool); + const size_t reserve_size = rows / 4; #define M(INFO, DATA, SUBMAPINDEX, KEYTYPE) \ @@ -1106,7 +1111,9 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( for (size_t i = 0; i < rows; ++i) { - auto key_holder = state.getKeyHolder(i, aggregates_pool, sort_key_containers); + // Use Arena for collation sort key, because we are doing agg in column-wise way. + // So a big arena is needed to store decoded key, and we can avoid resize std::string by using Arena. 
+ auto key_holder = state.getKeyHolder(i, aggregates_pool, sort_key_pool.get()); dispatchStringHashTable( i, key_holder, @@ -1142,7 +1149,7 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( bool zero_agg_func_size = (params.aggregates_size == 0); #define M(INDEX, INFO, DATA, PLACES) \ - if (!(INFO).empty()) \ + if (!got_resize_exception && !(INFO).empty()) \ { \ if (zero_agg_func_size) \ emplaced_index = emplaceOrFindStringKey( \ @@ -1165,15 +1172,15 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( if unlikely (emplaced_index != (INFO).size()) \ got_resize_exception = true; \ } \ - else \ - { \ - emplaced_index = 0; \ - } \ + else \ + { \ + emplaced_index = 0; \ + } \ setupExceptionRecoveryInfoForStringHashTable( \ agg_process_info, \ emplaced_index, \ - (INFO), \ - (DATA), \ + (INFO), \ + (DATA), \ std::integral_constant{}); M(0, key0_infos, key0_datas, key0_places) @@ -1186,10 +1193,6 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( if (zero_agg_func_size) return; - RUNTIME_CHECK( - rows - == key0_places.size() + key8_places.size() + key16_places.size() + key24_places.size() + key_str_places.size()); - std::vector places(rows, nullptr); #define M(INFO, PLACES) \ for (size_t i = 0; i < (INFO).size(); ++i) \ @@ -1218,6 +1221,12 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( } // For StringHashTable, start_row is meanless, instead submap_mx_infos/submap_mx_datas are used. agg_process_info.start_row = got_resize_exception ? 
0 : agg_process_info.end_row; + + if unlikely (got_resize_exception) + { + RUNTIME_CHECK(!agg_process_info.stringHashTableRecoveryInfoEmpty()); + agg_process_info.sort_key_pool = std::move(sort_key_pool); + } } void NO_INLINE diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index f5217058ff8..c6e78fb5618 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -231,8 +231,7 @@ struct AggregationMethodStringNoCache : data(other.data) {} - using State = ColumnsHashing:: - HashMethodString; + using State = ColumnsHashing::HashMethodString; template struct EmplaceOrFindKeyResult { @@ -528,7 +527,7 @@ struct AggregationMethodFixedStringNoCache : data(other.data) {} - using State = ColumnsHashing::HashMethodFixedString; + using State = ColumnsHashing::HashMethodFixedString; template struct EmplaceOrFindKeyResult { @@ -1326,12 +1325,12 @@ class Aggregator std::vector submap_m2_infos{}; std::vector submap_m3_infos{}; std::vector submap_m4_infos{}; - std::vector submap_m0_datas{}; std::vector submap_m1_datas{}; std::vector submap_m2_datas{}; std::vector submap_m3_datas{}; std::vector submap_m4_datas{}; + std::unique_ptr sort_key_pool; void prepareForAgg(); bool allBlockDataHandled() const @@ -1343,8 +1342,8 @@ class Aggregator } bool stringHashTableRecoveryInfoEmpty() const { - return submap_m0_infos.empty() && submap_m1_infos.empty() && - submap_m3_infos.empty() && submap_m4_infos.empty(); + return submap_m0_infos.empty() && submap_m1_infos.empty() && submap_m3_infos.empty() + && submap_m4_infos.empty(); } void resetBlock(const Block & block_) { @@ -1358,6 +1357,8 @@ class Aggregator hit_row_cnt = 0; not_found_rows.clear(); not_found_rows.reserve(block_.rows() / 2); + + sort_key_pool.reset(); } }; diff --git a/dbms/src/Interpreters/JoinPartition.cpp b/dbms/src/Interpreters/JoinPartition.cpp index a060878c4f7..294c72c19a3 100644 --- a/dbms/src/Interpreters/JoinPartition.cpp +++ 
b/dbms/src/Interpreters/JoinPartition.cpp @@ -412,7 +412,7 @@ struct KeyGetterForTypeImpl template struct KeyGetterForTypeImpl { - using Type = ColumnsHashing::HashMethodString; + using Type = ColumnsHashing::HashMethodString; }; template struct KeyGetterForTypeImpl @@ -427,7 +427,7 @@ struct KeyGetterForTypeImpl template struct KeyGetterForTypeImpl { - using Type = ColumnsHashing::HashMethodFixedString; + using Type = ColumnsHashing::HashMethodFixedString; }; template struct KeyGetterForTypeImpl @@ -652,18 +652,18 @@ void NO_INLINE insertBlockIntoMapsTypeCase( insert_indexes.emplace_back(insert_index); } -#define INSERT_TO_MAP(join_partition, segment_index) \ - auto & current_map = (join_partition)->getHashMap(); \ - for (auto & s_i : (segment_index)) \ - { \ - Inserter::insert( \ - current_map, \ - key_getter, \ - stored_block, \ - s_i, \ - pool, \ - sort_key_containers, \ - probe_cache_column_threshold); \ +#define INSERT_TO_MAP(join_partition, segment_index) \ + auto & current_map = (join_partition) -> getHashMap(); \ + for (auto & s_i : (segment_index)) \ + { \ + Inserter::insert( \ + current_map, \ + key_getter, \ + stored_block, \ + s_i, \ + pool, \ + sort_key_containers, \ + probe_cache_column_threshold); \ } #define INSERT_TO_NOT_INSERTED_MAP \ diff --git a/dbms/src/Interpreters/SetVariants.h b/dbms/src/Interpreters/SetVariants.h index a1591f8c13a..5c503240b7b 100644 --- a/dbms/src/Interpreters/SetVariants.h +++ b/dbms/src/Interpreters/SetVariants.h @@ -54,7 +54,7 @@ struct SetMethodString Data data; - using State = ColumnsHashing::HashMethodString; + using State = ColumnsHashing::HashMethodString; }; template @@ -77,7 +77,7 @@ struct SetMethodFixedString Data data; - using State = ColumnsHashing::HashMethodFixedString; + using State = ColumnsHashing::HashMethodFixedString; }; namespace set_impl diff --git a/dbms/src/TiDB/Collation/Collator.cpp b/dbms/src/TiDB/Collation/Collator.cpp index bf27400f8c4..4365f1f0988 100644 --- 
a/dbms/src/TiDB/Collation/Collator.cpp +++ b/dbms/src/TiDB/Collation/Collator.cpp @@ -192,6 +192,11 @@ class BinCollator final : public ITiDBCollator return DB::BinCollatorSortKey(s, length); } + StringRef sortKey(const char * s, size_t length, DB::Arena &) const override + { + return DB::BinCollatorSortKey(s, length); + } + StringRef sortKeyNoTrim(const char * s, size_t length, std::string &) const override { return convertForBinCollator(s, length, nullptr); @@ -273,11 +278,54 @@ class GeneralCICollator final : public ITiDBCollator return convertImpl(s, length, container, nullptr); } + StringRef sortKey(const char * s, size_t length, DB::Arena & pool) const override + { + return convertImpl(s, length, pool, nullptr); + } + StringRef sortKeyNoTrim(const char * s, size_t length, std::string & container) const override { return convertImpl(s, length, container, nullptr); } + template + StringRef convertImpl(const char * s, size_t length, DB::Arena & pool, std::vector * lens) const + { + std::string_view v; + + if constexpr (need_trim) + v = rtrim(s, length); + else + v = std::string_view(s, length); + + const size_t size = length * sizeof(WeightType); + auto * buffer = pool.alignedAlloc(size, 16); + + size_t offset = 0; + size_t total_size = 0; + size_t v_length = v.length(); + + if constexpr (need_len) + { + if (lens->capacity() < v_length) + lens->reserve(v_length); + lens->resize(0); + } + + while (offset < v_length) + { + auto c = decodeChar(s, offset); + auto sk = weight(c); + buffer[total_size++] = static_cast(sk >> 8); + buffer[total_size++] = static_cast(sk); + + if constexpr (need_len) + lens->push_back(2); + } + + return StringRef(buffer, total_size); + } + template StringRef convertImpl(const char * s, size_t length, std::string & container, std::vector * lens) const { @@ -479,11 +527,65 @@ class UCACICollator final : public ITiDBCollator return convertImpl(s, length, container, nullptr); } + StringRef sortKey(const char * s, size_t length, DB::Arena & 
pool) const override + { + return convertImpl(s, length, pool, nullptr); + } + StringRef sortKeyNoTrim(const char * s, size_t length, std::string & container) const override { return convertImpl(s, length, container, nullptr); } + // Use Arena to store decoded string. Normally it's used by column-wise Agg/Join, + // because column-wise process cannot reuse string container. + template + StringRef convertImpl(const char * s, size_t length, DB::Arena & pool, std::vector * lens) const + { + std::string_view v; + + if constexpr (need_trim) + v = preprocess(s, length); + else + v = std::string_view(s, length); + + // every char have 8 uint16 at most. + const auto size = 8 * length * sizeof(uint16_t); + auto * buffer = pool.alignedAlloc(size, 16); + + size_t offset = 0; + size_t total_size = 0; + size_t v_length = v.length(); + + uint64_t first = 0, second = 0; + + if constexpr (need_len) + { + if (lens->capacity() < v_length) + lens->reserve(v_length); + lens->resize(0); + } + + while (offset < v_length) + { + weight(first, second, offset, v_length, s); + + if constexpr (need_len) + lens->push_back(total_size); + + writeResult(first, buffer, total_size); + writeResult(second, buffer, total_size); + + if constexpr (need_len) + { + size_t end_idx = lens->size() - 1; + (*lens)[end_idx] = total_size - (*lens)[end_idx]; + } + } + + return StringRef(buffer, total_size); + } + template StringRef convertImpl(const char * s, size_t length, std::string & container, std::vector * lens) const { @@ -550,6 +652,16 @@ class UCACICollator final : public ITiDBCollator } } + static inline void writeResult(uint64_t & w, char * buffer, size_t & total_size) + { + while (w != 0) + { + buffer[total_size++] = static_cast(w >> 8); + buffer[total_size++] = static_cast(w); + w >>= 16; + } + } + static inline bool regexEq(CharType a, CharType b) { return T::regexEq(a, b); } static inline void weight(uint64_t & first, uint64_t & second, size_t & offset, size_t length, const char * s) diff --git 
a/dbms/src/TiDB/Collation/Collator.h b/dbms/src/TiDB/Collation/Collator.h index 6bb87883ef1..08c017ba57d 100644 --- a/dbms/src/TiDB/Collation/Collator.h +++ b/dbms/src/TiDB/Collation/Collator.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -101,6 +102,7 @@ class ITiDBCollator = 0; virtual StringRef sortKeyNoTrim(const char * s, size_t length, std::string & container) const = 0; virtual StringRef sortKey(const char * s, size_t length, std::string & container) const = 0; + virtual StringRef sortKey(const char * s, size_t length, DB::Arena &) const = 0; virtual std::unique_ptr pattern() const = 0; int32_t getCollatorId() const { return collator_id; } CollatorType getCollatorType() const { return collator_type; } @@ -135,6 +137,14 @@ class ITiDBCollator } return sortKey(s, length, container); } + ALWAYS_INLINE inline StringRef sortKeyFastPath(const char * s, size_t length, DB::Arena & pool) const + { + if (likely(isPaddingBinary())) + { + return DB::BinCollatorSortKey(s, length); + } + return sortKey(s, length, pool); + } protected: explicit ITiDBCollator(int32_t collator_id_); From 3a86617a9c870be81bf28247d5b6d033cc9b6a1d Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Mon, 2 Dec 2024 17:16:26 +0800 Subject: [PATCH 11/24] unit test Signed-off-by: guo-shaoge --- dbms/src/Common/FailPoint.cpp | 1 + .../tests/gtest_aggregation_executor.cpp | 111 +++++++++++------- dbms/src/Flash/tests/gtest_compute_server.cpp | 4 + dbms/src/Interpreters/Aggregator.cpp | 9 +- 4 files changed, 79 insertions(+), 46 deletions(-) diff --git a/dbms/src/Common/FailPoint.cpp b/dbms/src/Common/FailPoint.cpp index f6025741325..f73f273dd48 100644 --- a/dbms/src/Common/FailPoint.cpp +++ b/dbms/src/Common/FailPoint.cpp @@ -114,6 +114,7 @@ namespace DB M(force_set_parallel_prehandle_threshold) \ M(force_raise_prehandle_exception) \ M(force_agg_on_partial_block) \ + M(force_agg_prefetch) \ M(force_set_fap_candidate_store_id) \ M(force_not_clean_fap_on_destroy) \ 
M(force_fap_worker_throw) \ diff --git a/dbms/src/Flash/tests/gtest_aggregation_executor.cpp b/dbms/src/Flash/tests/gtest_aggregation_executor.cpp index 7193f24eddb..8c7f5277916 100644 --- a/dbms/src/Flash/tests/gtest_aggregation_executor.cpp +++ b/dbms/src/Flash/tests/gtest_aggregation_executor.cpp @@ -24,6 +24,7 @@ namespace DB namespace FailPoints { extern const char force_agg_on_partial_block[]; +extern const char force_agg_prefetch[]; extern const char force_agg_two_level_hash_table_before_merge[]; } // namespace FailPoints namespace tests @@ -238,16 +239,22 @@ class AggExecutorTestRunner : public ExecutorTest ColumnWithUInt64 col_pr{1, 2, 0, 3290124, 968933, 3125, 31236, 4327, 80000}; }; -#define WRAP_FOR_AGG_PARTIAL_BLOCK_START \ - std::vector partial_blocks{true, false}; \ - for (auto partial_block : partial_blocks) \ - { \ - if (partial_block) \ - FailPointHelper::enableFailPoint(FailPoints::force_agg_on_partial_block); \ - else \ - FailPointHelper::disableFailPoint(FailPoints::force_agg_on_partial_block); +#define WRAP_FOR_AGG_FAILPOINTS_START \ + std::vector enables{true, false}; \ + for (auto enable : enables) \ + { \ + if (enable) \ + { \ + FailPointHelper::enableFailPoint(FailPoints::force_agg_on_partial_block); \ + FailPointHelper::enableFailPoint(FailPoints::force_agg_prefetch); \ + } \ + else \ + { \ + FailPointHelper::disableFailPoint(FailPoints::force_agg_on_partial_block); \ + FailPointHelper::disableFailPoint(FailPoints::force_agg_prefetch); \ + } -#define WRAP_FOR_AGG_PARTIAL_BLOCK_END } +#define WRAP_FOR_AGG_FAILPOINTS_END } /// Guarantee the correctness of group by TEST_F(AggExecutorTestRunner, GroupBy) @@ -363,9 +370,9 @@ try FailPointHelper::enableFailPoint(FailPoints::force_agg_two_level_hash_table_before_merge); else FailPointHelper::disableFailPoint(FailPoints::force_agg_two_level_hash_table_before_merge); - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, expect_cols[i]); - 
WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END } } } @@ -429,9 +436,9 @@ try FailPointHelper::enableFailPoint(FailPoints::force_agg_two_level_hash_table_before_merge); else FailPointHelper::disableFailPoint(FailPoints::force_agg_two_level_hash_table_before_merge); - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, expect_cols[i]); - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END } } } @@ -464,9 +471,9 @@ try for (size_t i = 0; i < test_num; ++i) { request = buildDAGRequest(std::make_pair(db_name, table_name), agg_funcs[i], group_by_exprs[i], projections[i]); - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, expect_cols[i]); - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END } /// Min function tests @@ -485,9 +492,9 @@ try for (size_t i = 0; i < test_num; ++i) { request = buildDAGRequest(std::make_pair(db_name, table_name), agg_funcs[i], group_by_exprs[i], projections[i]); - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, expect_cols[i]); - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END } } CATCH @@ -545,9 +552,9 @@ try { request = buildDAGRequest(std::make_pair(db_name, table_name), {agg_funcs[i]}, group_by_exprs[i], projections[i]); - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, expect_cols[i]); - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END } } CATCH @@ -615,9 +622,9 @@ try {agg_func}, group_by_exprs[i], projections[i]); - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, expect_cols[i]); - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END } } { @@ -629,9 +636,9 @@ try {agg_func}, group_by_exprs[i], projections[i]); - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START 
executeAndAssertColumnsEqual(request, expect_cols[i]); - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END } } for (auto collation_id : {0, static_cast(TiDB::ITiDBCollator::BINARY)}) @@ -668,9 +675,9 @@ try {agg_func}, group_by_exprs[i], projections[i]); - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, expect_cols[i]); - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END } } } @@ -683,9 +690,9 @@ try executeAndAssertColumnsEqual(request, {{toNullableVec({"banana"})}}); request = context.scan("aggnull_test", "t1").aggregation({}, {col("s1")}).build(context); - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, {{toNullableVec("s1", {{}, "banana"})}}); - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END } CATCH @@ -697,9 +704,9 @@ try = {toNullableVec({3}), toNullableVec({1}), toVec({6})}; auto test_single_function = [&](size_t index) { auto request = context.scan("test_db", "test_table").aggregation({functions[index]}, {}).build(context); - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, {functions_result[index]}); - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END }; for (size_t i = 0; i < functions.size(); ++i) test_single_function(i); @@ -720,9 +727,9 @@ try results.push_back(functions_result[k]); auto request = context.scan("test_db", "test_table").aggregation(funcs, {}).build(context); - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, results); - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END funcs.pop_back(); results.pop_back(); @@ -758,9 +765,9 @@ try context.context->setSetting( "group_by_two_level_threshold", Field(static_cast(two_level_threshold))); - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, expect); - 
WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END } } } @@ -791,7 +798,7 @@ try "group_by_two_level_threshold", Field(static_cast(two_level_threshold))); context.context->setSetting("max_block_size", Field(static_cast(block_size))); - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START auto blocks = getExecuteStreamsReturnBlocks(request, concurrency); size_t actual_row = 0; for (auto & block : blocks) @@ -800,7 +807,7 @@ try actual_row += block.rows(); } ASSERT_EQ(actual_row, expect_rows[i]); - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END } } } @@ -914,7 +921,7 @@ try "group_by_two_level_threshold", Field(static_cast(two_level_threshold))); context.context->setSetting("max_block_size", Field(static_cast(block_size))); - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START auto blocks = getExecuteStreamsReturnBlocks(request, concurrency); for (auto & block : blocks) { @@ -939,7 +946,7 @@ try vstackBlocks(std::move(blocks)).getColumnsWithTypeAndName(), false)); } - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END } } } @@ -967,18 +974,18 @@ try request = context.receive("empty_recv", 5).aggregation({Max(col("s1"))}, {col("s2")}, 5).build(context); { - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, {}); - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END } request = context.scan("test_db", "empty_table") .aggregation({Count(lit(Field(static_cast(1))))}, {}) .build(context); { - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, {toVec({0})}); - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END } } CATCH @@ -1049,7 +1056,9 @@ try toNullableVec("first_row(col_tinyint)", ColumnWithNullableInt8{0, 1, 2, 3}), toVec("col_int", ColumnWithInt32{0, 1, 2, 3}), toVec("col_tinyint", ColumnWithInt8{0, 1, 2, 3})}; + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, 
expected); + WRAP_FOR_AGG_FAILPOINTS_END } { @@ -1065,7 +1074,9 @@ try = {toVec("count(1)", ColumnWithUInt64{rows_per_type, rows_per_type, rows_per_type, rows_per_type}), toNullableVec("first_row(col_int)", ColumnWithNullableInt32{0, 1, 2, 3}), toVec("col_int", ColumnWithInt32{0, 1, 2, 3})}; + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, expected); + WRAP_FOR_AGG_FAILPOINTS_END } { @@ -1082,7 +1093,9 @@ try toNullableVec("first_row(col_string_no_collator)", ColumnWithNullableString{"a", "b", "c", "d"}), toVec("col_string_no_collator", ColumnWithString{"a", "b", "c", "d"}), }; + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, expected); + WRAP_FOR_AGG_FAILPOINTS_END } { @@ -1099,7 +1112,9 @@ try toNullableVec("first_row(col_string_with_collator)", ColumnWithNullableString{"a", "b", "c", "d"}), toVec("col_string_with_collator", ColumnWithString{"a", "b", "c", "d"}), }; + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, expected); + WRAP_FOR_AGG_FAILPOINTS_END } { @@ -1116,7 +1131,9 @@ try toVec("count(1)", ColumnWithUInt64{rows_per_type, rows_per_type, rows_per_type, rows_per_type}), toVec("first_row(col_string_with_collator)", ColumnWithString{"a", "b", "c", "d"}), }; + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, expected); + WRAP_FOR_AGG_FAILPOINTS_END } // case-5: none @@ -1138,7 +1155,9 @@ try toVec("col_int", ColumnWithInt32{0, 1, 2, 3}), toVec("col_string_no_collator", ColumnWithString{"a", "b", "c", "d"}), }; + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, expected); + WRAP_FOR_AGG_FAILPOINTS_END } { @@ -1155,7 +1174,9 @@ try toNullableVec("first_row(col_string_with_collator)", ColumnWithNullableString{"a", "b", "c", "d"}), toVec("col_string_with_collator", ColumnWithString{"a", "b", "c", "d"}), toVec("col_int", ColumnWithInt32{0, 1, 2, 3})}; + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, expected); + WRAP_FOR_AGG_FAILPOINTS_END } 
} CATCH @@ -1205,15 +1226,15 @@ try auto baseline = executeStreams(gen_request(1), 1); for (size_t exchange_concurrency : exchange_receiver_concurrency) { - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(gen_request(exchange_concurrency), baseline); - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END } } CATCH -#undef WRAP_FOR_AGG_PARTIAL_BLOCK_START -#undef WRAP_FOR_AGG_PARTIAL_BLOCK_END +#undef WRAP_FOR_AGG_FAILPOINTS_START +#undef WRAP_FOR_AGG_FAILPOINTS_END } // namespace tests } // namespace DB diff --git a/dbms/src/Flash/tests/gtest_compute_server.cpp b/dbms/src/Flash/tests/gtest_compute_server.cpp index 69b2242df3d..3c4020db45e 100644 --- a/dbms/src/Flash/tests/gtest_compute_server.cpp +++ b/dbms/src/Flash/tests/gtest_compute_server.cpp @@ -39,6 +39,7 @@ extern const char exception_before_mpp_root_task_run[]; extern const char exception_during_mpp_non_root_task_run[]; extern const char exception_during_mpp_root_task_run[]; extern const char exception_during_query_run[]; +extern const char force_agg_prefetch[]; } // namespace FailPoints namespace tests @@ -1369,6 +1370,7 @@ try FailPoints::exception_during_mpp_non_root_task_run, FailPoints::exception_during_mpp_root_task_run, FailPoints::exception_during_query_run, + FailPoints::force_agg_prefetch, }; size_t query_index = 0; for (const auto & failpoint : failpoint_names) @@ -1843,6 +1845,7 @@ try auto_pass_through_test_data.nullable_high_ndv_tbl_name, auto_pass_through_test_data.nullable_medium_ndv_tbl_name, }; + FailPointHelper::enableFailPoint(FailPoints::force_agg_prefetch); for (const auto & tbl_name : workloads) { const String db_name = auto_pass_through_test_data.db_name; @@ -1868,6 +1871,7 @@ try res_no_pass_through); WRAP_FOR_SERVER_TEST_END } + FailPointHelper::disableFailPoint(FailPoints::force_agg_prefetch); } CATCH diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 543885b6248..6e3262c15d4 100644 
--- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -43,6 +43,7 @@ extern const char random_aggregate_create_state_failpoint[]; extern const char random_aggregate_merge_failpoint[]; extern const char force_agg_on_partial_block[]; extern const char random_fail_in_resize_callback[]; +extern const char force_agg_prefetch[]; } // namespace FailPoints #define AggregationMethodName(NAME) AggregatedDataVariants::AggregationMethod_##NAME @@ -665,7 +666,13 @@ void NO_INLINE Aggregator::executeImpl( { typename Method::State state(agg_process_info.key_columns, key_sizes, collators); - if (method.data.getBufferSizeInCells() < 8192) +#ifndef NDEBUG + bool disable_prefetch = (method.data.getBufferSizeInCells() < 8192); + fiu_do_on(FailPoints::force_agg_prefetch, { disable_prefetch = false; }); +#else + const bool disable_prefetch = (method.data.getBufferSizeInCells() < 8192); +#endif + if (disable_prefetch) { if constexpr (Method::Data::is_string_hash_map) executeImplBatchStringHashMap( From 623fef57f5a160e201e5f765f22430eef88f8300 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Mon, 2 Dec 2024 20:14:41 +0800 Subject: [PATCH 12/24] prefetch Signed-off-by: guo-shaoge --- dbms/src/Common/HashTable/HashTable.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/dbms/src/Common/HashTable/HashTable.h b/dbms/src/Common/HashTable/HashTable.h index f8d44e8c406..c0f066edbb0 100644 --- a/dbms/src/Common/HashTable/HashTable.h +++ b/dbms/src/Common/HashTable/HashTable.h @@ -856,14 +856,8 @@ class HashTable void ALWAYS_INLINE prefetch(size_t hashval) const { - (void)hashval; -#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) - size_t place_value = grower.place(hashval); - __mm_prefetch((const char *)(&buf[place_value]), _MM_HINT_NTA); -#elif defined(__GNUC__) - size_t place_value = grower.place(hashval); + const size_t place_value = grower.place(hashval); __builtin_prefetch(static_cast(&buf[place_value])); -#endif } 
protected: From 19f320daa2b5250ee45456ee0d1401f1cd32fa53 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Mon, 2 Dec 2024 20:28:40 +0800 Subject: [PATCH 13/24] fix Signed-off-by: guo-shaoge --- dbms/src/Common/ColumnsHashing.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index 94526714250..a9817308616 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -85,8 +85,6 @@ struct HashMethodOneNumber } const FieldType * getKeyData() const { return vec; } - - size_t getTotalRows() const { return total_rows; } }; From 3a226dfb518caf0afa1769617d7b800f2c97ca12 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Tue, 3 Dec 2024 14:14:14 +0800 Subject: [PATCH 14/24] refine Signed-off-by: guo-shaoge --- dbms/src/Common/ColumnsHashingImpl.h | 4 +- .../HashTable/TwoLevelStringHashTable.h | 2 +- .../tests/gtest_aggregation_executor.cpp | 53 ++++++++++++------- dbms/src/Interpreters/Aggregator.cpp | 21 +++++--- 4 files changed, 50 insertions(+), 30 deletions(-) diff --git a/dbms/src/Common/ColumnsHashingImpl.h b/dbms/src/Common/ColumnsHashingImpl.h index 1f4e3dbaedf..b5b61fb8630 100644 --- a/dbms/src/Common/ColumnsHashingImpl.h +++ b/dbms/src/Common/ColumnsHashingImpl.h @@ -184,7 +184,7 @@ class HashMethodBase ALWAYS_INLINE inline EmplaceResult emplaceStringKey( Data & data, size_t idx, - std::vector & datas, // TODO const + std::vector & datas, const std::vector & hashvals) { // For spill, hashvals.size() will be le to total_rows. 
@@ -204,7 +204,7 @@ class HashMethodBase ALWAYS_INLINE inline FindResult findStringKey( Data & data, size_t idx, - std::vector & datas, // TODO const + std::vector & datas, const std::vector & hashvals) { assert(hashvals.size() <= static_cast(*this).total_rows); diff --git a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h index 5ea460769ab..ac2ab483e46 100644 --- a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h @@ -265,7 +265,7 @@ class TwoLevelStringHashTable : private boost::noncopyable { size_t res = 0; for (const auto & impl : impls) - res = impl.getBufferSizeInCells(); + res += impl.getBufferSizeInCells(); return res; } size_t getBufferSizeInBytes() const diff --git a/dbms/src/Flash/tests/gtest_aggregation_executor.cpp b/dbms/src/Flash/tests/gtest_aggregation_executor.cpp index 8c7f5277916..3a79025f244 100644 --- a/dbms/src/Flash/tests/gtest_aggregation_executor.cpp +++ b/dbms/src/Flash/tests/gtest_aggregation_executor.cpp @@ -1042,6 +1042,24 @@ try toVec("col_tinyint", col_data_tinyint), }); + std::vector max_block_sizes{1, 2, DEFAULT_BLOCK_SIZE}; + std::vector two_level_thresholds{0, 1}; + + context.context->setSetting("group_by_two_level_threshold_bytes", Field(static_cast(0))); +#define WRAP_FOR_AGG_STRING_TEST_BEGIN \ + for (const auto & max_block_size : max_block_sizes) \ + { \ + for (const auto & two_level_threshold : two_level_thresholds) \ + { \ + context.context->setSetting( \ + "group_by_two_level_threshold", \ + Field(static_cast(two_level_threshold))); \ + context.context->setSetting("max_block_size", Field(static_cast(max_block_size))); +#define WRAP_FOR_AGG_STRING_TEST_END \ + } \ + } + + FailPointHelper::enableFailPoint(FailPoints::force_agg_prefetch); { // case-1: select count(1), col_tinyint from t group by col_int, col_tinyint // agg method: keys64(AggregationMethodKeysFixed) @@ -1056,9 +1074,9 @@ try 
toNullableVec("first_row(col_tinyint)", ColumnWithNullableInt8{0, 1, 2, 3}), toVec("col_int", ColumnWithInt32{0, 1, 2, 3}), toVec("col_tinyint", ColumnWithInt8{0, 1, 2, 3})}; - WRAP_FOR_AGG_FAILPOINTS_START + WRAP_FOR_AGG_STRING_TEST_BEGIN executeAndAssertColumnsEqual(request, expected); - WRAP_FOR_AGG_FAILPOINTS_END + WRAP_FOR_AGG_STRING_TEST_END } { @@ -1074,9 +1092,9 @@ try = {toVec("count(1)", ColumnWithUInt64{rows_per_type, rows_per_type, rows_per_type, rows_per_type}), toNullableVec("first_row(col_int)", ColumnWithNullableInt32{0, 1, 2, 3}), toVec("col_int", ColumnWithInt32{0, 1, 2, 3})}; - WRAP_FOR_AGG_FAILPOINTS_START + WRAP_FOR_AGG_STRING_TEST_BEGIN executeAndAssertColumnsEqual(request, expected); - WRAP_FOR_AGG_FAILPOINTS_END + WRAP_FOR_AGG_STRING_TEST_END } { @@ -1093,9 +1111,7 @@ try toNullableVec("first_row(col_string_no_collator)", ColumnWithNullableString{"a", "b", "c", "d"}), toVec("col_string_no_collator", ColumnWithString{"a", "b", "c", "d"}), }; - WRAP_FOR_AGG_FAILPOINTS_START executeAndAssertColumnsEqual(request, expected); - WRAP_FOR_AGG_FAILPOINTS_END } { @@ -1112,9 +1128,9 @@ try toNullableVec("first_row(col_string_with_collator)", ColumnWithNullableString{"a", "b", "c", "d"}), toVec("col_string_with_collator", ColumnWithString{"a", "b", "c", "d"}), }; - WRAP_FOR_AGG_FAILPOINTS_START + WRAP_FOR_AGG_STRING_TEST_BEGIN executeAndAssertColumnsEqual(request, expected); - WRAP_FOR_AGG_FAILPOINTS_END + WRAP_FOR_AGG_STRING_TEST_END } { @@ -1131,9 +1147,9 @@ try toVec("count(1)", ColumnWithUInt64{rows_per_type, rows_per_type, rows_per_type, rows_per_type}), toVec("first_row(col_string_with_collator)", ColumnWithString{"a", "b", "c", "d"}), }; - WRAP_FOR_AGG_FAILPOINTS_START + WRAP_FOR_AGG_STRING_TEST_BEGIN executeAndAssertColumnsEqual(request, expected); - WRAP_FOR_AGG_FAILPOINTS_END + WRAP_FOR_AGG_STRING_TEST_END } // case-5: none @@ -1155,9 +1171,9 @@ try toVec("col_int", ColumnWithInt32{0, 1, 2, 3}), toVec("col_string_no_collator", 
ColumnWithString{"a", "b", "c", "d"}), }; - WRAP_FOR_AGG_FAILPOINTS_START + WRAP_FOR_AGG_STRING_TEST_BEGIN executeAndAssertColumnsEqual(request, expected); - WRAP_FOR_AGG_FAILPOINTS_END + WRAP_FOR_AGG_STRING_TEST_END } { @@ -1174,10 +1190,13 @@ try toNullableVec("first_row(col_string_with_collator)", ColumnWithNullableString{"a", "b", "c", "d"}), toVec("col_string_with_collator", ColumnWithString{"a", "b", "c", "d"}), toVec("col_int", ColumnWithInt32{0, 1, 2, 3})}; - WRAP_FOR_AGG_FAILPOINTS_START + WRAP_FOR_AGG_STRING_TEST_BEGIN executeAndAssertColumnsEqual(request, expected); - WRAP_FOR_AGG_FAILPOINTS_END + WRAP_FOR_AGG_STRING_TEST_END } + FailPointHelper::disableFailPoint(FailPoints::force_agg_prefetch); +#undef WRAP_FOR_AGG_STRING_TEST_BEGIN +#undef WRAP_FOR_AGG_STRING_TEST_END } CATCH @@ -1208,13 +1227,9 @@ try context .addExchangeReceiver("exchange_receiver_1_concurrency", column_infos, column_data, 1, partition_column_infos); - context - .addExchangeReceiver("exchange_receiver_3_concurrency", column_infos, column_data, 3, partition_column_infos); - context - .addExchangeReceiver("exchange_receiver_5_concurrency", column_infos, column_data, 5, partition_column_infos); context .addExchangeReceiver("exchange_receiver_10_concurrency", column_infos, column_data, 10, partition_column_infos); - std::vector exchange_receiver_concurrency = {1, 3, 5, 10}; + std::vector exchange_receiver_concurrency = {1, 10}; auto gen_request = [&](size_t exchange_concurrency) { return context diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 6e3262c15d4..2c1aba6b2af 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -741,8 +741,9 @@ std::optional::Res } } +// This is only used by executeImplBatchStringHashMap. +// It will choose specifix submap of StringHashMap then do emplace/find. // StringKeyType can be StringRef/StringKey8/StringKey16/StringKey24/ArenaKeyHolder. 
-// return true when resize exception happens. template < size_t SubMapIndex, bool collect_hit_rate, @@ -756,7 +757,7 @@ size_t Aggregator::emplaceOrFindStringKey( Data & data, State & state, const std::vector & key_infos, - std::vector & key_datas, // TODO const + std::vector & key_datas, Arena & aggregates_pool, std::vector & places, AggProcessInfo & agg_process_info) const @@ -1045,8 +1046,8 @@ M(4) #undef M -// In this function, we will prefetch/empalce each specifix submap directly instead of accessing StringHashMap interface, -// which is good for performance. +// prefetch/empalce each specifix submap directly instead of accessing StringHashMap interface, +// which is better for performance. // NOTE: this function is column-wise, which means sort key buffer cannot be reused. // This buffer will not be release until this block is processed done. template @@ -1088,7 +1089,7 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( #undef M // If no resize exception happens, so this is a new Block. - // If resize exception happens, start_row also set as zero. + // If resize exception happens, start_row has already been set to zero at the end of this function. RUNTIME_CHECK(agg_process_info.start_row == 0); if likely (agg_process_info.stringHashTableRecoveryInfoEmpty()) @@ -1226,13 +1227,17 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( inst->batch_arguments, aggregates_pool); } - // For StringHashTable, start_row is meanless, instead submap_mx_infos/submap_mx_datas are used. - agg_process_info.start_row = got_resize_exception ? 0 : agg_process_info.end_row; - if unlikely (got_resize_exception) { RUNTIME_CHECK(!agg_process_info.stringHashTableRecoveryInfoEmpty()); agg_process_info.sort_key_pool = std::move(sort_key_pool); + // For StringHashTable, start_row is meanless, instead submap_mx_infos/submap_mx_datas are used. + // So set it to zero when got_resize_exception. 
+ agg_process_info.start_row = 0; + } + else + { + agg_process_info.start_row = agg_process_info.end_row; } } From c44ace7030558dff7952fde6aae9aedb6c2bd400 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Tue, 3 Dec 2024 15:50:56 +0800 Subject: [PATCH 15/24] refine Signed-off-by: guo-shaoge --- dbms/src/Interpreters/Aggregator.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 2c1aba6b2af..e5e214c7791 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -674,13 +674,13 @@ void NO_INLINE Aggregator::executeImpl( #endif if (disable_prefetch) { - if constexpr (Method::Data::is_string_hash_map) - executeImplBatchStringHashMap( - method, - state, - aggregates_pool, - agg_process_info); - else + // if constexpr (Method::Data::is_string_hash_map) + // executeImplBatchStringHashMap( + // method, + // state, + // aggregates_pool, + // agg_process_info); + // else executeImplBatch(method, state, aggregates_pool, agg_process_info); } else From 3e30f9561e38d09c39d1b0dc73735b6dcffb7991 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Tue, 3 Dec 2024 16:08:34 +0800 Subject: [PATCH 16/24] revert new hasher Signed-off-by: guo-shaoge --- dbms/src/Common/HashTable/StringHashTable.h | 66 +++++++++++++++---- .../HashTable/TwoLevelStringHashTable.h | 8 +-- dbms/src/Interpreters/Aggregator.h | 20 +++--- 3 files changed, 67 insertions(+), 27 deletions(-) diff --git a/dbms/src/Common/HashTable/StringHashTable.h b/dbms/src/Common/HashTable/StringHashTable.h index a43f35fdbbf..322523388cc 100644 --- a/dbms/src/Common/HashTable/StringHashTable.h +++ b/dbms/src/Common/HashTable/StringHashTable.h @@ -67,17 +67,57 @@ struct HashWithMixSeed } }; +// struct StringHashTableHash +// { +// using StringKey8Hasher = HashWithMixSeed; +// using StringKey16Hasher = HashWithMixSeed; +// using StringKey24Hasher = HashWithMixSeed; +// using 
StringRefHasher = StringRefHash; +// +// static size_t ALWAYS_INLINE operator()(StringKey8 key) { return StringKey8Hasher::operator()(key); } +// static size_t ALWAYS_INLINE operator()(const StringKey16 & key) { return StringKey16Hasher::operator()(key); } +// static size_t ALWAYS_INLINE operator()(const StringKey24 & key) { return StringKey24Hasher::operator()(key); } +// static size_t ALWAYS_INLINE operator()(const StringRef & key) { return StringRefHasher::operator()(key); } +// }; struct StringHashTableHash { - using StringKey8Hasher = HashWithMixSeed; - using StringKey16Hasher = HashWithMixSeed; - using StringKey24Hasher = HashWithMixSeed; - using StringRefHasher = StringRefHash; - - static size_t ALWAYS_INLINE operator()(StringKey8 key) { return StringKey8Hasher::operator()(key); } - static size_t ALWAYS_INLINE operator()(const StringKey16 & key) { return StringKey16Hasher::operator()(key); } - static size_t ALWAYS_INLINE operator()(const StringKey24 & key) { return StringKey24Hasher::operator()(key); } - static size_t ALWAYS_INLINE operator()(const StringRef & key) { return StringRefHasher::operator()(key); } +#if defined(__SSE4_2__) + static size_t ALWAYS_INLINE operator()(StringKey8 key) + { + size_t res = -1ULL; + res = _mm_crc32_u64(res, key); + return res; + } + static size_t ALWAYS_INLINE operator()(const StringKey16 & key) + { + size_t res = -1ULL; + res = _mm_crc32_u64(res, key.low); + res = _mm_crc32_u64(res, key.high); + return res; + } + static size_t ALWAYS_INLINE operator()(const StringKey24 & key) + { + size_t res = -1ULL; + res = _mm_crc32_u64(res, key.a); + res = _mm_crc32_u64(res, key.b); + res = _mm_crc32_u64(res, key.c); + return res; + } +#else + static size_t ALWAYS_INLINE operator()(StringKey8 key) + { + return CityHash_v1_0_2::CityHash64(reinterpret_cast(&key), 8); + } + static size_t ALWAYS_INLINE operator()(const StringKey16 & key) + { + return CityHash_v1_0_2::CityHash64(reinterpret_cast(&key), 16); + } + static size_t ALWAYS_INLINE 
operator()(const StringKey24 & key) + { + return CityHash_v1_0_2::CityHash64(reinterpret_cast(&key), 24); + } +#endif + static size_t ALWAYS_INLINE operator()(StringRef key){ return StringRefHash()(key); } }; template @@ -572,7 +612,7 @@ struct StringHashTableSubMapSelector<0, false, Data> template struct StringHashTableSubMapSelector<1, false, Data> { - using Hash = StringHashTableHash::StringKey8Hasher; + using Hash = StringHashTableHash; static typename Data::T1 & getSubMap(size_t, Data & data) { return data.m1; } }; @@ -580,7 +620,7 @@ struct StringHashTableSubMapSelector<1, false, Data> template struct StringHashTableSubMapSelector<2, false, Data> { - using Hash = StringHashTableHash::StringKey16Hasher; + using Hash = StringHashTableHash; static typename Data::T2 & getSubMap(size_t, Data & data) { return data.m2; } }; @@ -588,7 +628,7 @@ struct StringHashTableSubMapSelector<2, false, Data> template struct StringHashTableSubMapSelector<3, false, Data> { - using Hash = StringHashTableHash::StringKey24Hasher; + using Hash = StringHashTableHash; static typename Data::T3 & getSubMap(size_t, Data & data) { return data.m3; } }; @@ -596,7 +636,7 @@ struct StringHashTableSubMapSelector<3, false, Data> template struct StringHashTableSubMapSelector<4, false, Data> { - using Hash = StringHashTableHash::StringRefHasher; + using Hash = StringHashTableHash; static typename Data::Ts & getSubMap(size_t, Data & data) { return data.ms; } }; diff --git a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h index ac2ab483e46..403b8d3941c 100644 --- a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h @@ -296,7 +296,7 @@ struct StringHashTableSubMapSelector<0, true, Data> template struct StringHashTableSubMapSelector<1, true, Data> { - using Hash = StringHashTableHash::StringKey8Hasher; + using Hash = StringHashTableHash; static typename Data::Impl::T1 & getSubMap(size_t 
hashval, Data & data) { @@ -308,7 +308,7 @@ struct StringHashTableSubMapSelector<1, true, Data> template struct StringHashTableSubMapSelector<2, true, Data> { - using Hash = StringHashTableHash::StringKey16Hasher; + using Hash = StringHashTableHash; static typename Data::Impl::T2 & getSubMap(size_t hashval, Data & data) { @@ -320,7 +320,7 @@ struct StringHashTableSubMapSelector<2, true, Data> template struct StringHashTableSubMapSelector<3, true, Data> { - using Hash = StringHashTableHash::StringKey24Hasher; + using Hash = StringHashTableHash; static typename Data::Impl::T3 & getSubMap(size_t hashval, Data & data) { @@ -332,7 +332,7 @@ struct StringHashTableSubMapSelector<3, true, Data> template struct StringHashTableSubMapSelector<4, true, Data> { - using Hash = StringHashTableHash::StringRefHasher; + using Hash = StringHashTableHash; static typename Data::Impl::Ts & getSubMap(size_t hashval, Data & data) { diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index c6e78fb5618..eb68bc50ae9 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -77,27 +77,27 @@ using AggregatedDataWithoutKey = AggregateDataPtr; using AggregatedDataWithUInt8Key = FixedImplicitZeroHashMapWithCalculatedSize; using AggregatedDataWithUInt16Key = FixedImplicitZeroHashMap; -using AggregatedDataWithUInt32Key = HashMap>; -using AggregatedDataWithUInt64Key = HashMap>; +using AggregatedDataWithUInt32Key = HashMap>; +using AggregatedDataWithUInt64Key = HashMap>; using AggregatedDataWithShortStringKey = StringHashMap; using AggregatedDataWithStringKey = HashMapWithSavedHash; -using AggregatedDataWithInt256Key = HashMap>; +using AggregatedDataWithInt256Key = HashMap>; -using AggregatedDataWithKeys128 = HashMap>; -using AggregatedDataWithKeys256 = HashMap>; +using AggregatedDataWithKeys128 = HashMap>; +using AggregatedDataWithKeys256 = HashMap>; -using AggregatedDataWithUInt32KeyTwoLevel = TwoLevelHashMap>; -using 
AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithUInt32KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap>; -using AggregatedDataWithInt256KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithInt256KeyTwoLevel = TwoLevelHashMap>; using AggregatedDataWithShortStringKeyTwoLevel = TwoLevelStringHashMap; using AggregatedDataWithStringKeyTwoLevel = TwoLevelHashMapWithSavedHash; -using AggregatedDataWithKeys128TwoLevel = TwoLevelHashMap>; -using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap>; +using AggregatedDataWithKeys128TwoLevel = TwoLevelHashMap>; +using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap>; /** Variants with better hash function, using more than 32 bits for hash. * Using for merging phase of external aggregation, where number of keys may be far greater than 4 billion, From ea85d19ff8dc4e97abbbf71bb05a424632b17684 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Tue, 3 Dec 2024 17:10:09 +0800 Subject: [PATCH 17/24] debug low distinct value Signed-off-by: guo-shaoge --- dbms/src/Common/ColumnsHashingImpl.h | 121 ++++++++++++++++++++- dbms/src/Interpreters/Aggregator.cpp | 154 ++++++++++++++++++++------- dbms/src/Interpreters/Aggregator.h | 8 ++ 3 files changed, 244 insertions(+), 39 deletions(-) diff --git a/dbms/src/Common/ColumnsHashingImpl.h b/dbms/src/Common/ColumnsHashingImpl.h index b5b61fb8630..f5cc03d82c8 100644 --- a/dbms/src/Common/ColumnsHashingImpl.h +++ b/dbms/src/Common/ColumnsHashingImpl.h @@ -138,13 +138,35 @@ class HashMethodBase map.prefetch(hashvals[prefetch_idx]); } + template + ALWAYS_INLINE inline EmplaceResult emplaceKey( + Data & data, + size_t row, + Arena & pool, + std::vector & sort_key_containers) + { + auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); + return emplaceImpl(key_holder, data); + } + + template + ALWAYS_INLINE inline FindResult findKey( + Data & data, + size_t row, + Arena & pool, + std::vector 
& sort_key_containers) + { + auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); + return findKeyImpl(keyHolderGetKey(key_holder), data, 0); + } + template ALWAYS_INLINE inline EmplaceResult emplaceKey( Data & data, size_t row, Arena & pool, std::vector & sort_key_containers, - const std::vector & hashvals = {}) + const std::vector & hashvals) { auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); if constexpr (enable_prefetch) @@ -165,7 +187,7 @@ class HashMethodBase size_t row, Arena & pool, std::vector & sort_key_containers, - const std::vector & hashvals = {}) + const std::vector & hashvals) { auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); if constexpr (enable_prefetch) @@ -247,6 +269,60 @@ class HashMethodBase } } + template + ALWAYS_INLINE inline EmplaceResult emplaceImpl(KeyHolder & key_holder, Data & data) + { + if constexpr (Cache::consecutive_keys_optimization) + { + if (cache.found && cache.check(keyHolderGetKey(key_holder))) + { + if constexpr (has_mapped) + return EmplaceResult(cache.value.second, cache.value.second, false); + else + return EmplaceResult(false); + } + } + + typename Data::LookupResult it; + bool inserted = false; + + data.emplace(key_holder, it, inserted); + + [[maybe_unused]] Mapped * cached = nullptr; + if constexpr (has_mapped) + cached = &it->getMapped(); + + if (inserted) + { + if constexpr (has_mapped) + { + new (&it->getMapped()) Mapped(); + } + } + + if constexpr (consecutive_keys_optimization) + { + cache.found = true; + cache.empty = false; + + if constexpr (has_mapped) + { + cache.value.first = it->getKey(); + cache.value.second = it->getMapped(); + cached = &cache.value.second; + } + else + { + cache.value = it->getKey(); + } + } + + if constexpr (has_mapped) + return EmplaceResult(it->getMapped(), *cached, inserted); + else + return EmplaceResult(inserted); + } + template ALWAYS_INLINE inline EmplaceResult 
emplaceImpl(KeyHolder & key_holder, Data & data, size_t hashval) { @@ -304,6 +380,47 @@ class HashMethodBase return EmplaceResult(inserted); } + template + ALWAYS_INLINE inline FindResult findKeyImpl(Key & key, Data & data) + { + if constexpr (Cache::consecutive_keys_optimization) + { + if (cache.check(key)) + { + if constexpr (has_mapped) + return FindResult(&cache.value.second, cache.found); + else + return FindResult(cache.found); + } + } + + typename Data::LookupResult it; + it = data.find(key); + + if constexpr (consecutive_keys_optimization) + { + cache.found = it != nullptr; + cache.empty = false; + + if constexpr (has_mapped) + { + cache.value.first = key; + if (it) + { + cache.value.second = it->getMapped(); + } + } + else + { + cache.value = key; + } + } + + if constexpr (has_mapped) + return FindResult(it ? &it->getMapped() : nullptr, it != nullptr); + else + return FindResult(it != nullptr); + } template ALWAYS_INLINE inline FindResult findKeyImpl(Key & key, Data & data, size_t hashval) { diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index e5e214c7791..3368a3e9bfe 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -741,6 +741,27 @@ std::optional::Res } } +template +std::optional::ResultType> Aggregator::emplaceOrFindKey( + Method & method, + typename Method::State & state, + size_t index, + Arena & aggregates_pool, + std::vector & sort_key_containers) const +{ + try + { + if constexpr (only_lookup) + return state.findKey(method.data, index, aggregates_pool, sort_key_containers); + else + return state.emplaceKey(method.data, index, aggregates_pool, sort_key_containers); + } + catch (ResizeException &) + { + return {}; + } +} + // This is only used by executeImplBatchStringHashMap. // It will choose specifix submap of StringHashMap then do emplace/find. // StringKeyType can be StringRef/StringKey8/StringKey16/StringKey24/ArenaKeyHolder. 
@@ -937,9 +958,9 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( /// Generic case. std::unique_ptr places(new AggregateDataPtr[rows]); std::optional processed_rows; - std::vector hashvals; if constexpr (enable_prefetch) { + std::vector hashvals; hashvals = getHashVals( agg_process_info.start_row, agg_process_info.end_row, @@ -947,64 +968,123 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( state, sort_key_containers, aggregates_pool); - } - - for (size_t i = agg_process_info.start_row; i < agg_process_info.start_row + rows; ++i) - { - AggregateDataPtr aggregate_data = nullptr; - auto emplace_result_holder = emplaceOrFindKey( - method, - state, - i, - *aggregates_pool, - sort_key_containers, - hashvals); - if unlikely (!emplace_result_holder.has_value()) + for (size_t i = agg_process_info.start_row; i < agg_process_info.start_row + rows; ++i) { - LOG_INFO(log, "HashTable resize throw ResizeException since the data is already marked for spill"); - break; - } + AggregateDataPtr aggregate_data = nullptr; - auto & emplace_result = emplace_result_holder.value(); + auto emplace_result_holder = emplaceOrFindKey( + method, + state, + i, + *aggregates_pool, + sort_key_containers, + hashvals); + if unlikely (!emplace_result_holder.has_value()) + { + LOG_INFO(log, "HashTable resize throw ResizeException since the data is already marked for spill"); + break; + } - if constexpr (only_lookup) - { - if (emplace_result.isFound()) + auto & emplace_result = emplace_result_holder.value(); + + if constexpr (only_lookup) { - aggregate_data = emplace_result.getMapped(); + if (emplace_result.isFound()) + { + aggregate_data = emplace_result.getMapped(); + } + else + { + agg_process_info.not_found_rows.push_back(i); + } } else { - agg_process_info.not_found_rows.push_back(i); + /// If a new key is inserted, initialize the states of the aggregate functions, and possibly something related to the key. 
+ if (emplace_result.isInserted()) + { + /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. + emplace_result.setMapped(nullptr); + + aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(aggregate_data); + + emplace_result.setMapped(aggregate_data); + } + else + { + aggregate_data = emplace_result.getMapped(); + + if constexpr (collect_hit_rate) + ++agg_process_info.hit_row_cnt; + } } + + places[i - agg_process_info.start_row] = aggregate_data; + processed_rows = i; } - else + } + else + { + LOG_DEBUG(log, "gjt debug original path"); + for (size_t i = agg_process_info.start_row; i < agg_process_info.start_row + rows; ++i) { - /// If a new key is inserted, initialize the states of the aggregate functions, and possibly something related to the key. - if (emplace_result.isInserted()) + AggregateDataPtr aggregate_data = nullptr; + + auto emplace_result_holder = emplaceOrFindKey( + method, + state, + i, + *aggregates_pool, + sort_key_containers); + if unlikely (!emplace_result_holder.has_value()) { - /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. 
- emplace_result.setMapped(nullptr); + LOG_INFO(log, "HashTable resize throw ResizeException since the data is already marked for spill"); + break; + } - aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(aggregate_data); + auto & emplace_result = emplace_result_holder.value(); - emplace_result.setMapped(aggregate_data); + if constexpr (only_lookup) + { + if (emplace_result.isFound()) + { + aggregate_data = emplace_result.getMapped(); + } + else + { + agg_process_info.not_found_rows.push_back(i); + } } else { - aggregate_data = emplace_result.getMapped(); + /// If a new key is inserted, initialize the states of the aggregate functions, and possibly something related to the key. + if (emplace_result.isInserted()) + { + /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. + emplace_result.setMapped(nullptr); - if constexpr (collect_hit_rate) - ++agg_process_info.hit_row_cnt; + aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); + createAggregateStates(aggregate_data); + + emplace_result.setMapped(aggregate_data); + } + else + { + aggregate_data = emplace_result.getMapped(); + + if constexpr (collect_hit_rate) + ++agg_process_info.hit_row_cnt; + } } - } - places[i - agg_process_info.start_row] = aggregate_data; - processed_rows = i; + places[i - agg_process_info.start_row] = aggregate_data; + processed_rows = i; + } } + if (processed_rows) { /// Add values to the aggregate functions. 
diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index eb68bc50ae9..729ba863130 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -1499,6 +1499,14 @@ class Aggregator std::vector & sort_key_containers, const std::vector & hashvals) const; + template + std::optional::ResultType> emplaceOrFindKey( + Method & method, + typename Method::State & state, + size_t index, + Arena & aggregates_pool, + std::vector & sort_key_containers) const; + template < size_t SubMapIndex, bool collect_hit_rate, From 16937ff2cdd5e3fb7cde826887c0bfd252940bf3 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Tue, 3 Dec 2024 17:36:18 +0800 Subject: [PATCH 18/24] Revert "revert new hasher" This reverts commit 3e30f9561e38d09c39d1b0dc73735b6dcffb7991. --- dbms/src/Common/HashTable/StringHashTable.h | 66 ++++--------------- .../HashTable/TwoLevelStringHashTable.h | 8 +-- dbms/src/Interpreters/Aggregator.h | 20 +++--- 3 files changed, 27 insertions(+), 67 deletions(-) diff --git a/dbms/src/Common/HashTable/StringHashTable.h b/dbms/src/Common/HashTable/StringHashTable.h index 322523388cc..a43f35fdbbf 100644 --- a/dbms/src/Common/HashTable/StringHashTable.h +++ b/dbms/src/Common/HashTable/StringHashTable.h @@ -67,57 +67,17 @@ struct HashWithMixSeed } }; -// struct StringHashTableHash -// { -// using StringKey8Hasher = HashWithMixSeed; -// using StringKey16Hasher = HashWithMixSeed; -// using StringKey24Hasher = HashWithMixSeed; -// using StringRefHasher = StringRefHash; -// -// static size_t ALWAYS_INLINE operator()(StringKey8 key) { return StringKey8Hasher::operator()(key); } -// static size_t ALWAYS_INLINE operator()(const StringKey16 & key) { return StringKey16Hasher::operator()(key); } -// static size_t ALWAYS_INLINE operator()(const StringKey24 & key) { return StringKey24Hasher::operator()(key); } -// static size_t ALWAYS_INLINE operator()(const StringRef & key) { return StringRefHasher::operator()(key); } -// }; struct 
StringHashTableHash { -#if defined(__SSE4_2__) - static size_t ALWAYS_INLINE operator()(StringKey8 key) - { - size_t res = -1ULL; - res = _mm_crc32_u64(res, key); - return res; - } - static size_t ALWAYS_INLINE operator()(const StringKey16 & key) - { - size_t res = -1ULL; - res = _mm_crc32_u64(res, key.low); - res = _mm_crc32_u64(res, key.high); - return res; - } - static size_t ALWAYS_INLINE operator()(const StringKey24 & key) - { - size_t res = -1ULL; - res = _mm_crc32_u64(res, key.a); - res = _mm_crc32_u64(res, key.b); - res = _mm_crc32_u64(res, key.c); - return res; - } -#else - static size_t ALWAYS_INLINE operator()(StringKey8 key) - { - return CityHash_v1_0_2::CityHash64(reinterpret_cast(&key), 8); - } - static size_t ALWAYS_INLINE operator()(const StringKey16 & key) - { - return CityHash_v1_0_2::CityHash64(reinterpret_cast(&key), 16); - } - static size_t ALWAYS_INLINE operator()(const StringKey24 & key) - { - return CityHash_v1_0_2::CityHash64(reinterpret_cast(&key), 24); - } -#endif - static size_t ALWAYS_INLINE operator()(StringRef key){ return StringRefHash()(key); } + using StringKey8Hasher = HashWithMixSeed; + using StringKey16Hasher = HashWithMixSeed; + using StringKey24Hasher = HashWithMixSeed; + using StringRefHasher = StringRefHash; + + static size_t ALWAYS_INLINE operator()(StringKey8 key) { return StringKey8Hasher::operator()(key); } + static size_t ALWAYS_INLINE operator()(const StringKey16 & key) { return StringKey16Hasher::operator()(key); } + static size_t ALWAYS_INLINE operator()(const StringKey24 & key) { return StringKey24Hasher::operator()(key); } + static size_t ALWAYS_INLINE operator()(const StringRef & key) { return StringRefHasher::operator()(key); } }; template @@ -612,7 +572,7 @@ struct StringHashTableSubMapSelector<0, false, Data> template struct StringHashTableSubMapSelector<1, false, Data> { - using Hash = StringHashTableHash; + using Hash = StringHashTableHash::StringKey8Hasher; static typename Data::T1 & getSubMap(size_t, Data & 
data) { return data.m1; } }; @@ -620,7 +580,7 @@ struct StringHashTableSubMapSelector<1, false, Data> template struct StringHashTableSubMapSelector<2, false, Data> { - using Hash = StringHashTableHash; + using Hash = StringHashTableHash::StringKey16Hasher; static typename Data::T2 & getSubMap(size_t, Data & data) { return data.m2; } }; @@ -628,7 +588,7 @@ struct StringHashTableSubMapSelector<2, false, Data> template struct StringHashTableSubMapSelector<3, false, Data> { - using Hash = StringHashTableHash; + using Hash = StringHashTableHash::StringKey24Hasher; static typename Data::T3 & getSubMap(size_t, Data & data) { return data.m3; } }; @@ -636,7 +596,7 @@ struct StringHashTableSubMapSelector<3, false, Data> template struct StringHashTableSubMapSelector<4, false, Data> { - using Hash = StringHashTableHash; + using Hash = StringHashTableHash::StringRefHasher; static typename Data::Ts & getSubMap(size_t, Data & data) { return data.ms; } }; diff --git a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h index 403b8d3941c..ac2ab483e46 100644 --- a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h @@ -296,7 +296,7 @@ struct StringHashTableSubMapSelector<0, true, Data> template struct StringHashTableSubMapSelector<1, true, Data> { - using Hash = StringHashTableHash; + using Hash = StringHashTableHash::StringKey8Hasher; static typename Data::Impl::T1 & getSubMap(size_t hashval, Data & data) { @@ -308,7 +308,7 @@ struct StringHashTableSubMapSelector<1, true, Data> template struct StringHashTableSubMapSelector<2, true, Data> { - using Hash = StringHashTableHash; + using Hash = StringHashTableHash::StringKey16Hasher; static typename Data::Impl::T2 & getSubMap(size_t hashval, Data & data) { @@ -320,7 +320,7 @@ struct StringHashTableSubMapSelector<2, true, Data> template struct StringHashTableSubMapSelector<3, true, Data> { - using Hash = StringHashTableHash; + 
using Hash = StringHashTableHash::StringKey24Hasher; static typename Data::Impl::T3 & getSubMap(size_t hashval, Data & data) { @@ -332,7 +332,7 @@ struct StringHashTableSubMapSelector<3, true, Data> template struct StringHashTableSubMapSelector<4, true, Data> { - using Hash = StringHashTableHash; + using Hash = StringHashTableHash::StringRefHasher; static typename Data::Impl::Ts & getSubMap(size_t hashval, Data & data) { diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index 729ba863130..e3666b3d187 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -77,27 +77,27 @@ using AggregatedDataWithoutKey = AggregateDataPtr; using AggregatedDataWithUInt8Key = FixedImplicitZeroHashMapWithCalculatedSize; using AggregatedDataWithUInt16Key = FixedImplicitZeroHashMap; -using AggregatedDataWithUInt32Key = HashMap>; -using AggregatedDataWithUInt64Key = HashMap>; +using AggregatedDataWithUInt32Key = HashMap>; +using AggregatedDataWithUInt64Key = HashMap>; using AggregatedDataWithShortStringKey = StringHashMap; using AggregatedDataWithStringKey = HashMapWithSavedHash; -using AggregatedDataWithInt256Key = HashMap>; +using AggregatedDataWithInt256Key = HashMap>; -using AggregatedDataWithKeys128 = HashMap>; -using AggregatedDataWithKeys256 = HashMap>; +using AggregatedDataWithKeys128 = HashMap>; +using AggregatedDataWithKeys256 = HashMap>; -using AggregatedDataWithUInt32KeyTwoLevel = TwoLevelHashMap>; -using AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithUInt32KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap>; -using AggregatedDataWithInt256KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithInt256KeyTwoLevel = TwoLevelHashMap>; using AggregatedDataWithShortStringKeyTwoLevel = TwoLevelStringHashMap; using AggregatedDataWithStringKeyTwoLevel = TwoLevelHashMapWithSavedHash; -using AggregatedDataWithKeys128TwoLevel = 
TwoLevelHashMap>; -using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap>; +using AggregatedDataWithKeys128TwoLevel = TwoLevelHashMap>; +using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap>; /** Variants with better hash function, using more than 32 bits for hash. * Using for merging phase of external aggregation, where number of keys may be far greater than 4 billion, From d2fba576ce82eaa6cd97620f1d4f17a173edc1d3 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Tue, 3 Dec 2024 20:07:18 +0800 Subject: [PATCH 19/24] refine original code path Signed-off-by: guo-shaoge --- dbms/src/Common/ColumnsHashingImpl.h | 302 ++++++++++----------------- dbms/src/Interpreters/Aggregator.cpp | 206 +++++++----------- dbms/src/Interpreters/Aggregator.h | 2 +- 3 files changed, 192 insertions(+), 318 deletions(-) diff --git a/dbms/src/Common/ColumnsHashingImpl.h b/dbms/src/Common/ColumnsHashingImpl.h index f5cc03d82c8..3c4fd601487 100644 --- a/dbms/src/Common/ColumnsHashingImpl.h +++ b/dbms/src/Common/ColumnsHashingImpl.h @@ -157,7 +157,7 @@ class HashMethodBase std::vector & sort_key_containers) { auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); - return findKeyImpl(keyHolderGetKey(key_holder), data, 0); + return findKeyImpl(keyHolderGetKey(key_holder), data); } template @@ -173,11 +173,11 @@ class HashMethodBase { assert(hashvals.size() == static_cast(*this).total_rows); prefetch(data, row, hashvals); - return emplaceImpl(key_holder, data, hashvals[row]); + return emplaceImpl(key_holder, data, hashvals[row]); } else { - return emplaceImpl(key_holder, data, 0); + return emplaceImpl(key_holder, data); } } @@ -194,11 +194,11 @@ class HashMethodBase { assert(hashvals.size() == static_cast(*this).total_rows); prefetch(data, row, hashvals); - return findKeyImpl(keyHolderGetKey(key_holder), data, hashvals[row]); + return findKeyImpl(keyHolderGetKey(key_holder), data, hashvals[row]); } else { - return findKeyImpl(keyHolderGetKey(key_holder), data, 
0); + return findKeyImpl(keyHolderGetKey(key_holder), data); } } @@ -219,7 +219,7 @@ class HashMethodBase if constexpr (enable_prefetch) prefetch(submap, idx, hashvals); - return emplaceImpl(datas[idx], submap, hashvals[idx]); + return emplaceImpl(datas[idx], submap, hashvals[idx]); } template @@ -237,7 +237,7 @@ class HashMethodBase if constexpr (enable_prefetch) prefetch(submap, idx, hashvals); - return findKeyImpl(keyHolderGetKey(datas[idx]), submap, hashvals[idx]); + return findKeyImpl(keyHolderGetKey(datas[idx]), submap, hashvals[idx]); } template @@ -269,202 +269,128 @@ class HashMethodBase } } +#define DEFINE_EMPLACE_IMPL_BEGIN \ + if constexpr (Cache::consecutive_keys_optimization) \ + { \ + if (cache.found && cache.check(keyHolderGetKey(key_holder))) \ + { \ + if constexpr (has_mapped) \ + return EmplaceResult(cache.value.second, cache.value.second, false); \ + else \ + return EmplaceResult(false); \ + } \ + } \ + typename Data::LookupResult it; \ + bool inserted = false; + +#define DEFINE_EMPLACE_IMPL_END \ + [[maybe_unused]] Mapped * cached = nullptr; \ + if constexpr (has_mapped) \ + cached = &it->getMapped(); \ + \ + if (inserted) \ + { \ + if constexpr (has_mapped) \ + { \ + new (&it->getMapped()) Mapped(); \ + } \ + } \ + \ + if constexpr (consecutive_keys_optimization) \ + { \ + cache.found = true; \ + cache.empty = false; \ + \ + if constexpr (has_mapped) \ + { \ + cache.value.first = it->getKey(); \ + cache.value.second = it->getMapped(); \ + cached = &cache.value.second; \ + } \ + else \ + { \ + cache.value = it->getKey(); \ + } \ + } \ + \ + if constexpr (has_mapped) \ + return EmplaceResult(it->getMapped(), *cached, inserted); \ + else \ + return EmplaceResult(inserted); + template - ALWAYS_INLINE inline EmplaceResult emplaceImpl(KeyHolder & key_holder, Data & data) + ALWAYS_INLINE inline EmplaceResult emplaceImpl(KeyHolder & key_holder, Data & data, size_t hashval) { - if constexpr (Cache::consecutive_keys_optimization) - { - if (cache.found 
&& cache.check(keyHolderGetKey(key_holder))) - { - if constexpr (has_mapped) - return EmplaceResult(cache.value.second, cache.value.second, false); - else - return EmplaceResult(false); - } - } - - typename Data::LookupResult it; - bool inserted = false; - - data.emplace(key_holder, it, inserted); - - [[maybe_unused]] Mapped * cached = nullptr; - if constexpr (has_mapped) - cached = &it->getMapped(); - - if (inserted) - { - if constexpr (has_mapped) - { - new (&it->getMapped()) Mapped(); - } - } - - if constexpr (consecutive_keys_optimization) - { - cache.found = true; - cache.empty = false; - - if constexpr (has_mapped) - { - cache.value.first = it->getKey(); - cache.value.second = it->getMapped(); - cached = &cache.value.second; - } - else - { - cache.value = it->getKey(); - } - } - - if constexpr (has_mapped) - return EmplaceResult(it->getMapped(), *cached, inserted); - else - return EmplaceResult(inserted); + DEFINE_EMPLACE_IMPL_BEGIN + data.emplace(key_holder, it, inserted, hashval); + DEFINE_EMPLACE_IMPL_END } - template - ALWAYS_INLINE inline EmplaceResult emplaceImpl(KeyHolder & key_holder, Data & data, size_t hashval) + template + ALWAYS_INLINE inline EmplaceResult emplaceImpl(KeyHolder & key_holder, Data & data) { - if constexpr (Cache::consecutive_keys_optimization) - { - if (cache.found && cache.check(keyHolderGetKey(key_holder))) - { - if constexpr (has_mapped) - return EmplaceResult(cache.value.second, cache.value.second, false); - else - return EmplaceResult(false); - } - } - - typename Data::LookupResult it; - bool inserted = false; - - if constexpr (use_hashval) - data.emplace(key_holder, it, inserted, hashval); - else - data.emplace(key_holder, it, inserted); - - [[maybe_unused]] Mapped * cached = nullptr; - if constexpr (has_mapped) - cached = &it->getMapped(); - - if (inserted) - { - if constexpr (has_mapped) - { - new (&it->getMapped()) Mapped(); - } - } - - if constexpr (consecutive_keys_optimization) - { - cache.found = true; - cache.empty = 
false; - - if constexpr (has_mapped) - { - cache.value.first = it->getKey(); - cache.value.second = it->getMapped(); - cached = &cache.value.second; - } - else - { - cache.value = it->getKey(); - } - } - - if constexpr (has_mapped) - return EmplaceResult(it->getMapped(), *cached, inserted); - else - return EmplaceResult(inserted); + DEFINE_EMPLACE_IMPL_BEGIN + data.emplace(key_holder, it, inserted); + DEFINE_EMPLACE_IMPL_END } +#undef DEFINE_EMPLACE_IMPL_BEGIN +#undef DEFINE_EMPLACE_IMPL_END + +#define DEFINE_FIND_IMPL_BEGIN \ + if constexpr (Cache::consecutive_keys_optimization) \ + { \ + if (cache.check(key)) \ + { \ + if constexpr (has_mapped) \ + return FindResult(&cache.value.second, cache.found); \ + else \ + return FindResult(cache.found); \ + } \ + } \ + typename Data::LookupResult it; + +#define DEFINE_FIND_IMPL_END \ + if constexpr (consecutive_keys_optimization) \ + { \ + cache.found = it != nullptr; \ + cache.empty = false; \ + \ + if constexpr (has_mapped) \ + { \ + cache.value.first = key; \ + if (it) \ + { \ + cache.value.second = it->getMapped(); \ + } \ + } \ + else \ + { \ + cache.value = key; \ + } \ + } \ + \ + if constexpr (has_mapped) \ + return FindResult(it ? &it->getMapped() : nullptr, it != nullptr); \ + else \ + return FindResult(it != nullptr); template ALWAYS_INLINE inline FindResult findKeyImpl(Key & key, Data & data) { - if constexpr (Cache::consecutive_keys_optimization) - { - if (cache.check(key)) - { - if constexpr (has_mapped) - return FindResult(&cache.value.second, cache.found); - else - return FindResult(cache.found); - } - } - - typename Data::LookupResult it; + DEFINE_FIND_IMPL_BEGIN it = data.find(key); - - if constexpr (consecutive_keys_optimization) - { - cache.found = it != nullptr; - cache.empty = false; - - if constexpr (has_mapped) - { - cache.value.first = key; - if (it) - { - cache.value.second = it->getMapped(); - } - } - else - { - cache.value = key; - } - } - - if constexpr (has_mapped) - return FindResult(it ? 
&it->getMapped() : nullptr, it != nullptr); - else - return FindResult(it != nullptr); + DEFINE_FIND_IMPL_END } - template + + template ALWAYS_INLINE inline FindResult findKeyImpl(Key & key, Data & data, size_t hashval) { - if constexpr (Cache::consecutive_keys_optimization) - { - if (cache.check(key)) - { - if constexpr (has_mapped) - return FindResult(&cache.value.second, cache.found); - else - return FindResult(cache.found); - } - } - - typename Data::LookupResult it; - if constexpr (use_hashval) - it = data.find(key, hashval); - else - it = data.find(key); - - if constexpr (consecutive_keys_optimization) - { - cache.found = it != nullptr; - cache.empty = false; - - if constexpr (has_mapped) - { - cache.value.first = key; - if (it) - { - cache.value.second = it->getMapped(); - } - } - else - { - cache.value = key; - } - } - - if constexpr (has_mapped) - return FindResult(it ? &it->getMapped() : nullptr, it != nullptr); - else - return FindResult(it != nullptr); + DEFINE_FIND_IMPL_BEGIN + it = data.find(key, hashval); + DEFINE_FIND_IMPL_END } +#undef DEFINE_FIND_IMPL_BEGIN +#undef DEFINE_FIND_IMPL_END }; diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 3368a3e9bfe..8e12e7383ab 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -681,7 +681,7 @@ void NO_INLINE Aggregator::executeImpl( // aggregates_pool, // agg_process_info); // else - executeImplBatch(method, state, aggregates_pool, agg_process_info); + executeImplBatch(method, state, aggregates_pool, agg_process_info); } else { @@ -713,7 +713,7 @@ std::vector getHashVals( return hashvals; } -template +template std::optional::ResultType> Aggregator::emplaceOrFindKey( Method & method, typename Method::State & state, @@ -725,10 +725,14 @@ std::optional::Res try { if constexpr (only_lookup) - return state - .template findKey(method.data, index, aggregates_pool, sort_key_containers, hashvals); + return state.template findKey( + 
method.data, + index, + aggregates_pool, + sort_key_containers, + hashvals); else - return state.template emplaceKey( + return state.template emplaceKey( method.data, index, aggregates_pool, @@ -875,26 +879,16 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( /// For all rows. AggregateDataPtr place = aggregates_pool->alloc(0); std::vector hashvals; - if constexpr (enable_prefetch) - { - hashvals = getHashVals( - agg_process_info.start_row, - agg_process_info.end_row, - method.data, - state, - sort_key_containers, - aggregates_pool); - } for (size_t i = 0; i < rows; ++i) { - auto emplace_result_hold = emplaceOrFindKey( + // TODO prefetch + auto emplace_result_hold = emplaceOrFindKey( method, state, agg_process_info.start_row, *aggregates_pool, - sort_key_containers, - hashvals); + sort_key_containers); if likely (emplace_result_hold.has_value()) { if constexpr (collect_hit_rate) @@ -958,6 +952,56 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( /// Generic case. std::unique_ptr places(new AggregateDataPtr[rows]); std::optional processed_rows; + +#define WRAP_EMPLACE_AGG_KEY_BEGIN \ + for (size_t i = agg_process_info.start_row; i < agg_process_info.start_row + rows; ++i) \ + { \ + AggregateDataPtr aggregate_data = nullptr; + +#define WRAP_EMPLACE_AGG_KEY_END \ + if unlikely (!emplace_result_holder.has_value()) \ + { \ + LOG_INFO(log, "HashTable resize throw ResizeException since the data is already marked for spill"); \ + break; \ + } \ + \ + auto & emplace_result = emplace_result_holder.value(); \ + \ + if constexpr (only_lookup) \ + { \ + if (emplace_result.isFound()) \ + { \ + aggregate_data = emplace_result.getMapped(); \ + } \ + else \ + { \ + agg_process_info.not_found_rows.push_back(i); \ + } \ + } \ + else \ + { \ + if (emplace_result.isInserted()) \ + { \ + emplace_result.setMapped(nullptr); \ + \ + aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); \ + createAggregateStates(aggregate_data); \ + \ + 
emplace_result.setMapped(aggregate_data); \ + } \ + else \ + { \ + aggregate_data = emplace_result.getMapped(); \ + \ + if constexpr (collect_hit_rate) \ + ++agg_process_info.hit_row_cnt; \ + } \ + } \ + \ + places[i - agg_process_info.start_row] = aggregate_data; \ + processed_rows = i; \ + } + if constexpr (enable_prefetch) { std::vector hashvals; @@ -969,121 +1013,25 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( sort_key_containers, aggregates_pool); - for (size_t i = agg_process_info.start_row; i < agg_process_info.start_row + rows; ++i) - { - AggregateDataPtr aggregate_data = nullptr; - - auto emplace_result_holder = emplaceOrFindKey( - method, - state, - i, - *aggregates_pool, - sort_key_containers, - hashvals); - if unlikely (!emplace_result_holder.has_value()) - { - LOG_INFO(log, "HashTable resize throw ResizeException since the data is already marked for spill"); - break; - } - - auto & emplace_result = emplace_result_holder.value(); - - if constexpr (only_lookup) - { - if (emplace_result.isFound()) - { - aggregate_data = emplace_result.getMapped(); - } - else - { - agg_process_info.not_found_rows.push_back(i); - } - } - else - { - /// If a new key is inserted, initialize the states of the aggregate functions, and possibly something related to the key. - if (emplace_result.isInserted()) - { - /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. 
- emplace_result.setMapped(nullptr); - - aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(aggregate_data); - - emplace_result.setMapped(aggregate_data); - } - else - { - aggregate_data = emplace_result.getMapped(); - - if constexpr (collect_hit_rate) - ++agg_process_info.hit_row_cnt; - } - } - - places[i - agg_process_info.start_row] = aggregate_data; - processed_rows = i; - } + WRAP_EMPLACE_AGG_KEY_BEGIN + auto emplace_result_holder = emplaceOrFindKey( + method, + state, + i, + *aggregates_pool, + sort_key_containers, + hashvals); + WRAP_EMPLACE_AGG_KEY_END } else { - LOG_DEBUG(log, "gjt debug original path"); - for (size_t i = agg_process_info.start_row; i < agg_process_info.start_row + rows; ++i) - { - AggregateDataPtr aggregate_data = nullptr; - - auto emplace_result_holder = emplaceOrFindKey( - method, - state, - i, - *aggregates_pool, - sort_key_containers); - if unlikely (!emplace_result_holder.has_value()) - { - LOG_INFO(log, "HashTable resize throw ResizeException since the data is already marked for spill"); - break; - } - - auto & emplace_result = emplace_result_holder.value(); - - if constexpr (only_lookup) - { - if (emplace_result.isFound()) - { - aggregate_data = emplace_result.getMapped(); - } - else - { - agg_process_info.not_found_rows.push_back(i); - } - } - else - { - /// If a new key is inserted, initialize the states of the aggregate functions, and possibly something related to the key. - if (emplace_result.isInserted()) - { - /// exception-safety - if you can not allocate memory or create states, then destructors will not be called. 
- emplace_result.setMapped(nullptr); - - aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); - createAggregateStates(aggregate_data); - - emplace_result.setMapped(aggregate_data); - } - else - { - aggregate_data = emplace_result.getMapped(); - - if constexpr (collect_hit_rate) - ++agg_process_info.hit_row_cnt; - } - } - - places[i - agg_process_info.start_row] = aggregate_data; - processed_rows = i; - } + WRAP_EMPLACE_AGG_KEY_BEGIN + auto emplace_result_holder + = emplaceOrFindKey(method, state, i, *aggregates_pool, sort_key_containers); + WRAP_EMPLACE_AGG_KEY_END } - +#undef WRAP_EMPLACE_AGG_KEY_BEGIN +#undef WRAP_EMPLACE_AGG_KEY_END if (processed_rows) { diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index e3666b3d187..eb6dfed68f9 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -1490,7 +1490,7 @@ class Aggregator Arena * aggregates_pool, AggProcessInfo & agg_process_info) const; - template + template std::optional::ResultType> emplaceOrFindKey( Method & method, typename Method::State & state, From 71b6ecd5fa3c664a32a47513683dd90c96c3c54e Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Tue, 3 Dec 2024 20:58:50 +0800 Subject: [PATCH 20/24] Reapply "revert new hasher" This reverts commit 16937ff2cdd5e3fb7cde826887c0bfd252940bf3. 
--- dbms/src/Common/HashTable/StringHashTable.h | 66 +++++++++++++++---- .../HashTable/TwoLevelStringHashTable.h | 8 +-- dbms/src/Interpreters/Aggregator.h | 20 +++--- 3 files changed, 67 insertions(+), 27 deletions(-) diff --git a/dbms/src/Common/HashTable/StringHashTable.h b/dbms/src/Common/HashTable/StringHashTable.h index a43f35fdbbf..322523388cc 100644 --- a/dbms/src/Common/HashTable/StringHashTable.h +++ b/dbms/src/Common/HashTable/StringHashTable.h @@ -67,17 +67,57 @@ struct HashWithMixSeed } }; +// struct StringHashTableHash +// { +// using StringKey8Hasher = HashWithMixSeed; +// using StringKey16Hasher = HashWithMixSeed; +// using StringKey24Hasher = HashWithMixSeed; +// using StringRefHasher = StringRefHash; +// +// static size_t ALWAYS_INLINE operator()(StringKey8 key) { return StringKey8Hasher::operator()(key); } +// static size_t ALWAYS_INLINE operator()(const StringKey16 & key) { return StringKey16Hasher::operator()(key); } +// static size_t ALWAYS_INLINE operator()(const StringKey24 & key) { return StringKey24Hasher::operator()(key); } +// static size_t ALWAYS_INLINE operator()(const StringRef & key) { return StringRefHasher::operator()(key); } +// }; struct StringHashTableHash { - using StringKey8Hasher = HashWithMixSeed; - using StringKey16Hasher = HashWithMixSeed; - using StringKey24Hasher = HashWithMixSeed; - using StringRefHasher = StringRefHash; - - static size_t ALWAYS_INLINE operator()(StringKey8 key) { return StringKey8Hasher::operator()(key); } - static size_t ALWAYS_INLINE operator()(const StringKey16 & key) { return StringKey16Hasher::operator()(key); } - static size_t ALWAYS_INLINE operator()(const StringKey24 & key) { return StringKey24Hasher::operator()(key); } - static size_t ALWAYS_INLINE operator()(const StringRef & key) { return StringRefHasher::operator()(key); } +#if defined(__SSE4_2__) + static size_t ALWAYS_INLINE operator()(StringKey8 key) + { + size_t res = -1ULL; + res = _mm_crc32_u64(res, key); + return res; + } + static 
size_t ALWAYS_INLINE operator()(const StringKey16 & key) + { + size_t res = -1ULL; + res = _mm_crc32_u64(res, key.low); + res = _mm_crc32_u64(res, key.high); + return res; + } + static size_t ALWAYS_INLINE operator()(const StringKey24 & key) + { + size_t res = -1ULL; + res = _mm_crc32_u64(res, key.a); + res = _mm_crc32_u64(res, key.b); + res = _mm_crc32_u64(res, key.c); + return res; + } +#else + static size_t ALWAYS_INLINE operator()(StringKey8 key) + { + return CityHash_v1_0_2::CityHash64(reinterpret_cast(&key), 8); + } + static size_t ALWAYS_INLINE operator()(const StringKey16 & key) + { + return CityHash_v1_0_2::CityHash64(reinterpret_cast(&key), 16); + } + static size_t ALWAYS_INLINE operator()(const StringKey24 & key) + { + return CityHash_v1_0_2::CityHash64(reinterpret_cast(&key), 24); + } +#endif + static size_t ALWAYS_INLINE operator()(StringRef key){ return StringRefHash()(key); } }; template @@ -572,7 +612,7 @@ struct StringHashTableSubMapSelector<0, false, Data> template struct StringHashTableSubMapSelector<1, false, Data> { - using Hash = StringHashTableHash::StringKey8Hasher; + using Hash = StringHashTableHash; static typename Data::T1 & getSubMap(size_t, Data & data) { return data.m1; } }; @@ -580,7 +620,7 @@ struct StringHashTableSubMapSelector<1, false, Data> template struct StringHashTableSubMapSelector<2, false, Data> { - using Hash = StringHashTableHash::StringKey16Hasher; + using Hash = StringHashTableHash; static typename Data::T2 & getSubMap(size_t, Data & data) { return data.m2; } }; @@ -588,7 +628,7 @@ struct StringHashTableSubMapSelector<2, false, Data> template struct StringHashTableSubMapSelector<3, false, Data> { - using Hash = StringHashTableHash::StringKey24Hasher; + using Hash = StringHashTableHash; static typename Data::T3 & getSubMap(size_t, Data & data) { return data.m3; } }; @@ -596,7 +636,7 @@ struct StringHashTableSubMapSelector<3, false, Data> template struct StringHashTableSubMapSelector<4, false, Data> { - using Hash = 
StringHashTableHash::StringRefHasher; + using Hash = StringHashTableHash; static typename Data::Ts & getSubMap(size_t, Data & data) { return data.ms; } }; diff --git a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h index ac2ab483e46..403b8d3941c 100644 --- a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h @@ -296,7 +296,7 @@ struct StringHashTableSubMapSelector<0, true, Data> template struct StringHashTableSubMapSelector<1, true, Data> { - using Hash = StringHashTableHash::StringKey8Hasher; + using Hash = StringHashTableHash; static typename Data::Impl::T1 & getSubMap(size_t hashval, Data & data) { @@ -308,7 +308,7 @@ struct StringHashTableSubMapSelector<1, true, Data> template struct StringHashTableSubMapSelector<2, true, Data> { - using Hash = StringHashTableHash::StringKey16Hasher; + using Hash = StringHashTableHash; static typename Data::Impl::T2 & getSubMap(size_t hashval, Data & data) { @@ -320,7 +320,7 @@ struct StringHashTableSubMapSelector<2, true, Data> template struct StringHashTableSubMapSelector<3, true, Data> { - using Hash = StringHashTableHash::StringKey24Hasher; + using Hash = StringHashTableHash; static typename Data::Impl::T3 & getSubMap(size_t hashval, Data & data) { @@ -332,7 +332,7 @@ struct StringHashTableSubMapSelector<3, true, Data> template struct StringHashTableSubMapSelector<4, true, Data> { - using Hash = StringHashTableHash::StringRefHasher; + using Hash = StringHashTableHash; static typename Data::Impl::Ts & getSubMap(size_t hashval, Data & data) { diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index eb6dfed68f9..81252b8b3c6 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -77,27 +77,27 @@ using AggregatedDataWithoutKey = AggregateDataPtr; using AggregatedDataWithUInt8Key = FixedImplicitZeroHashMapWithCalculatedSize; using 
AggregatedDataWithUInt16Key = FixedImplicitZeroHashMap; -using AggregatedDataWithUInt32Key = HashMap>; -using AggregatedDataWithUInt64Key = HashMap>; +using AggregatedDataWithUInt32Key = HashMap>; +using AggregatedDataWithUInt64Key = HashMap>; using AggregatedDataWithShortStringKey = StringHashMap; using AggregatedDataWithStringKey = HashMapWithSavedHash; -using AggregatedDataWithInt256Key = HashMap>; +using AggregatedDataWithInt256Key = HashMap>; -using AggregatedDataWithKeys128 = HashMap>; -using AggregatedDataWithKeys256 = HashMap>; +using AggregatedDataWithKeys128 = HashMap>; +using AggregatedDataWithKeys256 = HashMap>; -using AggregatedDataWithUInt32KeyTwoLevel = TwoLevelHashMap>; -using AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithUInt32KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap>; -using AggregatedDataWithInt256KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithInt256KeyTwoLevel = TwoLevelHashMap>; using AggregatedDataWithShortStringKeyTwoLevel = TwoLevelStringHashMap; using AggregatedDataWithStringKeyTwoLevel = TwoLevelHashMapWithSavedHash; -using AggregatedDataWithKeys128TwoLevel = TwoLevelHashMap>; -using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap>; +using AggregatedDataWithKeys128TwoLevel = TwoLevelHashMap>; +using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap>; /** Variants with better hash function, using more than 32 bits for hash. 
* Using for merging phase of external aggregation, where number of keys may be far greater than 4 billion, From 40ceb089d4535895456461465279cdd6a09c975f Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Tue, 3 Dec 2024 21:23:36 +0800 Subject: [PATCH 21/24] one level old hash; two level new hash Signed-off-by: guo-shaoge --- dbms/src/Common/HashTable/HashTable.h | 3 +- dbms/src/Common/HashTable/TwoLevelHashTable.h | 9 +++--- .../HashTable/TwoLevelStringHashTable.h | 30 +++++++++++++------ dbms/src/Interpreters/Aggregator.h | 30 +++++++++---------- dbms/src/Interpreters/Settings.h | 2 +- 5 files changed, 44 insertions(+), 30 deletions(-) diff --git a/dbms/src/Common/HashTable/HashTable.h b/dbms/src/Common/HashTable/HashTable.h index c0f066edbb0..5496f263dde 100644 --- a/dbms/src/Common/HashTable/HashTable.h +++ b/dbms/src/Common/HashTable/HashTable.h @@ -1020,7 +1020,8 @@ class HashTable } /// Copy the cell from another hash table. It is assumed that the cell is not zero, and also that there was no such key in the table yet. - void ALWAYS_INLINE insertUniqueNonZero(const Cell * cell, size_t hash_value) + template + void ALWAYS_INLINE insertUniqueNonZero(const InsertCellType * cell, size_t hash_value) { size_t place_value = findEmptyCell(grower.place(hash_value)); diff --git a/dbms/src/Common/HashTable/TwoLevelHashTable.h b/dbms/src/Common/HashTable/TwoLevelHashTable.h index 75a5402363d..8eb22f851eb 100644 --- a/dbms/src/Common/HashTable/TwoLevelHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelHashTable.h @@ -115,9 +115,9 @@ class TwoLevelHashTable : private boost::noncopyable /// Copy the data from another (normal) hash table. It should have the same hash function. template - explicit TwoLevelHashTable(const Source & src) + explicit TwoLevelHashTable(Source & src) { - typename Source::const_iterator it = src.begin(); + typename Source::iterator it = src.begin(); /// It is assumed that the zero key (stored separately) is first in iteration order. 
if (it != src.end() && it.getPtr()->isZero(src)) @@ -128,8 +128,9 @@ class TwoLevelHashTable : private boost::noncopyable for (; it != src.end(); ++it) { - const Cell * cell = it.getPtr(); - size_t hash_value = cell->getHash(src); + auto * cell = it.getPtr(); + // size_t hash_value = cell->getHash(src); + size_t hash_value = Hash::operator()(cell->getKey()); size_t buck = getBucketFromHash(hash_value); impls[buck].insertUniqueNonZero(cell, hash_value); } diff --git a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h index 403b8d3941c..526de846fef 100644 --- a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h @@ -65,32 +65,40 @@ class TwoLevelStringHashTable : private boost::noncopyable TwoLevelStringHashTable() = default; template - explicit TwoLevelStringHashTable(const Source & src) + explicit TwoLevelStringHashTable(Source & src) { if (src.m0.hasZero()) impls[0].m0.setHasZero(*src.m0.zeroValue()); for (auto & v : src.m1) { - size_t hash_value = v.getHash(src.m1); + // size_t hash_value = v.getHash(src.m1); + const size_t hash_value = ImplTable::T1::Hash::operator()(v.getKey()); + v.setHash(hash_value); size_t buck = getBucketFromHash(hash_value); impls[buck].m1.insertUniqueNonZero(&v, hash_value); } for (auto & v : src.m2) { - size_t hash_value = v.getHash(src.m2); + // size_t hash_value = v.getHash(src.m2); + const size_t hash_value = ImplTable::T2::Hash::operator()(v.getKey()); + v.setHash(hash_value); size_t buck = getBucketFromHash(hash_value); impls[buck].m2.insertUniqueNonZero(&v, hash_value); } for (auto & v : src.m3) { - size_t hash_value = v.getHash(src.m3); + // size_t hash_value = v.getHash(src.m3); + const size_t hash_value = ImplTable::T3::Hash::operator()(v.getKey()); + v.setHash(hash_value); size_t buck = getBucketFromHash(hash_value); impls[buck].m3.insertUniqueNonZero(&v, hash_value); } for (auto & v : src.ms) { - size_t hash_value 
= v.getHash(src.ms); + // size_t hash_value = v.getHash(src.ms); + const size_t hash_value = ImplTable::Ts::Hash::operator()(v.getKey()); + v.setHash(hash_value); size_t buck = getBucketFromHash(hash_value); impls[buck].ms.insertUniqueNonZero(&v, hash_value); } @@ -296,7 +304,8 @@ struct StringHashTableSubMapSelector<0, true, Data> template struct StringHashTableSubMapSelector<1, true, Data> { - using Hash = StringHashTableHash; + // using Hash = StringHashTableHash; + using Hash = HashWithMixSeed; static typename Data::Impl::T1 & getSubMap(size_t hashval, Data & data) { @@ -308,7 +317,8 @@ struct StringHashTableSubMapSelector<1, true, Data> template struct StringHashTableSubMapSelector<2, true, Data> { - using Hash = StringHashTableHash; + // using Hash = StringHashTableHash; + using Hash = HashWithMixSeed; static typename Data::Impl::T2 & getSubMap(size_t hashval, Data & data) { @@ -320,7 +330,8 @@ struct StringHashTableSubMapSelector<2, true, Data> template struct StringHashTableSubMapSelector<3, true, Data> { - using Hash = StringHashTableHash; + // using Hash = StringHashTableHash; + using Hash = HashWithMixSeed; static typename Data::Impl::T3 & getSubMap(size_t hashval, Data & data) { @@ -332,7 +343,8 @@ struct StringHashTableSubMapSelector<3, true, Data> template struct StringHashTableSubMapSelector<4, true, Data> { - using Hash = StringHashTableHash; + // using Hash = StringHashTableHash; + using Hash = StringRefHash; static typename Data::Impl::Ts & getSubMap(size_t hashval, Data & data) { diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index 81252b8b3c6..d900cc3e231 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -88,16 +88,16 @@ using AggregatedDataWithInt256Key = HashMap>; using AggregatedDataWithKeys256 = HashMap>; -using AggregatedDataWithUInt32KeyTwoLevel = TwoLevelHashMap>; -using AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap>; +using 
AggregatedDataWithUInt32KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap>; -using AggregatedDataWithInt256KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithInt256KeyTwoLevel = TwoLevelHashMap>; using AggregatedDataWithShortStringKeyTwoLevel = TwoLevelStringHashMap; using AggregatedDataWithStringKeyTwoLevel = TwoLevelHashMapWithSavedHash; -using AggregatedDataWithKeys128TwoLevel = TwoLevelHashMap>; -using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap>; +using AggregatedDataWithKeys128TwoLevel = TwoLevelHashMap>; +using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap>; /** Variants with better hash function, using more than 32 bits for hash. * Using for merging phase of external aggregation, where number of keys may be far greater than 4 billion, @@ -125,7 +125,7 @@ struct AggregationMethodOneNumber AggregationMethodOneNumber() = default; template - explicit AggregationMethodOneNumber(const Other & other) + explicit AggregationMethodOneNumber(Other & other) : data(other.data) {} @@ -179,7 +179,7 @@ struct AggregationMethodString AggregationMethodString() = default; template - explicit AggregationMethodString(const Other & other) + explicit AggregationMethodString(Other & other) : data(other.data) {} @@ -227,7 +227,7 @@ struct AggregationMethodStringNoCache AggregationMethodStringNoCache() = default; template - explicit AggregationMethodStringNoCache(const Other & other) + explicit AggregationMethodStringNoCache(Other & other) : data(other.data) {} @@ -275,7 +275,7 @@ struct AggregationMethodOneKeyStringNoCache AggregationMethodOneKeyStringNoCache() = default; template - explicit AggregationMethodOneKeyStringNoCache(const Other & other) + explicit AggregationMethodOneKeyStringNoCache(Other & other) : data(other.data) {} @@ -325,7 +325,7 @@ struct AggregationMethodMultiStringNoCache AggregationMethodMultiStringNoCache() = default; template - explicit AggregationMethodMultiStringNoCache(const Other & other) + 
explicit AggregationMethodMultiStringNoCache(Other & other) : data(other.data) {} @@ -355,7 +355,7 @@ struct AggregationMethodFastPathTwoKeysNoCache AggregationMethodFastPathTwoKeysNoCache() = default; template - explicit AggregationMethodFastPathTwoKeysNoCache(const Other & other) + explicit AggregationMethodFastPathTwoKeysNoCache(Other & other) : data(other.data) {} @@ -475,7 +475,7 @@ struct AggregationMethodFixedString AggregationMethodFixedString() = default; template - explicit AggregationMethodFixedString(const Other & other) + explicit AggregationMethodFixedString(Other & other) : data(other.data) {} @@ -523,7 +523,7 @@ struct AggregationMethodFixedStringNoCache AggregationMethodFixedStringNoCache() = default; template - explicit AggregationMethodFixedStringNoCache(const Other & other) + explicit AggregationMethodFixedStringNoCache(Other & other) : data(other.data) {} @@ -572,7 +572,7 @@ struct AggregationMethodKeysFixed AggregationMethodKeysFixed() = default; template - explicit AggregationMethodKeysFixed(const Other & other) + explicit AggregationMethodKeysFixed(Other & other) : data(other.data) {} @@ -679,7 +679,7 @@ struct AggregationMethodSerialized AggregationMethodSerialized() = default; template - explicit AggregationMethodSerialized(const Other & other) + explicit AggregationMethodSerialized(Other & other) : data(other.data) {} diff --git a/dbms/src/Interpreters/Settings.h b/dbms/src/Interpreters/Settings.h index 4c2e5dbeca4..5f46d74fd30 100644 --- a/dbms/src/Interpreters/Settings.h +++ b/dbms/src/Interpreters/Settings.h @@ -84,7 +84,7 @@ struct Settings M(SettingLoadBalancing, load_balancing, LoadBalancing::RANDOM, "Which replicas (among healthy replicas) to preferably send a query to (on the first attempt) for distributed processing.") \ \ M(SettingUInt64, group_by_two_level_threshold, 100000, "From what number of keys, a two-level aggregation starts. 
0 - the threshold is not set.") \ - M(SettingUInt64, group_by_two_level_threshold_bytes, 100000000, "From what size of the aggregation state in bytes, a two-level aggregation begins to be used. 0 - the threshold is not set. " \ + M(SettingUInt64, group_by_two_level_threshold_bytes, 32000000, "From what size of the aggregation state in bytes, a two-level aggregation begins to be used. 0 - the threshold is not set. " \ "Two-level aggregation is used when at least one of the thresholds is triggered.") \ M(SettingUInt64, aggregation_memory_efficient_merge_threads, 0, "Number of threads to use for merge intermediate aggregation results in memory efficient mode. When bigger, then more memory is " \ "consumed. 0 means - same as 'max_threads'.") \ From 4cb24c21fcba3cb700cb6cbdd5d978e7fe214ad3 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Wed, 4 Dec 2024 10:40:11 +0800 Subject: [PATCH 22/24] Revert "one level old hash; two level new hash" This reverts commit 40ceb089d4535895456461465279cdd6a09c975f. --- dbms/src/Common/HashTable/HashTable.h | 3 +- dbms/src/Common/HashTable/TwoLevelHashTable.h | 9 +++--- .../HashTable/TwoLevelStringHashTable.h | 30 ++++++------------- dbms/src/Interpreters/Aggregator.h | 30 +++++++++---------- dbms/src/Interpreters/Settings.h | 2 +- 5 files changed, 30 insertions(+), 44 deletions(-) diff --git a/dbms/src/Common/HashTable/HashTable.h b/dbms/src/Common/HashTable/HashTable.h index 5496f263dde..c0f066edbb0 100644 --- a/dbms/src/Common/HashTable/HashTable.h +++ b/dbms/src/Common/HashTable/HashTable.h @@ -1020,8 +1020,7 @@ class HashTable } /// Copy the cell from another hash table. It is assumed that the cell is not zero, and also that there was no such key in the table yet. 
- template - void ALWAYS_INLINE insertUniqueNonZero(const InsertCellType * cell, size_t hash_value) + void ALWAYS_INLINE insertUniqueNonZero(const Cell * cell, size_t hash_value) { size_t place_value = findEmptyCell(grower.place(hash_value)); diff --git a/dbms/src/Common/HashTable/TwoLevelHashTable.h b/dbms/src/Common/HashTable/TwoLevelHashTable.h index 8eb22f851eb..75a5402363d 100644 --- a/dbms/src/Common/HashTable/TwoLevelHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelHashTable.h @@ -115,9 +115,9 @@ class TwoLevelHashTable : private boost::noncopyable /// Copy the data from another (normal) hash table. It should have the same hash function. template - explicit TwoLevelHashTable(Source & src) + explicit TwoLevelHashTable(const Source & src) { - typename Source::iterator it = src.begin(); + typename Source::const_iterator it = src.begin(); /// It is assumed that the zero key (stored separately) is first in iteration order. if (it != src.end() && it.getPtr()->isZero(src)) @@ -128,9 +128,8 @@ class TwoLevelHashTable : private boost::noncopyable for (; it != src.end(); ++it) { - auto * cell = it.getPtr(); - // size_t hash_value = cell->getHash(src); - size_t hash_value = Hash::operator()(cell->getKey()); + const Cell * cell = it.getPtr(); + size_t hash_value = cell->getHash(src); size_t buck = getBucketFromHash(hash_value); impls[buck].insertUniqueNonZero(cell, hash_value); } diff --git a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h index 526de846fef..403b8d3941c 100644 --- a/dbms/src/Common/HashTable/TwoLevelStringHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelStringHashTable.h @@ -65,40 +65,32 @@ class TwoLevelStringHashTable : private boost::noncopyable TwoLevelStringHashTable() = default; template - explicit TwoLevelStringHashTable(Source & src) + explicit TwoLevelStringHashTable(const Source & src) { if (src.m0.hasZero()) impls[0].m0.setHasZero(*src.m0.zeroValue()); for (auto & v : src.m1) { - 
// size_t hash_value = v.getHash(src.m1); - const size_t hash_value = ImplTable::T1::Hash::operator()(v.getKey()); - v.setHash(hash_value); + size_t hash_value = v.getHash(src.m1); size_t buck = getBucketFromHash(hash_value); impls[buck].m1.insertUniqueNonZero(&v, hash_value); } for (auto & v : src.m2) { - // size_t hash_value = v.getHash(src.m2); - const size_t hash_value = ImplTable::T2::Hash::operator()(v.getKey()); - v.setHash(hash_value); + size_t hash_value = v.getHash(src.m2); size_t buck = getBucketFromHash(hash_value); impls[buck].m2.insertUniqueNonZero(&v, hash_value); } for (auto & v : src.m3) { - // size_t hash_value = v.getHash(src.m3); - const size_t hash_value = ImplTable::T3::Hash::operator()(v.getKey()); - v.setHash(hash_value); + size_t hash_value = v.getHash(src.m3); size_t buck = getBucketFromHash(hash_value); impls[buck].m3.insertUniqueNonZero(&v, hash_value); } for (auto & v : src.ms) { - // size_t hash_value = v.getHash(src.ms); - const size_t hash_value = ImplTable::Ts::Hash::operator()(v.getKey()); - v.setHash(hash_value); + size_t hash_value = v.getHash(src.ms); size_t buck = getBucketFromHash(hash_value); impls[buck].ms.insertUniqueNonZero(&v, hash_value); } @@ -304,8 +296,7 @@ struct StringHashTableSubMapSelector<0, true, Data> template struct StringHashTableSubMapSelector<1, true, Data> { - // using Hash = StringHashTableHash; - using Hash = HashWithMixSeed; + using Hash = StringHashTableHash; static typename Data::Impl::T1 & getSubMap(size_t hashval, Data & data) { @@ -317,8 +308,7 @@ struct StringHashTableSubMapSelector<1, true, Data> template struct StringHashTableSubMapSelector<2, true, Data> { - // using Hash = StringHashTableHash; - using Hash = HashWithMixSeed; + using Hash = StringHashTableHash; static typename Data::Impl::T2 & getSubMap(size_t hashval, Data & data) { @@ -330,8 +320,7 @@ struct StringHashTableSubMapSelector<2, true, Data> template struct StringHashTableSubMapSelector<3, true, Data> { - // using Hash = 
StringHashTableHash; - using Hash = HashWithMixSeed; + using Hash = StringHashTableHash; static typename Data::Impl::T3 & getSubMap(size_t hashval, Data & data) { @@ -343,8 +332,7 @@ struct StringHashTableSubMapSelector<3, true, Data> template struct StringHashTableSubMapSelector<4, true, Data> { - // using Hash = StringHashTableHash; - using Hash = StringRefHash; + using Hash = StringHashTableHash; static typename Data::Impl::Ts & getSubMap(size_t hashval, Data & data) { diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index d900cc3e231..81252b8b3c6 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -88,16 +88,16 @@ using AggregatedDataWithInt256Key = HashMap>; using AggregatedDataWithKeys256 = HashMap>; -using AggregatedDataWithUInt32KeyTwoLevel = TwoLevelHashMap>; -using AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithUInt32KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap>; -using AggregatedDataWithInt256KeyTwoLevel = TwoLevelHashMap>; +using AggregatedDataWithInt256KeyTwoLevel = TwoLevelHashMap>; using AggregatedDataWithShortStringKeyTwoLevel = TwoLevelStringHashMap; using AggregatedDataWithStringKeyTwoLevel = TwoLevelHashMapWithSavedHash; -using AggregatedDataWithKeys128TwoLevel = TwoLevelHashMap>; -using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap>; +using AggregatedDataWithKeys128TwoLevel = TwoLevelHashMap>; +using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap>; /** Variants with better hash function, using more than 32 bits for hash. 
* Using for merging phase of external aggregation, where number of keys may be far greater than 4 billion, @@ -125,7 +125,7 @@ struct AggregationMethodOneNumber AggregationMethodOneNumber() = default; template - explicit AggregationMethodOneNumber(Other & other) + explicit AggregationMethodOneNumber(const Other & other) : data(other.data) {} @@ -179,7 +179,7 @@ struct AggregationMethodString AggregationMethodString() = default; template - explicit AggregationMethodString(Other & other) + explicit AggregationMethodString(const Other & other) : data(other.data) {} @@ -227,7 +227,7 @@ struct AggregationMethodStringNoCache AggregationMethodStringNoCache() = default; template - explicit AggregationMethodStringNoCache(Other & other) + explicit AggregationMethodStringNoCache(const Other & other) : data(other.data) {} @@ -275,7 +275,7 @@ struct AggregationMethodOneKeyStringNoCache AggregationMethodOneKeyStringNoCache() = default; template - explicit AggregationMethodOneKeyStringNoCache(Other & other) + explicit AggregationMethodOneKeyStringNoCache(const Other & other) : data(other.data) {} @@ -325,7 +325,7 @@ struct AggregationMethodMultiStringNoCache AggregationMethodMultiStringNoCache() = default; template - explicit AggregationMethodMultiStringNoCache(Other & other) + explicit AggregationMethodMultiStringNoCache(const Other & other) : data(other.data) {} @@ -355,7 +355,7 @@ struct AggregationMethodFastPathTwoKeysNoCache AggregationMethodFastPathTwoKeysNoCache() = default; template - explicit AggregationMethodFastPathTwoKeysNoCache(Other & other) + explicit AggregationMethodFastPathTwoKeysNoCache(const Other & other) : data(other.data) {} @@ -475,7 +475,7 @@ struct AggregationMethodFixedString AggregationMethodFixedString() = default; template - explicit AggregationMethodFixedString(Other & other) + explicit AggregationMethodFixedString(const Other & other) : data(other.data) {} @@ -523,7 +523,7 @@ struct AggregationMethodFixedStringNoCache 
AggregationMethodFixedStringNoCache() = default; template - explicit AggregationMethodFixedStringNoCache(Other & other) + explicit AggregationMethodFixedStringNoCache(const Other & other) : data(other.data) {} @@ -572,7 +572,7 @@ struct AggregationMethodKeysFixed AggregationMethodKeysFixed() = default; template - explicit AggregationMethodKeysFixed(Other & other) + explicit AggregationMethodKeysFixed(const Other & other) : data(other.data) {} @@ -679,7 +679,7 @@ struct AggregationMethodSerialized AggregationMethodSerialized() = default; template - explicit AggregationMethodSerialized(Other & other) + explicit AggregationMethodSerialized(const Other & other) : data(other.data) {} diff --git a/dbms/src/Interpreters/Settings.h b/dbms/src/Interpreters/Settings.h index 5f46d74fd30..4c2e5dbeca4 100644 --- a/dbms/src/Interpreters/Settings.h +++ b/dbms/src/Interpreters/Settings.h @@ -84,7 +84,7 @@ struct Settings M(SettingLoadBalancing, load_balancing, LoadBalancing::RANDOM, "Which replicas (among healthy replicas) to preferably send a query to (on the first attempt) for distributed processing.") \ \ M(SettingUInt64, group_by_two_level_threshold, 100000, "From what number of keys, a two-level aggregation starts. 0 - the threshold is not set.") \ - M(SettingUInt64, group_by_two_level_threshold_bytes, 32000000, "From what size of the aggregation state in bytes, a two-level aggregation begins to be used. 0 - the threshold is not set. " \ + M(SettingUInt64, group_by_two_level_threshold_bytes, 100000000, "From what size of the aggregation state in bytes, a two-level aggregation begins to be used. 0 - the threshold is not set. " \ "Two-level aggregation is used when at least one of the thresholds is triggered.") \ M(SettingUInt64, aggregation_memory_efficient_merge_threads, 0, "Number of threads to use for merge intermediate aggregation results in memory efficient mode. When bigger, then more memory is " \ "consumed. 
0 means - same as 'max_threads'.") \ From c02cf71e90ea12a93dceaea8396e6f1daad49a37 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Wed, 4 Dec 2024 10:58:11 +0800 Subject: [PATCH 23/24] revert new hasher; refine original code path Signed-off-by: guo-shaoge --- dbms/src/Common/ColumnsHashingImpl.h | 3 + dbms/src/Common/HashTable/Hash.h | 125 ----------- dbms/src/Common/HashTable/StringHashTable.h | 31 +-- dbms/src/Interpreters/Aggregator.cpp | 227 ++++++++++---------- 4 files changed, 121 insertions(+), 265 deletions(-) diff --git a/dbms/src/Common/ColumnsHashingImpl.h b/dbms/src/Common/ColumnsHashingImpl.h index 3c4fd601487..fcbfc4bc358 100644 --- a/dbms/src/Common/ColumnsHashingImpl.h +++ b/dbms/src/Common/ColumnsHashingImpl.h @@ -138,6 +138,7 @@ class HashMethodBase map.prefetch(hashvals[prefetch_idx]); } + // Emplace key without hashval, and this method doesn't support prefetch. template ALWAYS_INLINE inline EmplaceResult emplaceKey( Data & data, @@ -160,6 +161,7 @@ class HashMethodBase return findKeyImpl(keyHolderGetKey(key_holder), data); } + // Emplace key using hashval, you can enable prefetch or not. template ALWAYS_INLINE inline EmplaceResult emplaceKey( Data & data, @@ -318,6 +320,7 @@ class HashMethodBase else \ return EmplaceResult(inserted); + // This method is performance critical, so there are two emplaceImpl to make sure caller can use the one they need. 
template ALWAYS_INLINE inline EmplaceResult emplaceImpl(KeyHolder & key_holder, Data & data, size_t hashval) { diff --git a/dbms/src/Common/HashTable/Hash.h b/dbms/src/Common/HashTable/Hash.h index 207919a347e..457b4b9f3c0 100644 --- a/dbms/src/Common/HashTable/Hash.h +++ b/dbms/src/Common/HashTable/Hash.h @@ -422,128 +422,3 @@ struct IntHash32, void>> } } }; - -inline uint64_t umul128(uint64_t v, uint64_t kmul, uint64_t * high) -{ - DB::Int128 res = static_cast(v) * static_cast(kmul); - *high = static_cast(res >> 64); - return static_cast(res); -} - -template -inline void hash_combine(uint64_t & seed, const T & val) -{ - // from: https://github.com/HowardHinnant/hash_append/issues/7#issuecomment-629414712 - seed ^= std::hash{}(val) + 0x9e3779b97f4a7c15LLU + (seed << 12) + (seed >> 4); -} - -inline uint64_t hash_int128(uint64_t seed, const DB::Int128 & v) -{ - auto low = static_cast(v); - auto high = static_cast(v >> 64); - hash_combine(seed, low); - hash_combine(seed, high); - return seed; -} - -inline uint64_t hash_uint128(uint64_t seed, const DB::UInt128 & v) -{ - hash_combine(seed, v.low); - hash_combine(seed, v.high); - return seed; -} - -inline uint64_t hash_int256(uint64_t seed, const DB::Int256 & v) -{ - const auto & backend_value = v.backend(); - for (size_t i = 0; i < backend_value.size(); ++i) - { - hash_combine(seed, backend_value.limbs()[i]); - } - return seed; -} - -inline uint64_t hash_uint256(uint64_t seed, const DB::UInt256 & v) -{ - hash_combine(seed, v.a); - hash_combine(seed, v.b); - hash_combine(seed, v.c); - hash_combine(seed, v.d); - return seed; -} - -template -struct HashWithMixSeedHelper -{ - static inline size_t operator()(size_t); -}; - -template <> -struct HashWithMixSeedHelper<4> -{ - static inline size_t operator()(size_t v) - { - // from: https://github.com/aappleby/smhasher/blob/0ff96f7835817a27d0487325b6c16033e2992eb5/src/MurmurHash3.cpp#L102 - static constexpr uint64_t kmul = 0xcc9e2d51UL; - uint64_t mul = v * kmul; - return 
static_cast(mul ^ (mul >> 32u)); - } -}; - -template <> -struct HashWithMixSeedHelper<8> -{ - static inline size_t operator()(size_t v) - { - // from: https://github.com/martinus/robin-hood-hashing/blob/b21730713f4b5296bec411917c46919f7b38b178/src/include/robin_hood.h#L735 - static constexpr uint64_t kmul = 0xde5fb9d2630458e9ULL; - uint64_t high = 0; - uint64_t low = umul128(v, kmul, &high); - return static_cast(high + low); - } -}; - -template -struct HashWithMixSeed -{ - static size_t operator()(const T & v) - { - return HashWithMixSeedHelper::operator()(std::hash()(v)); - } -}; - -template <> -struct HashWithMixSeed -{ - static size_t operator()(const DB::Int128 & v) - { - return HashWithMixSeedHelper::operator()(hash_int128(0, v)); - } -}; - -template <> -struct HashWithMixSeed -{ - static inline size_t operator()(const DB::UInt128 & v) - { - return HashWithMixSeedHelper::operator()(hash_uint128(0, v)); - } -}; - -template <> -struct HashWithMixSeed -{ - static inline size_t operator()(const DB::Int256 & v) - { - return HashWithMixSeedHelper::operator()(hash_int256(0, v)); - } -}; - -template <> -struct HashWithMixSeed -{ - static inline size_t operator()(const DB::UInt256 & v) - { - return HashWithMixSeedHelper::operator()(hash_uint256(0, v)); - } -}; diff --git a/dbms/src/Common/HashTable/StringHashTable.h b/dbms/src/Common/HashTable/StringHashTable.h index 322523388cc..9bbdabb91fa 100644 --- a/dbms/src/Common/HashTable/StringHashTable.h +++ b/dbms/src/Common/HashTable/StringHashTable.h @@ -50,35 +50,6 @@ inline StringRef ALWAYS_INLINE toStringRef(const StringKey24 & n) return {reinterpret_cast(&n), 24ul - (__builtin_clzll(n.c) >> 3)}; } -inline size_t hash_string_key_24(uint64_t seed, const StringKey24 & v) -{ - hash_combine(seed, v.a); - hash_combine(seed, v.b); - hash_combine(seed, v.c); - return seed; -} - -template <> -struct HashWithMixSeed -{ - static inline size_t operator()(const StringKey24 & v) - { - return 
HashWithMixSeedHelper::operator()(hash_string_key_24(0, v)); - } -}; - -// struct StringHashTableHash -// { -// using StringKey8Hasher = HashWithMixSeed; -// using StringKey16Hasher = HashWithMixSeed; -// using StringKey24Hasher = HashWithMixSeed; -// using StringRefHasher = StringRefHash; -// -// static size_t ALWAYS_INLINE operator()(StringKey8 key) { return StringKey8Hasher::operator()(key); } -// static size_t ALWAYS_INLINE operator()(const StringKey16 & key) { return StringKey16Hasher::operator()(key); } -// static size_t ALWAYS_INLINE operator()(const StringKey24 & key) { return StringKey24Hasher::operator()(key); } -// static size_t ALWAYS_INLINE operator()(const StringRef & key) { return StringRefHasher::operator()(key); } -// }; struct StringHashTableHash { #if defined(__SSE4_2__) @@ -117,7 +88,7 @@ struct StringHashTableHash return CityHash_v1_0_2::CityHash64(reinterpret_cast(&key), 24); } #endif - static size_t ALWAYS_INLINE operator()(StringRef key){ return StringRefHash()(key); } + static size_t ALWAYS_INLINE operator()(StringRef key) { return StringRefHash()(key); } }; template diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 8e12e7383ab..b714bacce04 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -674,13 +674,6 @@ void NO_INLINE Aggregator::executeImpl( #endif if (disable_prefetch) { - // if constexpr (Method::Data::is_string_hash_map) - // executeImplBatchStringHashMap( - // method, - // state, - // aggregates_pool, - // agg_process_info); - // else executeImplBatch(method, state, aggregates_pool, agg_process_info); } else @@ -725,14 +718,14 @@ std::optional::Res try { if constexpr (only_lookup) - return state.template findKey( + return state.template findKey( method.data, index, aggregates_pool, sort_key_containers, hashvals); else - return state.template emplaceKey( + return state.template emplaceKey( method.data, index, aggregates_pool, @@ -878,41 
+871,64 @@ ALWAYS_INLINE void Aggregator::executeImplBatch( { /// For all rows. AggregateDataPtr place = aggregates_pool->alloc(0); - std::vector hashvals; +#define HANDLE_AGG_EMPLACE_RESULT \ + if likely (emplace_result_hold.has_value()) \ + { \ + if constexpr (collect_hit_rate) \ + { \ + ++agg_process_info.hit_row_cnt; \ + } \ + \ + if constexpr (only_lookup) \ + { \ + if (!emplace_result_hold.value().isFound()) \ + agg_process_info.not_found_rows.push_back(i); \ + } \ + else \ + { \ + emplace_result_hold.value().setMapped(place); \ + } \ + ++agg_process_info.start_row; \ + } \ + else \ + { \ + LOG_INFO(log, "HashTable resize throw ResizeException since the data is already marked for spill"); \ + break; \ + } for (size_t i = 0; i < rows; ++i) { - // TODO prefetch - auto emplace_result_hold = emplaceOrFindKey( - method, - state, - agg_process_info.start_row, - *aggregates_pool, - sort_key_containers); - if likely (emplace_result_hold.has_value()) + if constexpr (enable_prefetch) { - if constexpr (collect_hit_rate) - { - ++agg_process_info.hit_row_cnt; - } - - if constexpr (only_lookup) - { - if (!emplace_result_hold.value().isFound()) - agg_process_info.not_found_rows.push_back(i); - } - else - { - emplace_result_hold.value().setMapped(place); - } - ++agg_process_info.start_row; + auto hashvals = getHashVals( + agg_process_info.start_row, + agg_process_info.end_row, + method.data, + state, + sort_key_containers, + aggregates_pool); + + auto emplace_result_hold = emplaceOrFindKey( + method, + state, + agg_process_info.start_row, + *aggregates_pool, + sort_key_containers, + hashvals); + HANDLE_AGG_EMPLACE_RESULT } else { - LOG_INFO(log, "HashTable resize throw ResizeException since the data is already marked for spill"); - break; + auto emplace_result_hold = emplaceOrFindKey( + method, + state, + agg_process_info.start_row, + *aggregates_pool, + sort_key_containers); + HANDLE_AGG_EMPLACE_RESULT } } +#undef HANDLE_AGG_EMPLACE_RESULT return; } @@ -953,85 +969,76 @@ 
ALWAYS_INLINE void Aggregator::executeImplBatch( std::unique_ptr places(new AggregateDataPtr[rows]); std::optional processed_rows; -#define WRAP_EMPLACE_AGG_KEY_BEGIN \ - for (size_t i = agg_process_info.start_row; i < agg_process_info.start_row + rows; ++i) \ - { \ +#define HANDLE_AGG_EMPLACE_RESULT \ + if unlikely (!emplace_result_holder.has_value()) \ + { \ + LOG_INFO(log, "HashTable resize throw ResizeException since the data is already marked for spill"); \ + break; \ + } \ + \ + auto & emplace_result = emplace_result_holder.value(); \ + \ + if constexpr (only_lookup) \ + { \ + if (emplace_result.isFound()) \ + { \ + aggregate_data = emplace_result.getMapped(); \ + } \ + else \ + { \ + agg_process_info.not_found_rows.push_back(i); \ + } \ + } \ + else \ + { \ + if (emplace_result.isInserted()) \ + { \ + emplace_result.setMapped(nullptr); \ + \ + aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); \ + createAggregateStates(aggregate_data); \ + \ + emplace_result.setMapped(aggregate_data); \ + } \ + else \ + { \ + aggregate_data = emplace_result.getMapped(); \ + \ + if constexpr (collect_hit_rate) \ + ++agg_process_info.hit_row_cnt; \ + } \ + } \ + \ + places[i - agg_process_info.start_row] = aggregate_data; \ + processed_rows = i; + + for (size_t i = agg_process_info.start_row; i < agg_process_info.start_row + rows; ++i) + { AggregateDataPtr aggregate_data = nullptr; + if constexpr (enable_prefetch) + { + auto hashvals = getHashVals( + agg_process_info.start_row, + agg_process_info.end_row, + method.data, + state, + sort_key_containers, + aggregates_pool); -#define WRAP_EMPLACE_AGG_KEY_END \ - if unlikely (!emplace_result_holder.has_value()) \ - { \ - LOG_INFO(log, "HashTable resize throw ResizeException since the data is already marked for spill"); \ - break; \ - } \ - \ - auto & emplace_result = emplace_result_holder.value(); \ - \ - if constexpr (only_lookup) \ - { \ - if (emplace_result.isFound()) \ - { \ 
- aggregate_data = emplace_result.getMapped(); \ - } \ - else \ - { \ - agg_process_info.not_found_rows.push_back(i); \ - } \ - } \ - else \ - { \ - if (emplace_result.isInserted()) \ - { \ - emplace_result.setMapped(nullptr); \ - \ - aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); \ - createAggregateStates(aggregate_data); \ - \ - emplace_result.setMapped(aggregate_data); \ - } \ - else \ - { \ - aggregate_data = emplace_result.getMapped(); \ - \ - if constexpr (collect_hit_rate) \ - ++agg_process_info.hit_row_cnt; \ - } \ - } \ - \ - places[i - agg_process_info.start_row] = aggregate_data; \ - processed_rows = i; \ - } - - if constexpr (enable_prefetch) - { - std::vector hashvals; - hashvals = getHashVals( - agg_process_info.start_row, - agg_process_info.end_row, - method.data, - state, - sort_key_containers, - aggregates_pool); + auto emplace_result_holder + = emplaceOrFindKey(method, state, i, *aggregates_pool, sort_key_containers, hashvals); - WRAP_EMPLACE_AGG_KEY_BEGIN - auto emplace_result_holder = emplaceOrFindKey( - method, - state, - i, - *aggregates_pool, - sort_key_containers, - hashvals); - WRAP_EMPLACE_AGG_KEY_END - } - else - { - WRAP_EMPLACE_AGG_KEY_BEGIN - auto emplace_result_holder - = emplaceOrFindKey(method, state, i, *aggregates_pool, sort_key_containers); - WRAP_EMPLACE_AGG_KEY_END + HANDLE_AGG_EMPLACE_RESULT + } + else + { + auto emplace_result_holder + = emplaceOrFindKey(method, state, i, *aggregates_pool, sort_key_containers); + + HANDLE_AGG_EMPLACE_RESULT + } } -#undef WRAP_EMPLACE_AGG_KEY_BEGIN -#undef WRAP_EMPLACE_AGG_KEY_END +#undef HANDLE_AGG_EMPLACE_RESULT if (processed_rows) { From 352b710bdebcdc803b2baca3001cc2b32ca4a85f Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Thu, 5 Dec 2024 11:51:24 +0800 Subject: [PATCH 24/24] fix case Signed-off-by: guo-shaoge --- .../Flash/tests/gtest_spill_aggregation.cpp | 46 +++++----- dbms/src/Interpreters/Aggregator.cpp | 84 
++++++++++++------- dbms/src/Interpreters/Aggregator.h | 4 +- 3 files changed, 82 insertions(+), 52 deletions(-) diff --git a/dbms/src/Flash/tests/gtest_spill_aggregation.cpp b/dbms/src/Flash/tests/gtest_spill_aggregation.cpp index b19aaf03c4c..583e6e038fa 100644 --- a/dbms/src/Flash/tests/gtest_spill_aggregation.cpp +++ b/dbms/src/Flash/tests/gtest_spill_aggregation.cpp @@ -23,6 +23,7 @@ namespace FailPoints { extern const char force_agg_on_partial_block[]; extern const char force_thread_0_no_agg_spill[]; +extern const char force_agg_prefetch[]; } // namespace FailPoints namespace tests @@ -37,16 +38,22 @@ class SpillAggregationTestRunner : public DB::tests::ExecutorTest } }; -#define WRAP_FOR_AGG_PARTIAL_BLOCK_START \ - std::vector partial_blocks{true, false}; \ - for (auto partial_block : partial_blocks) \ - { \ - if (partial_block) \ - FailPointHelper::enableFailPoint(FailPoints::force_agg_on_partial_block); \ - else \ - FailPointHelper::disableFailPoint(FailPoints::force_agg_on_partial_block); +#define WRAP_FOR_AGG_FAILPOINTS_START \ + std::vector enables{true, false}; \ + for (auto enable : enables) \ + { \ + if (enable) \ + { \ + FailPointHelper::enableFailPoint(FailPoints::force_agg_on_partial_block); \ + FailPointHelper::enableFailPoint(FailPoints::force_agg_prefetch); \ + } \ + else \ + { \ + FailPointHelper::disableFailPoint(FailPoints::force_agg_on_partial_block); \ + FailPointHelper::disableFailPoint(FailPoints::force_agg_prefetch); \ + } -#define WRAP_FOR_AGG_PARTIAL_BLOCK_END } +#define WRAP_FOR_AGG_FAILPOINTS_END } #define WRAP_FOR_AGG_THREAD_0_NO_SPILL_START \ for (auto thread_0_no_spill : {true, false}) \ @@ -114,13 +121,13 @@ try context.context->setSetting("group_by_two_level_threshold_bytes", Field(static_cast(1))); /// don't use `executeAndAssertColumnsEqual` since it takes too long to run /// test single thread aggregation - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START WRAP_FOR_AGG_THREAD_0_NO_SPILL_START 
ASSERT_COLUMNS_EQ_UR(ref_columns, executeStreams(request, 1)); /// test parallel aggregation ASSERT_COLUMNS_EQ_UR(ref_columns, executeStreams(request, original_max_streams)); WRAP_FOR_AGG_THREAD_0_NO_SPILL_END - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END /// enable spill and use small max_cached_data_bytes_in_spiller context.context->setSetting("max_cached_data_bytes_in_spiller", Field(static_cast(total_data_size / 200))); /// test single thread aggregation @@ -262,7 +269,7 @@ try Field(static_cast(max_bytes_before_external_agg))); context.context->setSetting("max_block_size", Field(static_cast(max_block_size))); WRAP_FOR_SPILL_TEST_BEGIN - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START WRAP_FOR_AGG_THREAD_0_NO_SPILL_START auto blocks = getExecuteStreamsReturnBlocks(request, concurrency); for (auto & block : blocks) @@ -289,7 +296,7 @@ try false)); } WRAP_FOR_AGG_THREAD_0_NO_SPILL_END - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END WRAP_FOR_SPILL_TEST_END } } @@ -369,6 +376,7 @@ try { for (const auto & agg_func : agg_funcs) { + FailPointHelper::disableFailPoint(FailPoints::force_agg_prefetch); context.setCollation(collator_id); const auto * current_collator = TiDB::ITiDBCollator::getCollator(collator_id); ASSERT_TRUE(current_collator != nullptr); @@ -417,7 +425,7 @@ try Field(static_cast(max_bytes_before_external_agg))); context.context->setSetting("max_block_size", Field(static_cast(max_block_size))); WRAP_FOR_SPILL_TEST_BEGIN - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START WRAP_FOR_AGG_THREAD_0_NO_SPILL_START auto blocks = getExecuteStreamsReturnBlocks(request, concurrency); for (auto & block : blocks) @@ -444,7 +452,7 @@ try false)); } WRAP_FOR_AGG_THREAD_0_NO_SPILL_END - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END WRAP_FOR_SPILL_TEST_END } } @@ -518,9 +526,9 @@ try /// don't use `executeAndAssertColumnsEqual` since it takes too long to run auto request = 
gen_request(exchange_concurrency); WRAP_FOR_SPILL_TEST_BEGIN - WRAP_FOR_AGG_PARTIAL_BLOCK_START + WRAP_FOR_AGG_FAILPOINTS_START ASSERT_COLUMNS_EQ_UR(baseline, executeStreams(request, exchange_concurrency)); - WRAP_FOR_AGG_PARTIAL_BLOCK_END + WRAP_FOR_AGG_FAILPOINTS_END WRAP_FOR_SPILL_TEST_END } } @@ -528,8 +536,8 @@ CATCH #undef WRAP_FOR_SPILL_TEST_BEGIN #undef WRAP_FOR_SPILL_TEST_END -#undef WRAP_FOR_AGG_PARTIAL_BLOCK_START -#undef WRAP_FOR_AGG_PARTIAL_BLOCK_END +#undef WRAP_FOR_AGG_FAILPOINTS_START +#undef WRAP_FOR_AGG_FAILPOINTS_END } // namespace tests } // namespace DB diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index b714bacce04..537d13d1441 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -666,26 +666,38 @@ void NO_INLINE Aggregator::executeImpl( { typename Method::State state(agg_process_info.key_columns, key_sizes, collators); + // start_row!=0 and stringHashTableRecoveryInfo not empty and cannot be true at the same time. + RUNTIME_CHECK(!(agg_process_info.start_row != 0 && !agg_process_info.stringHashTableRecoveryInfoEmpty())); + #ifndef NDEBUG bool disable_prefetch = (method.data.getBufferSizeInCells() < 8192); fiu_do_on(FailPoints::force_agg_prefetch, { disable_prefetch = false; }); #else const bool disable_prefetch = (method.data.getBufferSizeInCells() < 8192); #endif - if (disable_prefetch) - { - executeImplBatch(method, state, aggregates_pool, agg_process_info); - } - else + + if constexpr (Method::Data::is_string_hash_map) { - if constexpr (Method::Data::is_string_hash_map) - executeImplBatchStringHashMap( + // When will handled by column-wise(executeImplStringHashMapByCol): + // 1. For StringHashMap, which is composed by 5 submaps, needs be handled by column-wise when prefetch is enabled. + // 2. If agg_process_info.start_row != 0, it means the computation process of the current block was interrupted by resize exception in executeImplByRow. 
+        //    For clarity and simplicity of implementation, the processing functions for column-wise and row-wise methods handle the entire block independently.
+        //    A block will not be processed first by the row-wise method and then by the column-wise method, or vice-versa.
+        if (!disable_prefetch && likely(agg_process_info.start_row == 0))
+            executeImplStringHashMapByCol(
                 method,
                 state,
                 aggregates_pool,
                 agg_process_info);
         else
-            executeImplBatch(method, state, aggregates_pool, agg_process_info);
+            executeImplByRow(method, state, aggregates_pool, agg_process_info);
+    }
+    else
+    {
+        if (disable_prefetch)
+            executeImplByRow(method, state, aggregates_pool, agg_process_info);
+        else
+            executeImplByRow(method, state, aggregates_pool, agg_process_info);
     }
 }
@@ -759,7 +771,7 @@ std::optional::Res
     }
 }
 
-// This is only used by executeImplBatchStringHashMap.
+// This is only used by executeImplStringHashMapByCol.
 // It will choose specifix submap of StringHashMap then do emplace/find.
 // StringKeyType can be StringRef/StringKey8/StringKey16/StringKey24/ArenaKeyHolder.
 template <
@@ -849,7 +861,7 @@ size_t Aggregator::emplaceOrFindStringKey(
 }
 
 template 
-ALWAYS_INLINE void Aggregator::executeImplBatch(
+ALWAYS_INLINE void Aggregator::executeImplByRow(
     Method & method,
     typename Method::State & state,
     Arena * aggregates_pool,
@@ -857,6 +869,11 @@ ALWAYS_INLINE void Aggregator::executeImplBatch(
 {
     // collect_hit_rate and only_lookup cannot be true at the same time.
     static_assert(!(collect_hit_rate && only_lookup));
+    // If agg_process_info.stringHashTableRecoveryInfoEmpty() is false, it means the current block was
+    // handled by executeImplStringHashMapByCol(column-wise) before, and resize exception happened.
+    // This situation is unexpected because for the sake of clarity, we assume that a block will be **fully** processed
+    // either column-wise or row-wise and cannot be split for processing.
+ RUNTIME_CHECK(agg_process_info.stringHashTableRecoveryInfoEmpty()); std::vector sort_key_containers; sort_key_containers.resize(params.keys_size, ""); @@ -1086,7 +1103,7 @@ M(4) // NOTE: this function is column-wise, which means sort key buffer cannot be reused. // This buffer will not be release until this block is processed done. template -ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( +ALWAYS_INLINE void Aggregator::executeImplStringHashMapByCol( Method & method, typename Method::State & state, Arena * aggregates_pool, @@ -1125,7 +1142,11 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( // If no resize exception happens, so this is a new Block. // If resize exception happens, start_row has already been set to zero at the end of this function. - RUNTIME_CHECK(agg_process_info.start_row == 0); + RUNTIME_CHECK_MSG( + agg_process_info.start_row == 0, + "unexpected agg_process_info.start_row: {}, end_row: {}", + agg_process_info.start_row, + agg_process_info.end_row); if likely (agg_process_info.stringHashTableRecoveryInfoEmpty()) { @@ -1233,10 +1254,9 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( M(4, key_str_infos, key_str_datas, key_str_places) #undef M - if (zero_agg_func_size) - return; - - std::vector places(rows, nullptr); + if (!zero_agg_func_size) + { + std::vector places(rows, nullptr); #define M(INFO, PLACES) \ for (size_t i = 0; i < (INFO).size(); ++i) \ { \ @@ -1244,24 +1264,26 @@ ALWAYS_INLINE void Aggregator::executeImplBatchStringHashMap( places[row] = (PLACES)[i]; \ } - M(key0_infos, key0_places) - M(key8_infos, key8_places) - M(key16_infos, key16_places) - M(key24_infos, key24_places) - M(key_str_infos, key_str_places) + M(key0_infos, key0_places) + M(key8_infos, key8_places) + M(key16_infos, key16_places) + M(key24_infos, key24_places) + M(key_str_infos, key_str_places) #undef M - for (AggregateFunctionInstruction * inst = agg_process_info.aggregate_functions_instructions.data(); inst->that; - 
++inst) - { - inst->batch_that->addBatch( - agg_process_info.start_row, - rows, - &places[0], - inst->state_offset, - inst->batch_arguments, - aggregates_pool); + for (AggregateFunctionInstruction * inst = agg_process_info.aggregate_functions_instructions.data(); inst->that; + ++inst) + { + inst->batch_that->addBatch( + agg_process_info.start_row, + rows, + &places[0], + inst->state_offset, + inst->batch_arguments, + aggregates_pool); + } } + if unlikely (got_resize_exception) { RUNTIME_CHECK(!agg_process_info.stringHashTableRecoveryInfoEmpty()); diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index 81252b8b3c6..d88a97278f9 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -1477,14 +1477,14 @@ class Aggregator TiDB::TiDBCollators & collators) const; template - void executeImplBatch( + void executeImplByRow( Method & method, typename Method::State & state, Arena * aggregates_pool, AggProcessInfo & agg_process_info) const; template - void executeImplBatchStringHashMap( + void executeImplStringHashMapByCol( Method & method, typename Method::State & state, Arena * aggregates_pool,