From 85f39732b4928a0d67aad3503d1ab83f9b92d985 Mon Sep 17 00:00:00 2001
From: "hengjiang.ly" <hengjiang.ly@alibaba-inc.com>
Date: Tue, 12 Mar 2024 19:50:15 -0700
Subject: [PATCH] Add prefix-sort with support for fixed width sorting keys
 (#8146)

Summary:
PrefixSort improves in-memory sort performance by using memcmp to compare binary strings (normalized, encoded sort keys called 'prefixes') when sorting. This PR adds 'extract rows to prefix' and 'sort' for fixed-width types (int32, int64, float, double, timestamp, etc.). More types will be added in a follow-up.

Add benchmark to compare std::sort vs. prefix sort using 1, 2, 3, and 4 bigint sorting keys.

The performance of sorting up to 1000 rows is the same. When sorting more than 1K rows, prefix-sort is faster, and the gains increase as the number of rows sorted and the number of sorting keys increase. The presence of the payload columns doesn't affect the performance.

```
============================================================================
[...]ec/benchmarks/PrefixSortBenchmark.cpp     relative  time/iter   iters/s
============================================================================
StdSort_no-payload_1_bigint_0.01k                          49.40ns    20.24M
PrefixSort                                      100.00%    49.40ns    20.24M
StdSort_no-payload_2_bigint_0.01k                          87.85ns    11.38M
PrefixSort                                      99.904%    87.94ns    11.37M
StdSort_no-payload_3_bigint_0.01k                          65.04ns    15.37M
PrefixSort                                      99.350%    65.47ns    15.27M
StdSort_no-payload_4_bigint_0.01k                          69.19ns    14.45M
PrefixSort                                      99.566%    69.49ns    14.39M
StdSort_no-payload_1_bigint_0.015k                         47.93ns    20.86M
PrefixSort                                      100.02%    47.92ns    20.87M
StdSort_no-payload_2_bigint_0.015k                         54.22ns    18.44M
PrefixSort                                      99.921%    54.26ns    18.43M
StdSort_no-payload_3_bigint_0.015k                         61.00ns    16.39M
PrefixSort                                      99.958%    61.02ns    16.39M
StdSort_no-payload_4_bigint_0.015k                         57.38ns    17.43M
PrefixSort                                      99.870%    57.46ns    17.40M
StdSort_no-payload_1_bigint_0.02k                          47.82ns    20.91M
PrefixSort                                      99.914%    47.86ns    20.90M
StdSort_no-payload_2_bigint_0.02k                          83.94ns    11.91M
PrefixSort                                      100.00%    83.93ns    11.91M
StdSort_no-payload_3_bigint_0.02k                         128.43ns     7.79M
PrefixSort                                      100.17%   128.21ns     7.80M
StdSort_no-payload_4_bigint_0.02k                         165.29ns     6.05M
PrefixSort                                      99.997%   165.30ns     6.05M
StdSort_no-payload_1_bigint_0.05k                          77.34ns    12.93M
PrefixSort                                      99.425%    77.79ns    12.86M
StdSort_no-payload_2_bigint_0.05k                         113.79ns     8.79M
PrefixSort                                      99.786%   114.03ns     8.77M
StdSort_no-payload_3_bigint_0.05k                         152.81ns     6.54M
PrefixSort                                      99.696%   153.27ns     6.52M
StdSort_no-payload_4_bigint_0.05k                         185.93ns     5.38M
PrefixSort                                      100.02%   185.90ns     5.38M
StdSort_no-payload_1_bigint_0.1k                           87.39ns    11.44M
PrefixSort                                      99.499%    87.83ns    11.39M
StdSort_no-payload_2_bigint_0.1k                          139.93ns     7.15M
PrefixSort                                      162.24%    86.25ns    11.59M
StdSort_no-payload_3_bigint_0.1k                          186.27ns     5.37M
PrefixSort                                      186.72%    99.76ns    10.02M
StdSort_no-payload_4_bigint_0.1k                          234.01ns     4.27M
PrefixSort                                      187.97%   124.49ns     8.03M
StdSort_no-payloads_1_bigint_1k                           173.31ns     5.77M
PrefixSort                                      136.72%   126.76ns     7.89M
StdSort_no-payloads_2_bigint_1k                           249.77ns     4.00M
PrefixSort                                      199.49%   125.20ns     7.99M
StdSort_no-payloads_3_bigint_1k                           314.18ns     3.18M
PrefixSort                                      219.49%   143.14ns     6.99M
StdSort_no-payloads_4_bigint_1k                           348.38ns     2.87M
PrefixSort                                      203.28%   171.38ns     5.84M
StdSort_no-payloads_1_bigint_10k                          251.90ns     3.97M
PrefixSort                                      165.99%   151.76ns     6.59M
StdSort_no-payloads_2_bigint_10k                          363.09ns     2.75M
PrefixSort                                      253.07%   143.47ns     6.97M
StdSort_no-payloads_3_bigint_10k                          483.58ns     2.07M
PrefixSort                                      293.67%   164.67ns     6.07M
StdSort_no-payloads_4_bigint_10k                          593.29ns     1.69M
PrefixSort                                      312.83%   189.65ns     5.27M
StdSort_no-payloads_1_bigint_100k                         330.44ns     3.03M
PrefixSort                                      192.57%   171.59ns     5.83M
StdSort_no-payloads_2_bigint_100k                         470.79ns     2.12M
PrefixSort                                      293.67%   160.31ns     6.24M
StdSort_no-payloads_3_bigint_100k                         607.15ns     1.65M
PrefixSort                                      303.88%   199.80ns     5.01M
StdSort_no-payloads_4_bigint_100k                         706.03ns     1.42M
PrefixSort                                      315.15%   224.03ns     4.46M
StdSort_no-payloads_1_bigint_1000k                        452.05ns     2.21M
PrefixSort                                      204.92%   220.60ns     4.53M
StdSort_no-payloads_2_bigint_1000k                        645.35ns     1.55M
PrefixSort                                      306.42%   210.61ns     4.75M
StdSort_no-payloads_3_bigint_1000k                        818.78ns     1.22M
PrefixSort                                      328.01%   249.62ns     4.01M
StdSort_no-payloads_4_bigint_1000k                        981.65ns     1.02M
PrefixSort                                      343.79%   285.54ns     3.50M
StdSort_2payloads_1_bigint_1k                             177.21ns     5.64M
PrefixSort                                      139.94%   126.63ns     7.90M
StdSort_2payloads_2_bigint_1k                             248.46ns     4.02M
PrefixSort                                      199.08%   124.80ns     8.01M
StdSort_2payloads_3_bigint_1k                             313.66ns     3.19M
PrefixSort                                      218.48%   143.56ns     6.97M
StdSort_2payloads_4_bigint_1k                             359.17ns     2.78M
PrefixSort                                      208.57%   172.21ns     5.81M
StdSort_2payloads_1_bigint_10k                            254.83ns     3.92M
PrefixSort                                      168.28%   151.43ns     6.60M
StdSort_2payloads_2_bigint_10k                            363.92ns     2.75M
PrefixSort                                      254.35%   143.08ns     6.99M
StdSort_2payloads_3_bigint_10k                            475.61ns     2.10M
PrefixSort                                      288.96%   164.60ns     6.08M
StdSort_2payloads_4_bigint_10k                            594.78ns     1.68M
PrefixSort                                      314.12%   189.35ns     5.28M
StdSort_2payloads_1_bigint_100k                           349.99ns     2.86M
PrefixSort                                      205.28%   170.49ns     5.87M
StdSort_2payloads_2_bigint_100k                           489.03ns     2.04M
PrefixSort                                      307.77%   158.89ns     6.29M
StdSort_2payloads_3_bigint_100k                           607.88ns     1.65M
PrefixSort                                      305.62%   198.90ns     5.03M
StdSort_2payloads_4_bigint_100k                           715.90ns     1.40M
PrefixSort                                      321.73%   222.52ns     4.49M
StdSort_2payloads_1_bigint_1000k                          574.14ns     1.74M
PrefixSort                                      262.05%   219.10ns     4.56M
StdSort_2payloads_2_bigint_1000k                          796.05ns     1.26M
PrefixSort                                      377.73%   210.75ns     4.75M
StdSort_2payloads_3_bigint_1000k                            1.02us   975.65K
PrefixSort                                      411.02%   249.37ns     4.01M
StdSort_2payloads_4_bigint_1000k                            1.16us   858.48K
PrefixSort                                      408.37%   285.24ns     3.51M
StdSort_2payloads_1_bigint_0.01k                           49.37ns    20.26M
PrefixSort                                      99.124%    49.81ns    20.08M
StdSort_2payloads_2_bigint_0.01k                           89.08ns    11.23M
PrefixSort                                      99.958%    89.12ns    11.22M
StdSort_2payloads_3_bigint_0.01k                           64.40ns    15.53M
PrefixSort                                      99.991%    64.41ns    15.53M
StdSort_2payloads_4_bigint_0.01k                           86.56ns    11.55M
PrefixSort                                      100.34%    86.26ns    11.59M
StdSort_2payloads_1_bigint_0.015k                          48.17ns    20.76M
PrefixSort                                      100.11%    48.12ns    20.78M
StdSort_2payloads_2_bigint_0.015k                          55.44ns    18.04M
PrefixSort                                      99.994%    55.45ns    18.03M
StdSort_2payloads_3_bigint_0.015k                          61.17ns    16.35M
PrefixSort                                      99.988%    61.18ns    16.35M
StdSort_2payloads_4_bigint_0.015k                          57.55ns    17.38M
PrefixSort                                      99.895%    57.61ns    17.36M
StdSort_2payloads_1_bigint_0.02k                           47.93ns    20.86M
PrefixSort                                      99.916%    47.97ns    20.85M
StdSort_2payloads_2_bigint_0.02k                           84.10ns    11.89M
PrefixSort                                      100.38%    83.78ns    11.94M
StdSort_2payloads_3_bigint_0.02k                          126.53ns     7.90M
PrefixSort                                      100.05%   126.47ns     7.91M
StdSort_2payloads_4_bigint_0.02k                          164.44ns     6.08M
PrefixSort                                      99.935%   164.55ns     6.08M
StdSort_2payloads_1_bigint_0.05k                           77.86ns    12.84M
PrefixSort                                      99.171%    78.51ns    12.74M
StdSort_2payloads_2_bigint_0.05k                          118.10ns     8.47M
PrefixSort                                      100.53%   117.48ns     8.51M
StdSort_2payloads_3_bigint_0.05k                          152.74ns     6.55M
PrefixSort                                      100.02%   152.71ns     6.55M
StdSort_2payloads_4_bigint_0.05k                          184.56ns     5.42M
PrefixSort                                      99.925%   184.70ns     5.41M
StdSort_2payloads_1_bigint_0.1k                            88.01ns    11.36M
PrefixSort                                      100.46%    87.60ns    11.42M
StdSort_2payloads_2_bigint_0.1k                           138.22ns     7.24M
PrefixSort                                      159.92%    86.43ns    11.57M
StdSort_2payloads_3_bigint_0.1k                           187.49ns     5.33M
PrefixSort                                      187.96%    99.75ns    10.03M
StdSort_2payloads_4_bigint_0.1k                           232.52ns     4.30M
PrefixSort                                      188.98%   123.04ns     8.13M
StdSort_no-payloads_1_varchar_1k                          292.32ns     3.42M
PrefixSort                                      100.47%   290.96ns     3.44M
StdSort_no-payloads_2_varchar_1k                          380.98ns     2.62M
PrefixSort                                      98.623%   386.29ns     2.59M
StdSort_no-payloads_3_varchar_1k                          456.15ns     2.19M
PrefixSort                                      98.302%   464.03ns     2.16M
StdSort_no-payloads_4_varchar_1k                          520.84ns     1.92M
PrefixSort                                      98.186%   530.46ns     1.89M
StdSort_no-payloads_1_varchar_10k                         422.83ns     2.37M
PrefixSort                                      99.186%   426.30ns     2.35M
StdSort_no-payloads_2_varchar_10k                         495.10ns     2.02M
PrefixSort                                      98.218%   504.08ns     1.98M
StdSort_no-payloads_3_varchar_10k                         584.89ns     1.71M
PrefixSort                                      99.079%   590.33ns     1.69M
StdSort_no-payloads_4_varchar_10k                         667.37ns     1.50M
PrefixSort                                      98.887%   674.88ns     1.48M
StdSort_no-payloads_1_varchar_100k                        605.27ns     1.65M
PrefixSort                                      99.425%   608.78ns     1.64M
StdSort_no-payloads_2_varchar_100k                        741.11ns     1.35M
PrefixSort                                      99.107%   747.78ns     1.34M
StdSort_no-payloads_3_varchar_100k                        890.60ns     1.12M
PrefixSort                                      99.089%   898.78ns     1.11M
StdSort_no-payloads_4_varchar_100k                          1.11us   903.14K
PrefixSort                                      104.50%     1.06us   943.76K
StdSort_no-payloads_1_varchar_1000k                         1.22us   822.83K
PrefixSort                                      99.534%     1.22us   818.99K
StdSort_no-payloads_2_varchar_1000k                         1.52us   656.78K
PrefixSort                                      99.353%     1.53us   652.53K
StdSort_no-payloads_3_varchar_1000k                         1.78us   560.23K
PrefixSort                                      98.862%     1.81us   553.86K
StdSort_no-payloads_4_varchar_1000k                         1.93us   519.34K
PrefixSort                                      99.159%     1.94us   514.97K
```

Part of https://github.com/facebookincubator/velox/issues/6766

Pull Request resolved: https://github.com/facebookincubator/velox/pull/8146

Reviewed By: Yuhta

Differential Revision: D54247347

Pulled By: mbasmanova

fbshipit-source-id: b358925e6b702de5bb2eee55df24827b51268923
---
 velox/exec/CMakeLists.txt                     |   1 +
 velox/exec/PrefixSort.cpp                     | 255 ++++++++++++++
 velox/exec/PrefixSort.h                       | 179 ++++++++++
 velox/exec/benchmarks/CMakeLists.txt          |   5 +
 velox/exec/benchmarks/PrefixSortBenchmark.cpp | 324 ++++++++++++++++++
 velox/exec/prefixsort/PrefixSortEncoder.h     |  26 ++
 velox/exec/tests/CMakeLists.txt               |   3 +-
 velox/exec/tests/PrefixSortTest.cpp           | 271 +++++++++++++++
 8 files changed, 1063 insertions(+), 1 deletion(-)
 create mode 100644 velox/exec/PrefixSort.cpp
 create mode 100644 velox/exec/PrefixSort.h
 create mode 100644 velox/exec/benchmarks/PrefixSortBenchmark.cpp
 create mode 100644 velox/exec/tests/PrefixSortTest.cpp

diff --git a/velox/exec/CMakeLists.txt b/velox/exec/CMakeLists.txt
index 99f789b1785d..de522d149f0f 100644
--- a/velox/exec/CMakeLists.txt
+++ b/velox/exec/CMakeLists.txt
@@ -53,6 +53,7 @@ add_library(
   NestedLoopJoinProbe.cpp
   Operator.cpp
   OperatorUtils.cpp
+  PrefixSort.cpp
   OrderBy.cpp
   PartitionedOutput.cpp
   OutputBuffer.cpp
diff --git a/velox/exec/PrefixSort.cpp b/velox/exec/PrefixSort.cpp
new file mode 100644
index 000000000000..c5ea46a9bf9f
--- /dev/null
+++ b/velox/exec/PrefixSort.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/exec/PrefixSort.h"
+
+using namespace facebook::velox::exec::prefixsort;
+
+namespace facebook::velox::exec {
+
+namespace {
+
+// Prefixes are padded to a multiple of kAlignment bytes so that they can be
+// byte-swapped and compared one word at a time. 8 is faster than 4; changing
+// it to 4 requires switching the uint64_t / __builtin_bswap64 code to 32 bits.
+const int32_t kAlignment = 8;
+
+// Encodes key 'index' of 'row' (std::nullopt if null) at its prefix offset.
+template <typename T>
+FOLLY_ALWAYS_INLINE void encodeRowColumn(
+    const PrefixSortLayout& prefixSortLayout,
+    const uint32_t index,
+    const RowColumn& rowColumn,
+    char* const row,
+    char* const prefix) {
+  const bool isNull =
+      RowContainer::isNullAt(row, rowColumn.nullByte(), rowColumn.nullMask());
+  std::optional<T> value = isNull
+      ? std::nullopt
+      : std::make_optional(*reinterpret_cast<T*>(row + rowColumn.offset()));
+  char* const target = prefix + prefixSortLayout.prefixOffsets[index];
+  prefixSortLayout.encoders[index].encode(value, target);
+}
+
+// Encodes the sort key at 'index' of 'row' into 'prefix', dispatching on
+// 'typeKind' to the C++ type used to read the value from the row. Only the
+// fixed-width kinds listed below are handled; any other kind is rejected
+// with VELOX_UNSUPPORTED. TIMESTAMP is read as a Timestamp value, the other
+// kinds as the corresponding primitive type.
+FOLLY_ALWAYS_INLINE void extractRowColumnToPrefix(
+    TypeKind typeKind,
+    const PrefixSortLayout& prefixSortLayout,
+    const uint32_t index,
+    const RowColumn& rowColumn,
+    char* const row,
+    char* const prefix) {
+  switch (typeKind) {
+    case TypeKind::INTEGER:
+      encodeRowColumn<int32_t>(prefixSortLayout, index, rowColumn, row, prefix);
+      return;
+    case TypeKind::BIGINT:
+      encodeRowColumn<int64_t>(prefixSortLayout, index, rowColumn, row, prefix);
+      return;
+    case TypeKind::REAL:
+      encodeRowColumn<float>(prefixSortLayout, index, rowColumn, row, prefix);
+      return;
+    case TypeKind::DOUBLE:
+      encodeRowColumn<double>(prefixSortLayout, index, rowColumn, row, prefix);
+      return;
+    case TypeKind::TIMESTAMP:
+      encodeRowColumn<Timestamp>(
+          prefixSortLayout, index, rowColumn, row, prefix);
+      return;
+    default:
+      VELOX_UNSUPPORTED(
+          "prefix-sort does not support type kind: {}",
+          mapTypeKindToName(typeKind));
+  }
+}
+
+// Returns the bytes needed to round 'size' up to a multiple of 'alignment'.
+FOLLY_ALWAYS_INLINE int32_t alignmentPadding(int32_t size, int32_t alignment) {
+  return size % alignment == 0 ? 0 : alignment - size % alignment;
+}
+
+// Byte-reverses each whole 8-byte word in [address, address + bytes). Stops at
+// a trailing partial word rather than looping past the end of the buffer.
+FOLLY_ALWAYS_INLINE void bitsSwapByWord(uint64_t* address, int32_t bytes) {
+  for (; bytes >= kAlignment; bytes -= kAlignment, ++address) {
+    *address = __builtin_bswap64(*address);
+  }
+}
+
+// Three-way compares two prefixes one uint64_t word at a time, starting at
+// the lowest address. Returns 0 if all compared words are equal, otherwise 1
+// or -1 depending on whether the first differing word of 'left' is greater
+// or smaller than that of 'right'. Callers byte-swap the words beforehand
+// (see bitsSwapByWord) so that this matches a byte-wise compare of the
+// encoded keys. 'bytes' is expected to be a multiple of kAlignment; only
+// whole words are compared, so a trailing partial word is ignored rather
+// than read past the end of the buffer.
+FOLLY_ALWAYS_INLINE int
+compareByWord(uint64_t* left, uint64_t* right, int32_t bytes) {
+  for (; bytes >= kAlignment; bytes -= kAlignment, ++left, ++right) {
+    if (*left != *right) {
+      return *left > *right ? 1 : -1;
+    }
+  }
+  return 0;
+}
+
+} // namespace
+
+/// Builds the layout: keys are normalized in order until one has no encoded
+/// size or would grow the prefix past 'maxNormalizedKeySize'; the rest are
+/// non-normalized. The normalized part is padded to a multiple of kAlignment
+/// and followed by the row address pointer.
+PrefixSortLayout PrefixSortLayout::makeSortLayout(
+    const std::vector<TypePtr>& types,
+    const std::vector<CompareFlags>& compareFlags,
+    uint32_t maxNormalizedKeySize) {
+  const uint32_t numKeys = types.size();
+  std::vector<uint32_t> prefixOffsets;
+  std::vector<PrefixSortEncoder> encoders;
+
+  uint32_t normalizedKeySize = 0;
+  uint32_t numNormalizedKeys = 0;
+  for (uint32_t i = 0; i < numKeys; ++i) {
+    const std::optional<uint32_t> encodedSize =
+        PrefixSortEncoder::encodedSize(types[i]->kind());
+    // Stop at the first key that cannot be normalized or does not fit. Check
+    // before adding so the unpadded prefix never exceeds the limit.
+    if (!encodedSize.has_value() ||
+        normalizedKeySize + encodedSize.value() > maxNormalizedKeySize) {
+      break;
+    }
+    prefixOffsets.push_back(normalizedKeySize);
+    encoders.push_back({compareFlags[i].ascending, compareFlags[i].nullsFirst});
+    normalizedKeySize += encodedSize.value();
+    ++numNormalizedKeys;
+  }
+  const auto padding = alignmentPadding(normalizedKeySize, kAlignment);
+  normalizedKeySize += padding;
+  return PrefixSortLayout{
+      normalizedKeySize + sizeof(char*),
+      normalizedKeySize,
+      numNormalizedKeys,
+      numKeys,
+      compareFlags,
+      numNormalizedKeys == 0,
+      numNormalizedKeys < numKeys,
+      std::move(prefixOffsets),
+      std::move(encoders),
+      padding};
+}
+
+FOLLY_ALWAYS_INLINE int PrefixSort::compareAllNormalizedKeys(
+    char* left,
+    char* right) {
+  return compareByWord(
+      reinterpret_cast<uint64_t*>(left),
+      reinterpret_cast<uint64_t*>(right),
+      sortLayout_.normalizedBufferSize);
+}
+
+// Compares normalized keys; on a tie, compares the remaining keys via rows.
+int PrefixSort::comparePartNormalizedKeys(char* left, char* right) {
+  if (const int result = compareAllNormalizedKeys(left, right)) {
+    return result;
+  }
+  char* const leftRow = getAddressFromPrefix(left);
+  char* const rightRow = getAddressFromPrefix(right);
+  for (auto i = sortLayout_.numNormalizedKeys; i < sortLayout_.numKeys; ++i) {
+    if (const int result = rowContainer_->compare(
+            leftRow, rightRow, i, sortLayout_.compareFlags[i])) {
+      return result;
+    }
+  }
+  return 0;
+}
+
+PrefixSort::PrefixSort(
+    memory::MemoryPool* pool,
+    RowContainer* rowContainer,
+    const std::vector<CompareFlags>& keyCompareFlags, // Currently unused.
+    const PrefixSortConfig& config, // Currently unused.
+    const PrefixSortLayout& sortLayout)
+    : pool_(pool), sortLayout_(sortLayout), rowContainer_(rowContainer) {}
+
+// Fills one prefix entry for 'row': encoded normalized keys, zeroed padding,
+// then the row address. The key bytes are byte-swapped per 8-byte word so
+// that comparing them as uint64_t (on a little-endian system) gives the same
+// order as memcmp would give on the encoded bytes, with fewer comparisons.
+void PrefixSort::extractRowToPrefix(char* row, char* prefix) {
+  const auto& keyTypes = rowContainer_->keyTypes();
+  const uint32_t numKeys = sortLayout_.numNormalizedKeys;
+  for (uint32_t key = 0; key < numKeys; ++key) {
+    extractRowColumnToPrefix(
+        keyTypes[key]->kind(),
+        sortLayout_,
+        key,
+        rowContainer_->columnAt(key),
+        row,
+        prefix);
+  }
+  // Zero the alignment padding that follows the last encoded key.
+  char* const paddingStart =
+      prefix + sortLayout_.normalizedBufferSize - sortLayout_.padding;
+  simd::memset(paddingStart, 0, sortLayout_.padding);
+  bitsSwapByWord(
+      reinterpret_cast<uint64_t*>(prefix), sortLayout_.normalizedBufferSize);
+  getAddressFromPrefix(prefix) = row;
+}
+
+// Sorts 'rows' in place: copies each row's normalized keys plus its address
+// into a contiguous buffer of fixed-size entries, sorts the entries, then
+// writes the row addresses back to 'rows' in sorted order.
+void PrefixSort::sortInternal(std::vector<char*>& rows) {
+  const auto entrySize = sortLayout_.entrySize;
+  const auto numRows = rows.size();
+  // Step 1: allocate the entry buffer.
+  memory::ContiguousAllocation prefixAllocation;
+  pool_->allocateContiguous(
+      memory::AllocationTraits::numPages(numRows * entrySize),
+      prefixAllocation);
+  char* const prefixes = prefixAllocation.data<char>();
+
+  // Step 2: build one entry per row.
+  for (size_t i = 0; i < numRows; ++i) {
+    extractRowToPrefix(rows[i], prefixes + entrySize * i);
+  }
+
+  // Step 3: sort the entries; the swap buffer holds one entry.
+  {
+    const auto swapBuffer = AlignedBuffer::allocate<char>(entrySize, pool_);
+    PrefixSortRunner sortRunner(entrySize, swapBuffer->asMutable<char>());
+    char* const end = prefixes + numRows * entrySize;
+    if (sortLayout_.hasNonNormalizedKey) {
+      sortRunner.quickSort(prefixes, end, [&](char* a, char* b) {
+        return comparePartNormalizedKeys(a, b);
+      });
+    } else {
+      sortRunner.quickSort(prefixes, end, [&](char* a, char* b) {
+        return compareAllNormalizedKeys(a, b);
+      });
+    }
+  }
+  // Step 4: copy the sorted row addresses back.
+  for (size_t i = 0; i < numRows; ++i) {
+    rows[i] = getAddressFromPrefix(prefixes + i * entrySize);
+  }
+}
+
+} // namespace facebook::velox::exec
diff --git a/velox/exec/PrefixSort.h b/velox/exec/PrefixSort.h
new file mode 100644
index 000000000000..37bdaca36951
--- /dev/null
+++ b/velox/exec/PrefixSort.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "velox/common/memory/MemoryAllocator.h"
+#include "velox/exec/RowContainer.h"
+#include "velox/exec/prefixsort/PrefixSortAlgorithm.h"
+#include "velox/exec/prefixsort/PrefixSortEncoder.h"
+
+namespace facebook::velox::exec {
+
+namespace detail {
+/// Sorts 'rows' with std::sort, comparing key by key via 'rowContainer'.
+FOLLY_ALWAYS_INLINE void stdSort(
+    std::vector<char*>& rows,
+    RowContainer* rowContainer,
+    const std::vector<CompareFlags>& compareFlags) {
+  std::sort(
+      rows.begin(), rows.end(), [&](const char* leftRow, const char* rightRow) {
+        for (auto i = 0; i < compareFlags.size(); ++i) {
+          if (auto result = rowContainer->compare(
+                  leftRow, rightRow, i, compareFlags[i])) {
+            return result < 0;
+          }
+        }
+        return false;
+      });
+}
+} // namespace detail
+
+struct PrefixSortConfig {
+  PrefixSortConfig(uint32_t maxNormalizedKeySize, uint32_t threshold = 130)
+      : maxNormalizedKeySize(maxNormalizedKeySize), threshold(threshold) {}
+
+  /// Maximum number of bytes of normalized keys stored per entry in the
+  /// prefix-sort buffer.
+  const uint32_t maxNormalizedKeySize;
+
+  /// PrefixSort regresses on small datasets, so inputs with fewer rows than
+  /// this threshold are sorted with std::sort instead. Defaults to 130 (the
+  /// constructor default), chosen from benchmark results.
+  const int64_t threshold;
+};
+
+/// The layout of the prefix-sort buffer. A prefix entry consists of:
+/// 1. the normalized keys, padded to a multiple of 8 bytes;
+/// 2. (not implemented yet) a pointer to non-normalized data for
+/// semi-normalized types such as Varchar's string_view;
+/// 3. the address of the corresponding RowContainer row, stored at the end
+/// of the entry.
+struct PrefixSortLayout {
+  /// Number of bytes in one prefix entry. Currently
+  /// normalizedBufferSize + sizeof(char*) (the row address).
+  const uint64_t entrySize;
+
+  /// Number of bytes in an entry taken by the normalized keys, i.e. the sort
+  /// keys that can be encoded into the prefix, including alignment padding.
+  const uint32_t normalizedBufferSize;
+
+  const uint32_t numNormalizedKeys;
+
+  /// Total number of sort keys, both normalized and non-normalized.
+  const uint32_t numKeys;
+
+  /// CompareFlags of all sort keys.
+  const std::vector<CompareFlags> compareFlags;
+
+  /// True if none of the sort keys is normalized. Caches
+  /// 'numNormalizedKeys == 0' so callers can test a flag.
+  const bool noNormalizedKeys;
+
+  /// True if at least one sort key is not normalized.
+  const bool hasNonNormalizedKey;
+
+  /// Byte offsets of the normalized keys within a prefix, used to find the
+  /// write location when encoding each key.
+  const std::vector<uint32_t> prefixOffsets;
+
+  /// The encoders for normalized keys.
+  const std::vector<prefixsort::PrefixSortEncoder> encoders;
+
+  /// Number of padding bytes that round the normalized keys up to a multiple
+  /// of 8, so that 64-bit word compares can replace byte-wise 'memcmp'.
+  const int32_t padding;
+
+  static PrefixSortLayout makeSortLayout(
+      const std::vector<TypePtr>& types,
+      const std::vector<CompareFlags>& compareFlags,
+      uint32_t maxNormalizedKeySize);
+};
+
+class PrefixSort {
+ public:
+  PrefixSort(
+      memory::MemoryPool* pool,
+      RowContainer* rowContainer,
+      const std::vector<CompareFlags>& keyCompareFlags,
+      const PrefixSortConfig& config,
+      const PrefixSortLayout& sortLayout);
+
+  /// Sorts the rows of a RowContainer in the following steps:
+  /// 1. Allocate a contiguous block of memory to store the prefixes.
+  /// 2. Extract the sort keys from the RowContainer. Keys that can be
+  /// normalized are encoded and stored, together with the address of the
+  /// original row, in a buffer entry called a 'prefix'.
+  /// 3. Sort the prefixes built in step 2.
+  /// Normalized keys (currently the supported fixed-width types) are
+  /// compared as binary strings, as 'memcmp' would compare them.
+  /// Keys that cannot be normalized are compared with the RowContainer's
+  /// compare method.
+  /// Keys that can be partly normalized (Varchar, Row etc.) are planned to
+  /// store their normalized part plus a pointer to the raw data in the
+  /// prefix, with a custom comparison defined in PrefixSortLayout. Until
+  /// that follow-up lands they are treated as non-normalized.
+  /// Complex types, e.g. a ROW that can be converted to scalar types, are
+  /// also planned to be supported.
+  /// 4. Copy the original row addresses out of the sorted prefixes back into
+  /// the input 'rows' vector.
+  /// When the RowContainer holds fewer rows than 'config.threshold', or no
+  /// key can be normalized, std::sort is used instead.
+  ///
+  /// @param rows The result of RowContainer::listRows(), assuming that the
+  /// caller (SortBuffer etc.) has already got the result.
+  FOLLY_ALWAYS_INLINE static void sort(
+      std::vector<char*>& rows,
+      memory::MemoryPool* pool,
+      RowContainer* rowContainer,
+      const std::vector<CompareFlags>& compareFlags,
+      const PrefixSortConfig& config) {
+    if (rowContainer->numRows() < config.threshold) {
+      detail::stdSort(rows, rowContainer, compareFlags);
+      return;
+    }
+    VELOX_DCHECK_EQ(rowContainer->keyTypes().size(), compareFlags.size());
+    const auto sortLayout = PrefixSortLayout::makeSortLayout(
+        rowContainer->keyTypes(), compareFlags, config.maxNormalizedKeySize);
+    // No key can be normalized: skip the binary string compare optimization.
+    // Doing this check outside sortInternal helps std::sort get inlined.
+    if (sortLayout.noNormalizedKeys) {
+      detail::stdSort(rows, rowContainer, compareFlags);
+      return;
+    }
+
+    PrefixSort prefixSort(pool, rowContainer, compareFlags, config, sortLayout);
+    prefixSort.sortInternal(rows);
+  }
+
+ private:
+  void sortInternal(std::vector<char*>& rows);
+
+  int compareAllNormalizedKeys(char* left, char* right);
+
+  int comparePartNormalizedKeys(char* left, char* right);
+
+  void extractRowToPrefix(char* row, char* prefix);
+
+  // Returns a reference to the row address slot of 'prefix' for read/write.
+  FOLLY_ALWAYS_INLINE char*& getAddressFromPrefix(char* prefix) {
+    return *reinterpret_cast<char**>(prefix + sortLayout_.normalizedBufferSize);
+  }
+
+  memory::MemoryPool* const pool_;
+  const PrefixSortLayout sortLayout_;
+  RowContainer* const rowContainer_;
+};
+} // namespace facebook::velox::exec
diff --git a/velox/exec/benchmarks/CMakeLists.txt b/velox/exec/benchmarks/CMakeLists.txt
index 92fd47a6bb60..eac595cbadf0 100644
--- a/velox/exec/benchmarks/CMakeLists.txt
+++ b/velox/exec/benchmarks/CMakeLists.txt
@@ -49,3 +49,8 @@ if(${VELOX_ENABLE_PARQUET})
     arrow
     thrift)
 endif()
+
+add_executable(velox_prefixsort_benchmark PrefixSortBenchmark.cpp)
+
+target_link_libraries(velox_prefixsort_benchmark velox_exec velox_vector_fuzzer
+                      velox_vector_test_lib ${FOLLY_BENCHMARK})
diff --git a/velox/exec/benchmarks/PrefixSortBenchmark.cpp b/velox/exec/benchmarks/PrefixSortBenchmark.cpp
new file mode 100644
index 000000000000..4edde03fa30b
--- /dev/null
+++ b/velox/exec/benchmarks/PrefixSortBenchmark.cpp
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <folly/Benchmark.h>
+#include <folly/init/Init.h>
+
+#include "glog/logging.h"
+#include "velox/exec/PrefixSort.h"
+#include "velox/vector/fuzzer/VectorFuzzer.h"
+
+using namespace facebook::velox;
+using namespace facebook::velox::exec;
+
+namespace {
+
+class TestCase {
+ public:
+  TestCase(
+      memory::MemoryPool* pool,
+      const std::string& testName,
+      size_t numRows,
+      const RowTypePtr& rowType,
+      int numKeys)
+      : testName_(testName), numRows_(numRows), pool_(pool), rowType_(rowType) {
+    // Build a RowContainer of fuzzed rows: the first 'numKeys' columns are keys.
+    std::vector<TypePtr> keyTypes;
+    std::vector<TypePtr> dependentTypes;
+    for (auto i = 0; i < rowType->size(); ++i) {
+      if (i < numKeys) {
+        keyTypes.push_back(rowType->childAt(i));
+      } else {
+        dependentTypes.push_back(rowType->childAt(i));
+      }
+    }
+    data_ = std::make_unique<RowContainer>(keyTypes, dependentTypes, pool);
+    RowVectorPtr sortedRows = fuzzRows(numRows, numKeys);
+    storeRows(numRows, sortedRows);
+
+    // Every key gets the same CompareFlags; the benchmark does not vary them.
+    for (int i = 0; i < numKeys; ++i) {
+      compareFlags_.push_back(
+          {true, true, false, CompareFlags::NullHandlingMode::kNullAsValue});
+    }
+  };
+
+  const std::string& testName() const {
+    return testName_;
+  }
+
+  size_t numRows() const {
+    return numRows_;
+  }
+
+  const std::vector<char*>& rows() const {
+    return rows_;
+  }
+
+  RowContainer* rowContainer() const {
+    return data_.get();
+  }
+
+  const std::vector<CompareFlags>& compareFlags() const {
+    return compareFlags_;
+  }
+
+ private:
+  // Allocates the rows, then stores 'data' column by column (mocks SortBuffer).
+  void storeRows(int numRows, const RowVectorPtr& data) {
+    rows_.resize(numRows);
+    for (auto row = 0; row < numRows; ++row) {
+      rows_[row] = rowContainer()->newRow();
+    }
+    for (auto column = 0; column < data->childrenSize(); ++column) {
+      DecodedVector decoded(*data->childAt(column));
+      for (int i = 0; i < numRows; ++i) {
+        char* row = rows_[i];
+        rowContainer()->store(decoded, i, row, column);
+      }
+    }
+  }
+
+  RowVectorPtr fuzzRows(size_t numRows, int numKeys) {
+    VectorFuzzer fuzzer({.vectorSize = numRows}, pool_);
+    VectorFuzzer fuzzerWithNulls(
+        {.vectorSize = numRows, .nullRatio = 0.7}, pool_);
+    std::vector<VectorPtr> children;
+
+    // Fuzz keys: leading keys (columns 0 to numKeys - 2) use a high nullRatio
+    // so that ties are common and the later key columns get compared as well.
+    {
+      for (auto i = 0; i < numKeys - 1; ++i) {
+        children.push_back(fuzzerWithNulls.fuzz(rowType_->childAt(i)));
+      }
+      children.push_back(fuzzer.fuzz(rowType_->childAt(numKeys - 1)));
+    }
+    // Fuzz payload columns.
+    {
+      for (auto i = numKeys; i < rowType_->size(); ++i) {
+        children.push_back(fuzzer.fuzz(rowType_->childAt(i)));
+      }
+    }
+    return std::make_shared<RowVector>(
+        pool_, rowType_, nullptr, numRows, std::move(children));
+  }
+
+  const std::string testName_;
+  const size_t numRows_;
+  // Addresses of the rows stored in 'data_'.
+  std::vector<char*> rows_;
+  std::unique_ptr<RowContainer> data_;
+  memory::MemoryPool* const pool_;
+  const RowTypePtr rowType_;
+  std::vector<CompareFlags> compareFlags_;
+};
+
+// Lower the threshold (second argument), e.g. to 0, to exercise prefix-sort
+// on small datasets as well.
+static const PrefixSortConfig kDefaultSortConfig(1024, 100);
+
+// For small datasets, in some test environments, defining std-sort in the
+// benchmark file itself makes the results regress unexpectedly. With a very
+// large threshold PrefixSort::sort falls back to std-sort, so this config
+// serves as the std-sort baseline.
+static const PrefixSortConfig kStdSortConfig(
+    1024,
+    std::numeric_limits<int>::max());
+
+class PrefixSortBenchmark {
+ public:
+  PrefixSortBenchmark(memory::MemoryPool* pool) : pool_(pool) {}
+
+  void runPrefixSort(
+      const std::vector<char*>& rows,
+      RowContainer* rowContainer,
+      const std::vector<CompareFlags>& compareFlags) {
+    // Sort a copy so that every call starts from the same unsorted input.
+    std::vector<char*> sortedRows = rows;
+    PrefixSort::sort(
+        sortedRows, pool_, rowContainer, compareFlags, kDefaultSortConfig);
+  }
+  // Baseline: same call, but kStdSortConfig's threshold selects std-sort.
+  void runStdSort(
+      const std::vector<char*>& rows,
+      RowContainer* rowContainer,
+      const std::vector<CompareFlags>& compareFlags) {
+    std::vector<char*> sortedRows = rows;
+    PrefixSort::sort(
+        sortedRows, pool_, rowContainer, compareFlags, kStdSortConfig);
+  }
+
+  // Registers benchmarks programmatically instead of one BENCHMARK per case.
+  void addBenchmark(
+      const std::string& testName,
+      size_t numRows,
+      const RowTypePtr& rowType,
+      int iterations,
+      int numKeys,
+      bool testStdSort) {
+    auto testCase =
+        std::make_unique<TestCase>(pool_, testName, numRows, rowType, numKeys);
+    // Optional std-sort baseline first, then the prefix-sort benchmark.
+    {
+      if (testStdSort) {
+        folly::addBenchmark(
+            __FILE__,
+            "StdSort_" + testCase->testName(),
+            [rows = testCase->rows(),
+             container = testCase->rowContainer(),
+             sortFlags = testCase->compareFlags(),
+             iterations = iterations,
+             this]() {
+              for (auto i = 0; i < iterations; ++i) {
+                runStdSort(rows, container, sortFlags);
+              }
+              return rows.size() * iterations;
+            });
+      }
+      folly::addBenchmark(
+          __FILE__,
+          testStdSort ? "%PrefixSort" : "PrefixSort_" + testCase->testName(),
+          [rows = testCase->rows(),
+           container = testCase->rowContainer(),
+           sortFlags = testCase->compareFlags(),
+           iterations = iterations,
+           this]() {
+            for (auto i = 0; i < iterations; ++i) {
+              runPrefixSort(rows, container, sortFlags);
+            }
+            return rows.size() * iterations;
+          });
+    }
+    testCases_.push_back(std::move(testCase));
+  }
+  // Adds a case per (batchSize, i); 'numKeys[i]' belongs to 'rowTypes[i]'.
+  void benchmark(
+      const std::string& prefix,
+      const std::string& keyName,
+      const std::vector<vector_size_t>& batchSizes,
+      const std::vector<RowTypePtr>& rowTypes,
+      const std::vector<int>& numKeys,
+      int32_t iterations,
+      bool testStdSort = true) {
+    for (auto batchSize : batchSizes) {
+      for (auto i = 0; i < rowTypes.size(); ++i) {
+        const auto name = fmt::format(
+            "{}_{}_{}_{}k", prefix, numKeys[i], keyName, batchSize / 1000.0);
+        addBenchmark(
+            name, batchSize, rowTypes[i], iterations, numKeys[i], testStdSort);
+      }
+    }
+  }
+  // 1 to 4 leading BIGINT columns, optionally followed by 2 VARCHAR payloads.
+  std::vector<RowTypePtr> bigintRowTypes(bool noPayload) {
+    if (noPayload) {
+      return {
+          ROW({BIGINT()}),
+          ROW({BIGINT(), BIGINT()}),
+          ROW({BIGINT(), BIGINT(), BIGINT()}),
+          ROW({BIGINT(), BIGINT(), BIGINT(), BIGINT()}),
+      };
+    } else {
+      return {
+          ROW({BIGINT(), VARCHAR(), VARCHAR()}),
+          ROW({BIGINT(), BIGINT(), VARCHAR(), VARCHAR()}),
+          ROW({BIGINT(), BIGINT(), BIGINT(), VARCHAR(), VARCHAR()}),
+          ROW({BIGINT(), BIGINT(), BIGINT(), BIGINT(), VARCHAR(), VARCHAR()}),
+      };
+    }
+  }
+
+  void bigint(
+      bool noPayload,
+      int numIterations,
+      const std::vector<vector_size_t>& batchSizes) {
+    std::vector<RowTypePtr> rowTypes = bigintRowTypes(noPayload);
+    std::vector<int> numKeys = {1, 2, 3, 4};
+    benchmark(
+        noPayload ? "no-payload" : "payload",
+        "bigint",
+        batchSizes,
+        rowTypes,
+        numKeys,
+        numIterations);
+  }
+
+  void smallBigint() {
+    // Small datasets need many iterations so that the benchmark runs for long
+    // enough to be measured.
+    const auto iterations = 100'000;
+    const std::vector<vector_size_t> batchSizes = {10, 50, 100, 500};
+    bigint(true, iterations, batchSizes);
+  }
+
+  void smallBigintWithPayload() {
+    const auto iterations = 100'000;
+    const std::vector<vector_size_t> batchSizes = {10, 50, 100, 500};
+    bigint(false, iterations, batchSizes);
+  }
+
+  void largeBigint() {
+    const auto iterations = 10;
+    const std::vector<vector_size_t> batchSizes = {
+        1'000, 10'000, 100'000, 1'000'000};
+    bigint(true, iterations, batchSizes);
+  }
+
+  void largeBigintWithPayloads() {
+    const auto iterations = 10;
+    const std::vector<vector_size_t> batchSizes = {
+        1'000, 10'000, 100'000, 1'000'000};
+    bigint(false, iterations, batchSizes);
+  }
+
+  void largeVarchar() {
+    const auto iterations = 10;
+    const std::vector<vector_size_t> batchSizes = {
+        1'000, 10'000, 100'000, 1'000'000};
+    std::vector<RowTypePtr> rowTypes = {
+        ROW({VARCHAR()}),
+        ROW({VARCHAR(), VARCHAR()}),
+        ROW({VARCHAR(), VARCHAR(), VARCHAR()}),
+        ROW({VARCHAR(), VARCHAR(), VARCHAR(), VARCHAR()}),
+    };
+    std::vector<int> numKeys = {1, 2, 3, 4};
+    benchmark(
+        "no-payloads", "varchar", batchSizes, rowTypes, numKeys, iterations);
+  }
+
+ private:
+  std::vector<std::unique_ptr<TestCase>> testCases_;
+  memory::MemoryPool* pool_;
+};
+} // namespace
+
+int main(int argc, char** argv) {
+  folly::Init init(&argc, &argv);
+  // Create the leaf memory pool that is handed to every benchmark case.
+  memory::MemoryManager::initialize({});
+  auto rootPool = memory::memoryManager()->addRootPool();
+  auto leafPool = rootPool->addLeafChild("leaf");
+
+  PrefixSortBenchmark bm(leafPool.get());
+  // Register all cases (this also builds their input data), then run them.
+  bm.smallBigint();
+  bm.largeBigint();
+  bm.largeBigintWithPayloads();
+  bm.smallBigintWithPayload();
+  bm.largeVarchar();
+  folly::runBenchmarks();
+
+  return 0;
+}
diff --git a/velox/exec/prefixsort/PrefixSortEncoder.h b/velox/exec/prefixsort/PrefixSortEncoder.h
index 7408390ecabf..1323c43a4eb9 100644
--- a/velox/exec/prefixsort/PrefixSortEncoder.h
+++ b/velox/exec/prefixsort/PrefixSortEncoder.h
@@ -23,6 +23,7 @@
 #include "velox/common/base/Exceptions.h"
 #include "velox/common/base/SimdUtil.h"
 #include "velox/type/Timestamp.h"
+#include "velox/type/Type.h"
 
 namespace facebook::velox::exec::prefixsort {
 
@@ -65,6 +66,31 @@ class PrefixSortEncoder {
     return nullsFirst_;
   }
 
+  /// @return Encoded size for supported types, assuming the key is nullable
+  ///         (presumably value width + 1 null byte); else 'std::nullopt'.
+  FOLLY_ALWAYS_INLINE static std::optional<uint32_t> encodedSize(
+      TypeKind typeKind) {
+    switch ((typeKind)) {
+      case ::facebook::velox::TypeKind::INTEGER: {
+        return 5;
+      }
+      case ::facebook::velox::TypeKind::BIGINT: {
+        return 9;
+      }
+      case ::facebook::velox::TypeKind::REAL: {
+        return 5;
+      }
+      case ::facebook::velox::TypeKind::DOUBLE: {
+        return 9;
+      }
+      case ::facebook::velox::TypeKind::TIMESTAMP: {
+        return 17;
+      }
+      default:
+        return std::nullopt;
+    }
+  }
+
  private:
   const bool ascending_;
   const bool nullsFirst_;
diff --git a/velox/exec/tests/CMakeLists.txt b/velox/exec/tests/CMakeLists.txt
index 263797e36019..6310df860d5d 100644
--- a/velox/exec/tests/CMakeLists.txt
+++ b/velox/exec/tests/CMakeLists.txt
@@ -81,7 +81,8 @@ add_executable(
   ValuesTest.cpp
   WindowFunctionRegistryTest.cpp
   WindowTest.cpp
-  SortBufferTest.cpp)
+  SortBufferTest.cpp
+  PrefixSortTest.cpp)
 
 add_executable(
   velox_exec_infra_test
diff --git a/velox/exec/tests/PrefixSortTest.cpp b/velox/exec/tests/PrefixSortTest.cpp
new file mode 100644
index 000000000000..ad09e28373a1
--- /dev/null
+++ b/velox/exec/tests/PrefixSortTest.cpp
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "velox/exec/PrefixSort.h"
+#include "velox/exec/tests/utils/OperatorTestBase.h"
+
+namespace facebook::velox::exec::prefixsort::test {
+namespace {
+
+class PrefixSortTest : public exec::test::OperatorTestBase {
+ protected:
+  std::vector<char*>
+  storeRows(int numRows, const RowVectorPtr& sortedRows, RowContainer* data);
+
+  static constexpr CompareFlags kAsc{
+      true,
+      true,
+      false,
+      CompareFlags::NullHandlingMode::kNullAsValue};
+
+  static constexpr CompareFlags kDesc{
+      true,
+      false,
+      false,
+      CompareFlags::NullHandlingMode::kNullAsValue};
+  // Sorts 'data' with PrefixSort and checks the keys against a std::sort run.
+  void testPrefixSort(
+      const std::vector<CompareFlags>& compareFlags,
+      const RowVectorPtr& data) {
+    const auto numRows = data->size();
+    const auto expectedResult =
+        generateExpectedResult(compareFlags, numRows, data);
+
+    const auto rowType = asRowType(data->type());
+
+    // Columns covered by 'compareFlags' are keys; the remaining are payload.
+    const std::vector<TypePtr> keyTypes{
+        rowType->children().begin(),
+        rowType->children().begin() + compareFlags.size()};
+    const std::vector<TypePtr> payloadTypes{
+        rowType->children().begin() + compareFlags.size(),
+        rowType->children().end()};
+
+    RowContainer rowContainer(keyTypes, payloadTypes, pool_.get());
+    std::vector<char*> rows = storeRows(numRows, data, &rowContainer);
+
+    // Sort the row pointers in place with PrefixSort.
+    PrefixSort::sort(
+        rows,
+        pool_.get(),
+        &rowContainer,
+        compareFlags,
+        {1024,
+         // A threshold of 0 enables prefix-sort even for small datasets.
+         0});
+
+    // Extract only the key columns, in sorted order, for the comparison.
+    const RowVectorPtr actual =
+        BaseVector::create<RowVector>(rowType, numRows, pool_.get());
+    for (int column = 0; column < compareFlags.size(); ++column) {
+      rowContainer.extractColumn(
+          rows.data(), numRows, column, actual->childAt(column));
+    }
+
+    velox::test::assertEqualVectors(actual, expectedResult);
+  }
+
+ private:
+  // Produces the expected result by sorting the same data with std::sort.
+  const RowVectorPtr generateExpectedResult(
+      const std::vector<CompareFlags>& compareFlags,
+      int numRows,
+      const RowVectorPtr& sortedRows);
+};
+
+std::vector<char*> PrefixSortTest::storeRows(
+    int numRows,
+    const RowVectorPtr& sortedRows,
+    RowContainer* data) {
+  std::vector<char*> rows;
+  SelectivityVector allRows(numRows);
+  rows.resize(numRows);
+  for (int row = 0; row < numRows; ++row) {
+    rows[row] = data->newRow();
+  }
+  for (int column = 0; column < sortedRows->childrenSize(); ++column) {
+    DecodedVector decoded(*sortedRows->childAt(column), allRows);
+    for (int i = 0; i < numRows; ++i) {
+      char* row = rows[i];
+      data->store(decoded, i, row, column);
+    }
+  }
+  return rows;
+}
+
+const RowVectorPtr PrefixSortTest::generateExpectedResult(
+    const std::vector<CompareFlags>& compareFlags,
+    int numRows,
+    const RowVectorPtr& sortedRows) {
+  const auto rowType = asRowType(sortedRows->type());
+  const int numKeys = compareFlags.size();
+  RowContainer rowContainer(rowType->children(), pool_.get());
+  std::vector<char*> rows = storeRows(numRows, sortedRows, &rowContainer);
+  // Compare key by key; the first key that differs decides the order.
+  std::sort(
+      rows.begin(), rows.end(), [&](const char* leftRow, const char* rightRow) {
+        for (auto i = 0; i < numKeys; ++i) {
+          if (auto result =
+                  rowContainer.compare(leftRow, rightRow, i, compareFlags[i])) {
+            return result < 0;
+          }
+        }
+        return false;
+      });
+  // Only the key columns are extracted into the result.
+  const RowVectorPtr result =
+      BaseVector::create<RowVector>(rowType, numRows, pool_.get());
+  for (int column = 0; column < compareFlags.size(); ++column) {
+    rowContainer.extractColumn(
+        rows.data(), numRows, column, result->childAt(column));
+  }
+  return result;
+}
+
+TEST_F(PrefixSortTest, singleKey) {
+  // One single-column input per key type, without nulls. Every column is
+  // sorted in both directions and checked against std::sort by
+  // testPrefixSort; the trace names the type of a failing column.
+  const std::vector<VectorPtr> testData = {
+      makeFlatVector<int64_t>({5, 4, 3, 2, 1}),
+      makeFlatVector<int32_t>({5, 4, 3, 2, 1}),
+      makeFlatVector<int16_t>({5, 4, 3, 2, 1}),
+      makeFlatVector<float>({5.5, 4.4, 3.3, 2.2, 1.1}),
+      makeFlatVector<double>({5.5, 4.4, 3.3, 2.2, 1.1}),
+      makeFlatVector<Timestamp>(
+          {Timestamp(5, 5),
+           Timestamp(4, 4),
+           Timestamp(3, 3),
+           Timestamp(2, 2),
+           Timestamp(1, 1)}),
+      makeFlatVector<std::string_view>({"eee", "ddd", "ccc", "bbb", "aaa"})};
+  for (const auto& column : testData) {
+    SCOPED_TRACE(column->type()->toString());
+    const auto data = makeRowVector({column});
+
+    testPrefixSort({kAsc}, data);
+    testPrefixSort({kDesc}, data);
+  }
+}
+
+TEST_F(PrefixSortTest, singleKeyWithNulls) {
+  // One single-column input per key type, each with a null in the middle.
+  // Every column is sorted in both directions and checked against std::sort
+  // by testPrefixSort, which covers the placement of the null as well; the
+  // trace names the type of a failing column.
+  const std::vector<VectorPtr> testData = {
+      makeNullableFlatVector<int64_t>({5, 4, std::nullopt, 2, 1}),
+      makeNullableFlatVector<int32_t>({5, 4, std::nullopt, 2, 1}),
+      makeNullableFlatVector<int16_t>({5, 4, std::nullopt, 2, 1}),
+      makeNullableFlatVector<float>({5.5, 4.4, std::nullopt, 2.2, 1.1}),
+      makeNullableFlatVector<double>({5.5, 4.4, std::nullopt, 2.2, 1.1}),
+      makeNullableFlatVector<Timestamp>(
+          {Timestamp(5, 5),
+           Timestamp(4, 4),
+           std::nullopt,
+           Timestamp(2, 2),
+           Timestamp(1, 1)}),
+      makeNullableFlatVector<std::string_view>(
+          {"eee", "ddd", std::nullopt, "bbb", "aaa"})};
+
+  for (const auto& column : testData) {
+    SCOPED_TRACE(column->type()->toString());
+    const auto data = makeRowVector({column});
+
+    testPrefixSort({kAsc}, data);
+    testPrefixSort({kDesc}, data);
+  }
+}
+
+TEST_F(PrefixSortTest, multipleKeys) {
+  // All keys can be normalized: bigint, integer.
+  {
+    const auto data = makeRowVector({
+        makeNullableFlatVector<int64_t>({5, 2, std::nullopt, 2, 1}),
+        makeNullableFlatVector<int32_t>({5, 4, std::nullopt, 2, 1}),
+    });
+
+    testPrefixSort({kAsc, kAsc}, data);
+    testPrefixSort({kDesc, kDesc}, data);
+  }
+
+  // Semi-normalized keys, only the leading one is normalized: bigint, varchar.
+  {
+    const auto data = makeRowVector({
+        makeNullableFlatVector<int64_t>({5, 2, std::nullopt, 2, 1}),
+        makeNullableFlatVector<std::string_view>(
+            {"eee", "ddd", std::nullopt, "bbb", "aaa"}),
+    });
+
+    testPrefixSort({kAsc, kAsc}, data);
+    testPrefixSort({kDesc, kDesc}, data);
+  }
+}
+
+TEST_F(PrefixSortTest, fuzz) {
+  std::vector<TypePtr> allTypes = {
+      INTEGER(),
+      BOOLEAN(),
+      TINYINT(),
+      SMALLINT(),
+      BIGINT(),
+      HUGEINT(),
+      REAL(),
+      DOUBLE(),
+      TIMESTAMP(),
+      VARCHAR(),
+      VARBINARY()};
+  const int numRows = 10240;
+  for (const auto& type : allTypes) {
+    SCOPED_TRACE(fmt::format("{}", type->toString()));
+    VectorFuzzer fuzzer({.vectorSize = numRows, .nullRatio = 0.1}, pool());
+    RowVectorPtr data = fuzzer.fuzzRow(ROW({type}));
+
+    testPrefixSort({kAsc}, data);
+    testPrefixSort({kDesc}, data);
+  }
+}
+
+TEST_F(PrefixSortTest, fuzzMulti) {
+  std::vector<TypePtr> allTypes = {
+      INTEGER(),
+      BOOLEAN(),
+      TINYINT(),
+      SMALLINT(),
+      BIGINT(),
+      HUGEINT(),
+      REAL(),
+      DOUBLE(),
+      TIMESTAMP(),
+      VARCHAR(),
+      VARBINARY()};
+  const int32_t numRows = 10240;
+  const TypePtr payload = VARCHAR();
+  VectorFuzzer fuzzer({.vectorSize = numRows, .nullRatio = 0.1}, pool());
+  for (const auto& type1 : allTypes) {
+    for (const auto& type2 : allTypes) {
+      SCOPED_TRACE(fmt::format("{}, {}", type1->toString(), type2->toString()));
+      RowVectorPtr data = fuzzer.fuzzRow(ROW({type1, type2, payload}));
+
+      testPrefixSort({kAsc, kAsc}, data);
+      testPrefixSort({kDesc, kDesc}, data);
+    }
+  }
+}
+} // namespace
+} // namespace facebook::velox::exec::prefixsort::test