Skip to content

Commit

Permalink
Add prefix-sort with support for fixed width sorting keys (#8146)
Browse files Browse the repository at this point in the history
Summary:
PrefixSort is used to improve in-memory sort performance by using memcmp to compare binary strings (normalized, encoded sort keys called 'prefixes') when sorting. This PR adds 'extract rows to prefix' and 'sort' for fixed-width types (int32, int64, float, double, timestamp, etc.). More types will be added in a follow-up.

Add benchmark to compare std::sort vs. prefix sort using 1, 2, 3, and 4 bigint sorting keys.

The performance of sorting up to 1000 rows is the same. When sorting more than 1K rows, prefix-sort is faster; the gains increase as the number of rows sorted and the number of sorting keys increase. The presence of the payload columns doesn't affect the performance.

```
============================================================================
[...]ec/benchmarks/PrefixSortBenchmark.cpp     relative  time/iter   iters/s
============================================================================
StdSort_no-payload_1_bigint_0.01k                          49.40ns    20.24M
PrefixSort                                      100.00%    49.40ns    20.24M
StdSort_no-payload_2_bigint_0.01k                          87.85ns    11.38M
PrefixSort                                      99.904%    87.94ns    11.37M
StdSort_no-payload_3_bigint_0.01k                          65.04ns    15.37M
PrefixSort                                      99.350%    65.47ns    15.27M
StdSort_no-payload_4_bigint_0.01k                          69.19ns    14.45M
PrefixSort                                      99.566%    69.49ns    14.39M
StdSort_no-payload_1_bigint_0.015k                         47.93ns    20.86M
PrefixSort                                      100.02%    47.92ns    20.87M
StdSort_no-payload_2_bigint_0.015k                         54.22ns    18.44M
PrefixSort                                      99.921%    54.26ns    18.43M
StdSort_no-payload_3_bigint_0.015k                         61.00ns    16.39M
PrefixSort                                      99.958%    61.02ns    16.39M
StdSort_no-payload_4_bigint_0.015k                         57.38ns    17.43M
PrefixSort                                      99.870%    57.46ns    17.40M
StdSort_no-payload_1_bigint_0.02k                          47.82ns    20.91M
PrefixSort                                      99.914%    47.86ns    20.90M
StdSort_no-payload_2_bigint_0.02k                          83.94ns    11.91M
PrefixSort                                      100.00%    83.93ns    11.91M
StdSort_no-payload_3_bigint_0.02k                         128.43ns     7.79M
PrefixSort                                      100.17%   128.21ns     7.80M
StdSort_no-payload_4_bigint_0.02k                         165.29ns     6.05M
PrefixSort                                      99.997%   165.30ns     6.05M
StdSort_no-payload_1_bigint_0.05k                          77.34ns    12.93M
PrefixSort                                      99.425%    77.79ns    12.86M
StdSort_no-payload_2_bigint_0.05k                         113.79ns     8.79M
PrefixSort                                      99.786%   114.03ns     8.77M
StdSort_no-payload_3_bigint_0.05k                         152.81ns     6.54M
PrefixSort                                      99.696%   153.27ns     6.52M
StdSort_no-payload_4_bigint_0.05k                         185.93ns     5.38M
PrefixSort                                      100.02%   185.90ns     5.38M
StdSort_no-payload_1_bigint_0.1k                           87.39ns    11.44M
PrefixSort                                      99.499%    87.83ns    11.39M
StdSort_no-payload_2_bigint_0.1k                          139.93ns     7.15M
PrefixSort                                      162.24%    86.25ns    11.59M
StdSort_no-payload_3_bigint_0.1k                          186.27ns     5.37M
PrefixSort                                      186.72%    99.76ns    10.02M
StdSort_no-payload_4_bigint_0.1k                          234.01ns     4.27M
PrefixSort                                      187.97%   124.49ns     8.03M
StdSort_no-payloads_1_bigint_1k                           173.31ns     5.77M
PrefixSort                                      136.72%   126.76ns     7.89M
StdSort_no-payloads_2_bigint_1k                           249.77ns     4.00M
PrefixSort                                      199.49%   125.20ns     7.99M
StdSort_no-payloads_3_bigint_1k                           314.18ns     3.18M
PrefixSort                                      219.49%   143.14ns     6.99M
StdSort_no-payloads_4_bigint_1k                           348.38ns     2.87M
PrefixSort                                      203.28%   171.38ns     5.84M
StdSort_no-payloads_1_bigint_10k                          251.90ns     3.97M
PrefixSort                                      165.99%   151.76ns     6.59M
StdSort_no-payloads_2_bigint_10k                          363.09ns     2.75M
PrefixSort                                      253.07%   143.47ns     6.97M
StdSort_no-payloads_3_bigint_10k                          483.58ns     2.07M
PrefixSort                                      293.67%   164.67ns     6.07M
StdSort_no-payloads_4_bigint_10k                          593.29ns     1.69M
PrefixSort                                      312.83%   189.65ns     5.27M
StdSort_no-payloads_1_bigint_100k                         330.44ns     3.03M
PrefixSort                                      192.57%   171.59ns     5.83M
StdSort_no-payloads_2_bigint_100k                         470.79ns     2.12M
PrefixSort                                      293.67%   160.31ns     6.24M
StdSort_no-payloads_3_bigint_100k                         607.15ns     1.65M
PrefixSort                                      303.88%   199.80ns     5.01M
StdSort_no-payloads_4_bigint_100k                         706.03ns     1.42M
PrefixSort                                      315.15%   224.03ns     4.46M
StdSort_no-payloads_1_bigint_1000k                        452.05ns     2.21M
PrefixSort                                      204.92%   220.60ns     4.53M
StdSort_no-payloads_2_bigint_1000k                        645.35ns     1.55M
PrefixSort                                      306.42%   210.61ns     4.75M
StdSort_no-payloads_3_bigint_1000k                        818.78ns     1.22M
PrefixSort                                      328.01%   249.62ns     4.01M
StdSort_no-payloads_4_bigint_1000k                        981.65ns     1.02M
PrefixSort                                      343.79%   285.54ns     3.50M
StdSort_2payloads_1_bigint_1k                             177.21ns     5.64M
PrefixSort                                      139.94%   126.63ns     7.90M
StdSort_2payloads_2_bigint_1k                             248.46ns     4.02M
PrefixSort                                      199.08%   124.80ns     8.01M
StdSort_2payloads_3_bigint_1k                             313.66ns     3.19M
PrefixSort                                      218.48%   143.56ns     6.97M
StdSort_2payloads_4_bigint_1k                             359.17ns     2.78M
PrefixSort                                      208.57%   172.21ns     5.81M
StdSort_2payloads_1_bigint_10k                            254.83ns     3.92M
PrefixSort                                      168.28%   151.43ns     6.60M
StdSort_2payloads_2_bigint_10k                            363.92ns     2.75M
PrefixSort                                      254.35%   143.08ns     6.99M
StdSort_2payloads_3_bigint_10k                            475.61ns     2.10M
PrefixSort                                      288.96%   164.60ns     6.08M
StdSort_2payloads_4_bigint_10k                            594.78ns     1.68M
PrefixSort                                      314.12%   189.35ns     5.28M
StdSort_2payloads_1_bigint_100k                           349.99ns     2.86M
PrefixSort                                      205.28%   170.49ns     5.87M
StdSort_2payloads_2_bigint_100k                           489.03ns     2.04M
PrefixSort                                      307.77%   158.89ns     6.29M
StdSort_2payloads_3_bigint_100k                           607.88ns     1.65M
PrefixSort                                      305.62%   198.90ns     5.03M
StdSort_2payloads_4_bigint_100k                           715.90ns     1.40M
PrefixSort                                      321.73%   222.52ns     4.49M
StdSort_2payloads_1_bigint_1000k                          574.14ns     1.74M
PrefixSort                                      262.05%   219.10ns     4.56M
StdSort_2payloads_2_bigint_1000k                          796.05ns     1.26M
PrefixSort                                      377.73%   210.75ns     4.75M
StdSort_2payloads_3_bigint_1000k                            1.02us   975.65K
PrefixSort                                      411.02%   249.37ns     4.01M
StdSort_2payloads_4_bigint_1000k                            1.16us   858.48K
PrefixSort                                      408.37%   285.24ns     3.51M
StdSort_2payloads_1_bigint_0.01k                           49.37ns    20.26M
PrefixSort                                      99.124%    49.81ns    20.08M
StdSort_2payloads_2_bigint_0.01k                           89.08ns    11.23M
PrefixSort                                      99.958%    89.12ns    11.22M
StdSort_2payloads_3_bigint_0.01k                           64.40ns    15.53M
PrefixSort                                      99.991%    64.41ns    15.53M
StdSort_2payloads_4_bigint_0.01k                           86.56ns    11.55M
PrefixSort                                      100.34%    86.26ns    11.59M
StdSort_2payloads_1_bigint_0.015k                          48.17ns    20.76M
PrefixSort                                      100.11%    48.12ns    20.78M
StdSort_2payloads_2_bigint_0.015k                          55.44ns    18.04M
PrefixSort                                      99.994%    55.45ns    18.03M
StdSort_2payloads_3_bigint_0.015k                          61.17ns    16.35M
PrefixSort                                      99.988%    61.18ns    16.35M
StdSort_2payloads_4_bigint_0.015k                          57.55ns    17.38M
PrefixSort                                      99.895%    57.61ns    17.36M
StdSort_2payloads_1_bigint_0.02k                           47.93ns    20.86M
PrefixSort                                      99.916%    47.97ns    20.85M
StdSort_2payloads_2_bigint_0.02k                           84.10ns    11.89M
PrefixSort                                      100.38%    83.78ns    11.94M
StdSort_2payloads_3_bigint_0.02k                          126.53ns     7.90M
PrefixSort                                      100.05%   126.47ns     7.91M
StdSort_2payloads_4_bigint_0.02k                          164.44ns     6.08M
PrefixSort                                      99.935%   164.55ns     6.08M
StdSort_2payloads_1_bigint_0.05k                           77.86ns    12.84M
PrefixSort                                      99.171%    78.51ns    12.74M
StdSort_2payloads_2_bigint_0.05k                          118.10ns     8.47M
PrefixSort                                      100.53%   117.48ns     8.51M
StdSort_2payloads_3_bigint_0.05k                          152.74ns     6.55M
PrefixSort                                      100.02%   152.71ns     6.55M
StdSort_2payloads_4_bigint_0.05k                          184.56ns     5.42M
PrefixSort                                      99.925%   184.70ns     5.41M
StdSort_2payloads_1_bigint_0.1k                            88.01ns    11.36M
PrefixSort                                      100.46%    87.60ns    11.42M
StdSort_2payloads_2_bigint_0.1k                           138.22ns     7.24M
PrefixSort                                      159.92%    86.43ns    11.57M
StdSort_2payloads_3_bigint_0.1k                           187.49ns     5.33M
PrefixSort                                      187.96%    99.75ns    10.03M
StdSort_2payloads_4_bigint_0.1k                           232.52ns     4.30M
PrefixSort                                      188.98%   123.04ns     8.13M
StdSort_no-payloads_1_varchar_1k                          292.32ns     3.42M
PrefixSort                                      100.47%   290.96ns     3.44M
StdSort_no-payloads_2_varchar_1k                          380.98ns     2.62M
PrefixSort                                      98.623%   386.29ns     2.59M
StdSort_no-payloads_3_varchar_1k                          456.15ns     2.19M
PrefixSort                                      98.302%   464.03ns     2.16M
StdSort_no-payloads_4_varchar_1k                          520.84ns     1.92M
PrefixSort                                      98.186%   530.46ns     1.89M
StdSort_no-payloads_1_varchar_10k                         422.83ns     2.37M
PrefixSort                                      99.186%   426.30ns     2.35M
StdSort_no-payloads_2_varchar_10k                         495.10ns     2.02M
PrefixSort                                      98.218%   504.08ns     1.98M
StdSort_no-payloads_3_varchar_10k                         584.89ns     1.71M
PrefixSort                                      99.079%   590.33ns     1.69M
StdSort_no-payloads_4_varchar_10k                         667.37ns     1.50M
PrefixSort                                      98.887%   674.88ns     1.48M
StdSort_no-payloads_1_varchar_100k                        605.27ns     1.65M
PrefixSort                                      99.425%   608.78ns     1.64M
StdSort_no-payloads_2_varchar_100k                        741.11ns     1.35M
PrefixSort                                      99.107%   747.78ns     1.34M
StdSort_no-payloads_3_varchar_100k                        890.60ns     1.12M
PrefixSort                                      99.089%   898.78ns     1.11M
StdSort_no-payloads_4_varchar_100k                          1.11us   903.14K
PrefixSort                                      104.50%     1.06us   943.76K
StdSort_no-payloads_1_varchar_1000k                         1.22us   822.83K
PrefixSort                                      99.534%     1.22us   818.99K
StdSort_no-payloads_2_varchar_1000k                         1.52us   656.78K
PrefixSort                                      99.353%     1.53us   652.53K
StdSort_no-payloads_3_varchar_1000k                         1.78us   560.23K
PrefixSort                                      98.862%     1.81us   553.86K
StdSort_no-payloads_4_varchar_1000k                         1.93us   519.34K
PrefixSort                                      99.159%     1.94us   514.97K
```

Part of #6766

Pull Request resolved: #8146

Reviewed By: Yuhta

Differential Revision: D54247347

Pulled By: mbasmanova

fbshipit-source-id: b358925e6b702de5bb2eee55df24827b51268923
  • Loading branch information
skadilover authored and facebook-github-bot committed Mar 13, 2024
1 parent de54d1e commit 85f3973
Show file tree
Hide file tree
Showing 8 changed files with 1,063 additions and 1 deletion.
1 change: 1 addition & 0 deletions velox/exec/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ add_library(
NestedLoopJoinProbe.cpp
Operator.cpp
OperatorUtils.cpp
PrefixSort.cpp
OrderBy.cpp
PartitionedOutput.cpp
OutputBuffer.cpp
Expand Down
255 changes: 255 additions & 0 deletions velox/exec/PrefixSort.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "velox/exec/PrefixSort.h"

using namespace facebook::velox::exec::prefixsort;

namespace facebook::velox::exec {

namespace {

// Size in bytes of the word used in this file to pad, byte-swap and compare
// normalized keys. For alignment, 8 is faster than 4.
// If the alignment is changed from 8 to 4, the 64-bit word operations in this
// file (__builtin_bswap64 and the uint64_t word comparison) need to be changed
// to their 32-bit counterparts.
const int32_t kAlignment = 8;

/// Encodes one sort-key column of 'row' into 'prefix'.
///
/// The value of type T is read from 'row' at the offset described by
/// 'rowColumn'. If the row's null flag for that column is set, an empty
/// optional is encoded instead. The encoder and the destination offset inside
/// 'prefix' are both selected by 'index' from 'prefixSortLayout'.
template <typename T>
FOLLY_ALWAYS_INLINE void encodeRowColumn(
    const PrefixSortLayout& prefixSortLayout,
    const uint32_t index,
    const RowColumn& rowColumn,
    char* const row,
    char* const prefix) {
  const bool isNull =
      RowContainer::isNullAt(row, rowColumn.nullByte(), rowColumn.nullMask());

  // Starts out empty, which is how a null key is represented for the encoder.
  std::optional<T> value;
  if (!isNull) {
    value = *(reinterpret_cast<T*>(row + rowColumn.offset()));
  }

  char* const destination = prefix + prefixSortLayout.prefixOffsets[index];
  prefixSortLayout.encoders[index].encode(value, destination);
}

/// Dispatches on 'typeKind' to encodeRowColumn<T> with the matching native
/// type, writing the encoded key 'index' of 'row' into 'prefix'.
///
/// Handles INTEGER, BIGINT, REAL, DOUBLE and TIMESTAMP. Any other type kind is
/// reported through VELOX_UNSUPPORTED.
FOLLY_ALWAYS_INLINE void extractRowColumnToPrefix(
    TypeKind typeKind,
    const PrefixSortLayout& prefixSortLayout,
    const uint32_t index,
    const RowColumn& rowColumn,
    char* const row,
    char* const prefix) {
  switch (typeKind) {
    case TypeKind::INTEGER:
      encodeRowColumn<int32_t>(prefixSortLayout, index, rowColumn, row, prefix);
      break;
    case TypeKind::BIGINT:
      encodeRowColumn<int64_t>(prefixSortLayout, index, rowColumn, row, prefix);
      break;
    case TypeKind::REAL:
      encodeRowColumn<float>(prefixSortLayout, index, rowColumn, row, prefix);
      break;
    case TypeKind::DOUBLE:
      encodeRowColumn<double>(prefixSortLayout, index, rowColumn, row, prefix);
      break;
    case TypeKind::TIMESTAMP:
      encodeRowColumn<Timestamp>(
          prefixSortLayout, index, rowColumn, row, prefix);
      break;
    default:
      VELOX_UNSUPPORTED(
          "prefix-sort does not support type kind: {}",
          mapTypeKindToName(typeKind));
  }
}

/// Returns the number of bytes that must be appended to 'size' to reach the
/// next multiple of 'alignment'; 0 when 'size' is already a multiple.
FOLLY_ALWAYS_INLINE int32_t alignmentPadding(int32_t size, int32_t alignment) {
  const auto remainder = size % alignment;
  if (remainder == 0) {
    return 0;
  }
  return alignment - remainder;
}

/// Reverses, in place, the byte order of every 64-bit word in the 'bytes'-long
/// buffer starting at 'address'.
///
/// The counter is decremented by kAlignment per word and the loop stops only
/// when it reaches exactly zero, so 'bytes' is expected to be a non-negative
/// multiple of kAlignment.
FOLLY_ALWAYS_INLINE void bitsSwapByWord(uint64_t* address, int32_t bytes) {
  for (; bytes != 0; bytes -= kAlignment, ++address) {
    *address = __builtin_bswap64(*address);
  }
}

/// Compares two buffers of 'bytes' bytes one 64-bit word at a time, as
/// unsigned integers.
///
/// Returns 1 if the first differing word of 'left' is greater than that of
/// 'right', -1 if it is smaller, and 0 if all words are equal. The counter is
/// decremented by kAlignment per word and equal input ends the loop only when
/// it reaches exactly zero, so 'bytes' is expected to be a non-negative
/// multiple of kAlignment.
FOLLY_ALWAYS_INLINE int
compareByWord(uint64_t* left, uint64_t* right, int32_t bytes) {
  for (; bytes != 0; bytes -= kAlignment, ++left, ++right) {
    if (*left != *right) {
      return *left > *right ? 1 : -1;
    }
  }
  return 0;
}

} // namespace

/// Builds the layout of a prefix-sort entry for the given sort keys.
///
/// Keys are normalized in order, starting from the first one. Normalization
/// stops at the first key that either has no fixed encoded size (as reported
/// by PrefixSortEncoder::encodedSize) or would make the normalized key larger
/// than 'maxNormalizedKeySize'. The remaining keys are left un-normalized.
///
/// @param types Types of the sort keys, in sort order.
/// @param compareFlags Compare flags of the sort keys; indexed in parallel
/// with 'types'.
/// @param maxNormalizedKeySize Upper bound, in bytes, for the encoded keys
/// (before alignment padding is added).
/// @return The layout describing offsets, encoders and sizes of an entry.
PrefixSortLayout PrefixSortLayout::makeSortLayout(
    const std::vector<TypePtr>& types,
    const std::vector<CompareFlags>& compareFlags,
    uint32_t maxNormalizedKeySize) {
  const uint32_t numKeys = types.size();
  std::vector<uint32_t> prefixOffsets;
  prefixOffsets.reserve(numKeys);
  std::vector<PrefixSortEncoder> encoders;
  encoders.reserve(numKeys);

  // Calculate encoders and prefix-offsets, and stop the loop if a key that
  // cannot be normalized, or does not fit in the size limit, is encountered.
  uint32_t normalizedKeySize = 0;
  uint32_t numNormalizedKeys = 0;
  for (uint32_t i = 0; i < numKeys; ++i) {
    const std::optional<uint32_t> encodedSize =
        PrefixSortEncoder::encodedSize(types[i]->kind());
    if (!encodedSize.has_value()) {
      break;
    }
    // Check the limit before accepting the key so that the encoded keys never
    // exceed 'maxNormalizedKeySize'. 'normalizedKeySize' never exceeds the
    // limit here, hence the subtraction cannot wrap around.
    if (encodedSize.value() > maxNormalizedKeySize - normalizedKeySize) {
      break;
    }
    prefixOffsets.push_back(normalizedKeySize);
    encoders.push_back(
        {compareFlags[i].ascending, compareFlags[i].nullsFirst});
    normalizedKeySize += encodedSize.value();
    ++numNormalizedKeys;
  }

  // Round the normalized key up to a multiple of kAlignment so that it can be
  // processed one word at a time.
  const auto padding = alignmentPadding(normalizedKeySize, kAlignment);
  normalizedKeySize += padding;
  return PrefixSortLayout{
      // Size of a whole entry: padded normalized key plus the row pointer.
      normalizedKeySize + sizeof(char*),
      // Size of the padded normalized key.
      normalizedKeySize,
      numNormalizedKeys,
      numKeys,
      compareFlags,
      // True if no key could be normalized.
      numNormalizedKeys == 0,
      // True if some keys are not part of the normalized key.
      numNormalizedKeys < numKeys,
      std::move(prefixOffsets),
      std::move(encoders),
      padding};
}

/// Compares the normalized-key portions of two prefix entries word by word.
/// Returns a negative value, zero or a positive value following the result of
/// compareByWord over 'sortLayout_.normalizedBufferSize' bytes.
FOLLY_ALWAYS_INLINE int PrefixSort::compareAllNormalizedKeys(
    char* left,
    char* right) {
  auto* const leftWords = reinterpret_cast<uint64_t*>(left);
  auto* const rightWords = reinterpret_cast<uint64_t*>(right);
  return compareByWord(
      leftWords, rightWords, sortLayout_.normalizedBufferSize);
}

/// Compares two prefix entries when only some of the sort keys are normalized.
///
/// The normalized keys are compared first. Only when they are equal are the
/// remaining keys (from 'numNormalizedKeys' up to 'numKeys') compared through
/// the row container, using the row addresses stored in the entries.
int PrefixSort::comparePartNormalizedKeys(char* left, char* right) {
  if (const int prefixResult = compareAllNormalizedKeys(left, right);
      prefixResult != 0) {
    return prefixResult;
  }

  // Prefixes tie: fall back to the rows themselves for the remaining keys.
  char* const leftRow = getAddressFromPrefix(left);
  char* const rightRow = getAddressFromPrefix(right);
  for (auto keyIndex = sortLayout_.numNormalizedKeys;
       keyIndex < sortLayout_.numKeys;
       ++keyIndex) {
    const int keyResult = rowContainer_->compare(
        leftRow, rightRow, keyIndex, sortLayout_.compareFlags[keyIndex]);
    if (keyResult != 0) {
      return keyResult;
    }
  }
  return 0;
}

// Initializes the sorter's 'pool_', 'sortLayout_' and 'rowContainer_' members
// from the corresponding arguments. 'keyCompareFlags' and 'config' are
// accepted but not read by this constructor.
PrefixSort::PrefixSort(
memory::MemoryPool* pool,
RowContainer* rowContainer,
const std::vector<CompareFlags>& keyCompareFlags,
const PrefixSortConfig& config,
const PrefixSortLayout& sortLayout)
: pool_(pool), sortLayout_(sortLayout), rowContainer_(rowContainer) {}

/// Fills the prefix entry at 'prefix' for 'row': the encoded normalized keys,
/// zeroed alignment padding, and finally the address of the row itself.
void PrefixSort::extractRowToPrefix(char* row, char* prefix) {
  // Encode every normalized sort key at its offset within the prefix.
  for (int32_t keyIndex = 0; keyIndex < sortLayout_.numNormalizedKeys;
       ++keyIndex) {
    const auto typeKind = rowContainer_->keyTypes()[keyIndex]->kind();
    extractRowColumnToPrefix(
        typeKind,
        sortLayout_,
        keyIndex,
        rowContainer_->columnAt(keyIndex),
        row,
        prefix);
  }

  // Clear the padding at the tail of the normalized key; those bytes take
  // part in the word-wise comparison.
  char* const paddingStart =
      prefix + sortLayout_.normalizedBufferSize - sortLayout_.padding;
  simd::memset(paddingStart, 0, sortLayout_.padding);

  // std::memcmp compares byte by byte. Comparing 8 bytes at a time as
  // uint64_t needs fewer comparisons and is faster. To keep the same ordering
  // with uint64_t comparison, the bytes of every 8-byte word are reversed
  // here; this assumes a little-endian system.
  bitsSwapByWord(
      reinterpret_cast<uint64_t*>(prefix), sortLayout_.normalizedBufferSize);

  // Store the row address in the entry.
  getAddressFromPrefix(prefix) = row;
}

/// Sorts 'rows' in place using prefix entries.
///
/// For every row an entry of 'sortLayout_.entrySize' bytes is built in a
/// contiguous buffer allocated from 'pool_'. The entries are sorted with
/// PrefixSortRunner, and the row addresses are then written back to 'rows' in
/// sorted order.
void PrefixSort::sortInternal(std::vector<char*>& rows) {
  const auto numRows = rows.size();
  // Nothing to sort; also avoids requesting a zero-page allocation.
  if (numRows == 0) {
    return;
  }
  const auto entrySize = sortLayout_.entrySize;
  memory::ContiguousAllocation prefixAllocation;
  // 1. Allocate prefixes data.
  {
    const auto numPages =
        memory::AllocationTraits::numPages(numRows * entrySize);
    pool_->allocateContiguous(numPages, prefixAllocation);
  }
  char* const prefixes = prefixAllocation.data<char>();

  // 2. Extract rows to prefixes with row address. The index is size_t to
  // match 'numRows' and to keep the offset arithmetic unsigned.
  for (size_t i = 0; i < numRows; ++i) {
    extractRowToPrefix(rows[i], prefixes + entrySize * i);
  }

  // 3. Sort prefixes with row address.
  {
    // Scratch space of one entry handed to the sort runner.
    const auto swapBuffer = AlignedBuffer::allocate<char>(entrySize, pool_);
    PrefixSortRunner sortRunner(entrySize, swapBuffer->asMutable<char>());
    const auto start = prefixes;
    const auto end = prefixes + numRows * entrySize;
    if (sortLayout_.hasNonNormalizedKey) {
      // Ties on the prefix must be resolved with the remaining keys.
      sortRunner.quickSort(start, end, [&](char* a, char* b) {
        return comparePartNormalizedKeys(a, b);
      });
    } else {
      sortRunner.quickSort(start, end, [&](char* a, char* b) {
        return compareAllNormalizedKeys(a, b);
      });
    }
  }

  // 4. Output sorted row addresses.
  for (size_t i = 0; i < numRows; ++i) {
    rows[i] = getAddressFromPrefix(prefixes + i * entrySize);
  }
}

} // namespace facebook::velox::exec
Loading

0 comments on commit 85f3973

Please sign in to comment.