Skip to content

Commit

Permalink
fix: Optimize json_parse
Browse files Browse the repository at this point in the history
Differential Revision: D67538322
  • Loading branch information
Yuhta authored and facebook-github-bot committed Dec 21, 2024
1 parent 9265fbf commit 9d6f6ea
Show file tree
Hide file tree
Showing 9 changed files with 575 additions and 294 deletions.
161 changes: 161 additions & 0 deletions velox/common/base/SortingNetwork.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include "velox/common/base/Exceptions.h"

namespace facebook::velox {

/// Largest `size` for which sortingNetwork() has a dedicated case in its
/// dispatch switch.
constexpr int kSortingNetworkMaxSize = 32;

/// Sorts the `size` elements starting at `data` in place with a fixed
/// compare-exchange network selected by `size`.
///
/// @param data Pointer to the first element; elements are read and overwritten
/// by copy.
/// @param size Number of elements.  Sizes 0 and 1 return immediately; sizes 2
/// through 32 are dispatched to the matching network; any other value reaches
/// VELOX_UNREACHABLE().
/// @param lt Callable invoked as `lt(a, b)` on two elements; its result is
/// used as a condition to decide which of the two is stored first.
template <typename T, typename LessThan>
void sortingNetwork(T* data, int size, LessThan&& lt);

namespace detail {

// Bose-Nelson sorting network for exactly `kSize` elements, with the whole
// compare-exchange sequence unrolled at compile time.
//
// https://bertdobbelaere.github.io/sorting_networks.html
// https://github.com/Vectorized/Static-Sort/blob/master/include/static_sort.h
template <int kSize>
class SortingNetworkImpl {
 public:
  // Sorts data[0, kSize) in place.  `lt(a, b)` is evaluated once per
  // compare-exchange and its result is used as a condition.  Nothing is done
  // when kSize <= 1.
  template <typename T, typename LessThan>
  static void apply(T* data, LessThan&& lt) {
    sortRange<T, LessThan, 1, kSize>(data, lt);
  }

 private:
  // Orders the pair (data[kFirst], data[kSecond]) so that the element for
  // which `lt` holds ends up at kFirst.  Indices here are 0-based.
  template <int kFirst, int kSecond, typename T, typename LessThan>
  static void compareExchange(T* data, LessThan lt) {
    // This is branchless if `lt' is branchless.
    auto inOrder = lt(data[kFirst], data[kSecond]);
    auto smaller = inOrder ? data[kFirst] : data[kSecond];
    data[kSecond] = inOrder ? data[kSecond] : data[kFirst];
    data[kFirst] = smaller;
  }

  // Merge step of the network: combines a run of X elements starting at
  // 1-based position I with a run of Y elements starting at 1-based position
  // J.  The three smallest shapes are spelled out as explicit
  // compare-exchanges (note the `- 1` converting to 0-based indices); every
  // other shape is split into three smaller merges.
  template <typename T, typename LessThan, int I, int J, int X, int Y>
  static void mergeRuns(T* data, LessThan lt) {
    if constexpr (X == 1 && Y == 1) {
      compareExchange<I - 1, J - 1>(data, lt);
    } else if constexpr (X == 1 && Y == 2) {
      compareExchange<I - 1, J>(data, lt);
      compareExchange<I - 1, J - 1>(data, lt);
    } else if constexpr (X == 2 && Y == 1) {
      compareExchange<I - 1, J - 1>(data, lt);
      compareExchange<I, J - 1>(data, lt);
    } else {
      constexpr int kHalfX = X >> 1;
      constexpr int kHalfY = (X & 1 ? Y : Y + 1) >> 1;
      constexpr int kRestX = X - kHalfX;
      constexpr int kUpperI = I + kHalfX;
      mergeRuns<T, LessThan, I, J, kHalfX, kHalfY>(data, lt);
      mergeRuns<T, LessThan, kUpperI, J + kHalfY, kRestX, Y - kHalfY>(
          data, lt);
      mergeRuns<T, LessThan, kUpperI, J, kRestX, kHalfY>(data, lt);
    }
  }

  // Sort step of the network for the M elements starting at 1-based position
  // I: sort each half, then merge the halves.  Ranges of at most one element
  // need no comparators.
  template <typename T, typename LessThan, int I, int M>
  static void sortRange(T* data, LessThan lt) {
    if constexpr (M > 1) {
      constexpr int kLeft = M >> 1;
      constexpr int kRight = M - kLeft;
      constexpr int kRightStart = I + kLeft;
      sortRange<T, LessThan, I, kLeft>(data, lt);
      sortRange<T, LessThan, kRightStart, kRight>(data, lt);
      mergeRuns<T, LessThan, I, kRightStart, kLeft, kRight>(data, lt);
    }
  }
};

} // namespace detail

/// Sorts `data[0, size)` in place by dispatching to the compile-time network
/// built for exactly `size` elements.  Sizes 0 and 1 need no work; sizes 2
/// through 32 each have their own case; anything else hits
/// VELOX_UNREACHABLE().
template <typename T, typename LessThan>
void sortingNetwork(T* data, int size, LessThan&& lt) {
#ifdef VELOX_SORTING_NETWORK_IMPL_APPLY_CASE
#error "Macro name clash: VELOX_SORTING_NETWORK_IMPL_APPLY_CASE"
#endif
// Expands to a single `case` that runs the network of size `_n` and returns.
#define VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(_n) \
  case _n:                                        \
    return detail::SortingNetworkImpl<_n>::apply( \
        data, std::forward<LessThan>(lt));

  // Nothing to order with fewer than two elements.
  if (size == 0 || size == 1) {
    return;
  }

  switch (size) {
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(2)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(3)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(4)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(5)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(6)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(7)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(8)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(9)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(10)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(11)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(12)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(13)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(14)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(15)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(16)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(17)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(18)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(19)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(20)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(21)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(22)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(23)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(24)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(25)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(26)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(27)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(28)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(29)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(30)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(31)
    VELOX_SORTING_NETWORK_IMPL_APPLY_CASE(32)

    default:
      VELOX_UNREACHABLE();
  }

#undef VELOX_SORTING_NETWORK_IMPL_APPLY_CASE
}

} // namespace facebook::velox
27 changes: 2 additions & 25 deletions velox/functions/lib/Utf8Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,26 +173,6 @@ tryGetUtf8CharLength(const char* input, int64_t size, int32_t& codePoint) {
return -1;
}

/// Returns true only if invalid UTF-8 is present in the first `len` bytes of
/// `input`.
///
/// @param input Pointer to the bytes to scan.
/// @param len Number of bytes to scan.  A value <= 0 scans nothing and yields
/// false.
/// @return true as soon as tryGetUtf8CharLength() reports a negative length
/// for a non-ASCII position; false if the scan reaches `len` without that
/// happening.
bool hasInvalidUTF8(const char* input, int32_t len) {
  // The cursor is signed like `len`.  With an unsigned cursor the comparison
  // `inputIndex < len` converts a negative `len` to a huge unsigned value and
  // the loop would read far past the buffer.
  for (int32_t inputIndex = 0; inputIndex < len;) {
    if (IS_ASCII(input[inputIndex])) {
      // Ascii
      ++inputIndex;
      continue;
    }

    // Unicode
    int32_t codePoint;
    const auto charLength =
        tryGetUtf8CharLength(input + inputIndex, len - inputIndex, codePoint);
    if (charLength < 0) {
      return true;
    }
    inputIndex += charLength;
  }

  return false;
}

size_t replaceInvalidUTF8Characters(
char* outputBuffer,
const char* input,
Expand All @@ -213,12 +193,9 @@ size_t replaceInvalidUTF8Characters(
outputIndex += charLength;
inputIndex += charLength;
} else {
size_t replaceCharactersToWriteOut = inputIndex < len - 1 &&
isMultipleInvalidSequences(input, inputIndex)
? -charLength
: 1;
const auto& replacementCharacterString =
kReplacementCharacterStrings[replaceCharactersToWriteOut - 1];
getInvalidUTF8ReplacementString(
input + inputIndex, len - inputIndex, -charLength);
std::memcpy(
outputBuffer + outputIndex,
replacementCharacterString.data(),
Expand Down
8 changes: 6 additions & 2 deletions velox/functions/lib/Utf8Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,12 @@ FOLLY_ALWAYS_INLINE bool isMultipleInvalidSequences(
inputBuffer[inputIndex] == '\xc0' || inputBuffer[inputIndex] == '\xc1';
}

/// Returns true only if invalid UTF-8 is present in the input string.
bool hasInvalidUTF8(const char* input, int32_t len);
/// Selects the replacement string for an invalid sequence starting at `input`.
///
/// When at least two bytes are available (`len >= 2`) and
/// isMultipleInvalidSequences(input, 0) holds, the entry at index
/// `codePointSize - 1` of kReplacementCharacterStrings is returned; in every
/// other case the first entry is returned.
inline const std::string_view&
getInvalidUTF8ReplacementString(const char* input, int len, int codePointSize) {
  if (len >= 2 && isMultipleInvalidSequences(input, 0)) {
    return kReplacementCharacterStrings[codePointSize - 1];
  }
  return kReplacementCharacterStrings[0];
}

/// Replaces invalid UTF-8 characters with replacement characters similar to
/// that produced by Presto java. The function requires that output have
Expand Down
11 changes: 0 additions & 11 deletions velox/functions/lib/tests/Utf8Test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,17 +104,6 @@ TEST(Utf8Test, tryCharLength) {
ASSERT_EQ(-1, tryCharLength({0xBF}));
}

// Exercises hasInvalidUTF8() on raw byte sequences.
TEST(UTF8Test, validUtf8) {
  // Adapts a byte vector to the (const char*, length) interface under test.
  const auto containsInvalid = [](const std::vector<unsigned char>& raw) {
    const auto* chars = reinterpret_cast<const char*>(raw.data());
    return hasInvalidUTF8(chars, raw.size());
  };

  // Only bytes below 0x80.
  ASSERT_FALSE(containsInvalid({0x5c, 0x19, 0x7A}));
  // Same bytes followed by a lone 0xBF.
  ASSERT_TRUE(containsInvalid({0x5c, 0x19, 0x7A, 0xBF}));
  // Three-byte lead 0xE0 followed by 0x81 0xBF.
  ASSERT_TRUE(containsInvalid({0x64, 0x65, 0x1A, 0b11100000, 0x81, 0xBF}));
}

TEST(UTF8Test, replaceInvalidUTF8Characters) {
auto testReplaceInvalidUTF8Chars = [](const std::string& input,
const std::string& expected) {
Expand Down
Loading

0 comments on commit 9d6f6ea

Please sign in to comment.