Skip to content

Commit

Permalink
Add SIMD linear search for StringView (#6970)
Browse files Browse the repository at this point in the history
Summary:
Add SIMD linear search for StringView

Add a linear search method for StringView. Useful for linear search
for string map keys.  This is ~2x faster than the corresponding scalar
search.

Pull Request resolved: #6970

Reviewed By: Yuhta

Differential Revision: D50104889

Pulled By: oerling

fbshipit-source-id: 1f49c19b1e1caf8d8081e67091dd2e2161f74924
  • Loading branch information
Orri Erling authored and facebook-github-bot committed Oct 11, 2023
1 parent 40224fe commit 3aa3e94
Show file tree
Hide file tree
Showing 4 changed files with 280 additions and 0 deletions.
1 change: 1 addition & 0 deletions velox/type/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ add_library(
DoubleUtil.cpp
Filter.cpp
HugeInt.cpp
StringView.cpp
StringView.h
Subfield.cpp
Timestamp.cpp
Expand Down
137 changes: 137 additions & 0 deletions velox/type/StringView.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "velox/type/StringView.h"
#include "velox/common/base/SimdUtil.h"

namespace facebook::velox {

namespace {
// Scalar search. Returns the first position i in [0, numStrings) for which
// the candidate string equals 'key', or -1 if there is none. The candidate
// at position i is strings[indices[i]] when 'indices' is non-null and
// strings[i] otherwise.
int32_t linearSearchSimple(
    StringView key,
    const StringView* strings,
    const int32_t* indices,
    int32_t numStrings) {
  if (indices == nullptr) {
    // Direct scan over 'strings'.
    int position = 0;
    while (position < numStrings) {
      if (strings[position] == key) {
        return position;
      }
      ++position;
    }
    return -1;
  }

  // Indirect scan: 'indices' selects which string each position refers to.
  int position = 0;
  while (position < numStrings) {
    if (strings[indices[position]] == key) {
      return position;
    }
    ++position;
  }
  return -1;
}

} // namespace

// static
/// Returns the first position i in [0, numStrings) whose candidate string
/// equals 'key', or -1 if there is none. The candidate at position i is
/// strings[indices[i]] when 'indices' is non-null and strings[i] otherwise.
///
/// With AVX2, full batches of 4 candidates are screened by comparing 64-bit
/// words of the StringViews; the remaining 'numStrings % 4' candidates are
/// handled by the scalar linearSearchSimple(). Without AVX2 the whole search
/// is scalar. Long strings are compared with simd::memEqualUnsafe().
int32_t StringView::linearSearch(
    StringView key,
    const StringView* strings,
    const int32_t* indices,
    int32_t numStrings) {
#if XSIMD_WITH_AVX2
  constexpr int64_t kBatch = xsimd::batch<uint64_t>::size;
  const bool isInline = key.isInline();
  // For keys of at most 4 bytes only the first 64-bit word of the StringView
  // is compared.
  const bool headOnly = key.size() <= 4;
  // Part of the key that follows the first 4 bytes. Only used when
  // '!headOnly', so a negative 'bodySize' is never passed on.
  const char* body = key.data() + 4;
  const int32_t bodySize = key.size() - 4;
  const int32_t limit =
      numStrings & ~(kBatch - 1); // round down to full batches.

  // Scalar search over the positions in [limit, numStrings).
  // linearSearchSimple() reports positions relative to the start of the range
  // it is given, so a hit must be shifted by 'limit' to become a position in
  // the caller's range. -1 (no match) is passed through unchanged.
  auto searchTail = [&](const StringView* tailStrings,
                        const int32_t* tailIndices) -> int32_t {
    const int32_t found =
        linearSearchSimple(key, tailStrings, tailIndices, numStrings - limit);
    return found < 0 ? found : found + limit;
  };

  if (indices) {
    // The two 64-bit words of the key.
    const uint64_t head = *reinterpret_cast<const uint64_t*>(&key);
    const uint64_t inlined = reinterpret_cast<const uint64_t*>(&key)[1];
    xsimd::batch<int32_t, xsimd::sse2> indexVector;

    for (auto i = 0; i < limit; i += kBatch) {
      // A StringView spans two uint64_t words, hence the index is doubled to
      // address the first word of each selected StringView.
      indexVector = simd::loadGatherIndices<uint64_t, int32_t>(indices + i)
          << 1;
      auto heads =
          simd::gather(reinterpret_cast<const uint64_t*>(strings), indexVector);
      uint16_t hits = simd::toBitMask(heads == head);
      if (LIKELY(!hits)) {
        continue;
      }
      if (headOnly) {
        return i + __builtin_ctz(hits);
      }
      // Check the candidates whose first word matched, lowest position first.
      while (hits) {
        const auto lane = bits::getAndClearLastSetBit(hits);
        const StringView& candidate = strings[indices[i + lane]];
        const bool equal = isInline
            ? inlined == reinterpret_cast<const uint64_t*>(&candidate)[1]
            : simd::memEqualUnsafe(body, candidate.data() + 4, bodySize);
        if (equal) {
          return i + lane;
        }
      }
    }
    return searchTail(strings, indices + limit);
  }

  // No indices: load the StringViews directly, two per 256-bit vector, and
  // compare them against two copies of the key.
  StringView key2[2];
  memcpy(&key2[0], &key, sizeof(key));
  memcpy(&key2[1], &key, sizeof(key));
  auto keyVector = xsimd::load_unaligned(reinterpret_cast<uint64_t*>(&key2));
  for (auto i = 0; i < limit; i += kBatch) {
    // 'strings' itself is never advanced; 'batch' is the start of the current
    // group so that batch[lane] and the returned position i + lane agree.
    const StringView* batch = strings + i;
    // Compare 4 StringViews in 2 loads of 2 each. Bit 2 * lane is set when
    // the first word of batch[lane] matches, bit 2 * lane + 1 when the second
    // word matches.
    int32_t bits =
        simd::toBitMask(
            xsimd::load_unaligned(reinterpret_cast<const uint64_t*>(batch)) ==
            keyVector) |
        (simd::toBitMask(
             xsimd::load_unaligned(
                 reinterpret_cast<const uint64_t*>(batch + 2)) == keyVector)
         << 4);

    if ((bits & (1 + 4 + 16 + 64)) == 0) {
      // Neither lane 0 or 2 or 4 or 6 hits
      continue;
    }
    for (int32_t lane = 0; bits; ++lane, bits = bits >> 2) {
      const auto low = bits & 3;
      // At least first lane must match.
      if (!(low & 1)) {
        continue;
      }
      // Both first and second lane match or only first word counts.
      if (low == 3 || headOnly) {
        return i + lane;
      }
      if (!isInline &&
          simd::memEqualUnsafe(body, batch[lane].data() + 4, bodySize)) {
        return i + lane;
      }
    }
  }
  return searchTail(strings + limit, nullptr);
#else
  return linearSearchSimple(key, strings, indices, numStrings);
#endif
}
} // namespace facebook::velox
13 changes: 13 additions & 0 deletions velox/type/StringView.h
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,19 @@ struct StringView {
return size() == 0;
}

/// Searches for 'key == strings[i]' for 0 <= i < numStrings. If
/// 'indices' is given, searches for 'key == strings[indices[i]]'
/// instead. Returns the first i for which the strings match or -1
/// if no match is found. Uses SIMD to accelerate the search.
/// Accesses StringView bodies in 32 byte vectors, thus expects up
/// to 31 bytes of addressable padding after out of line strings.
/// This is the case for velox Buffers.
static int32_t linearSearch(
StringView key,
const StringView* strings,
const int32_t* indices,
int32_t numStrings);

private:
inline int64_t sizeAndPrefixAsInt64() const {
return reinterpret_cast<const int64_t*>(this)[0];
Expand Down
129 changes: 129 additions & 0 deletions velox/type/tests/StringViewTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include <sstream>
#include "velox/common/base/RawVector.h"
#include "velox/common/base/SimdUtil.h"
#include "velox/common/time/Timer.h"
#include "velox/type/Type.h"

using namespace facebook::velox;
Expand Down Expand Up @@ -156,3 +159,129 @@ TEST(StringView, negativeSizes) {
EXPECT_THROW(StringView("abc", -10), VeloxException);
EXPECT_NO_THROW(StringView(nullptr, 0));
}

// Scalar reference implementation used to time and check
// StringView::linearSearch(). Returns the first position i in
// [0, numStrings) whose candidate equals 'key', or -1 if no candidate matches.
// The candidate at position i is strings[indices[i]] if 'indices' is non-null
// and strings[i] otherwise.
int32_t linearSearchSimple(
    StringView key,
    const StringView* strings,
    const int32_t* indices,
    int32_t numStrings) {
  for (int position = 0; position < numStrings; ++position) {
    const StringView& candidate =
        indices ? strings[indices[position]] : strings[position];
    if (candidate == key) {
      return position;
    }
  }
  return -1;
}

// Checks StringView::linearSearch() with and without indices against the
// scalar linearSearchSimple() and logs the accumulated time of each variant.
TEST(StringView, linearSearch) {
  // Number of strings with generated content.
  constexpr int32_t kNumGenerated = 1000;
  // The last 3 StringViews stay default constructed so that the total is not
  // a multiple of 4.
  constexpr int32_t kSize = kNumGenerated + 3;
  std::vector<raw_vector<char>> data(kSize);
  std::vector<StringView> stringViews(kSize);
  // Values with sizes from 0 to 49. Values are not all distinct, e.g. every
  // i with i % 50 == 0 produces an empty string.
  for (auto i = 0; i < kNumGenerated; ++i) {
    std::string string = fmt::format("{}-", i);
    int32_t numRepeats = 1 + i % 10;
    auto item = string;
    for (auto repeat = 0; repeat < numRepeats; ++repeat) {
      string += item;
    }
    string.resize(std::min<int32_t>((i >= 50 ? 2 : 0) + string.size(), i % 50));
    data[i].resize(string.size());
    if (!string.empty()) {
      memcpy(data[i].data(), string.data(), string.size());
    }
    stringViews[i] = StringView(data[i].data(), data[i].size());
  }
  // Visit the generated strings in reverse order. The remaining positions map
  // to themselves so that every index stays inside 'stringViews'.
  raw_vector<int32_t> indices(kSize);
  for (auto i = 0; i < kSize; ++i) {
    indices[i] = i < kNumGenerated ? kNumGenerated - 1 - i : i;
  }

  // Untimed cross-check: the SIMD search must report the same position as the
  // scalar reference for every key, with and without indices.
  for (auto i = 0; i < kSize; ++i) {
    EXPECT_EQ(
        linearSearchSimple(stringViews[i], stringViews.data(), nullptr, kSize),
        StringView::linearSearch(
            stringViews[i], stringViews.data(), nullptr, kSize))
        << "key " << i;
    EXPECT_EQ(
        linearSearchSimple(
            stringViews[i], stringViews.data(), indices.data(), kSize),
        StringView::linearSearch(
            stringViews[i], stringViews.data(), indices.data(), kSize))
        << "key " << i;
  }

  uint64_t simdUsec = 0;
  uint64_t loopUsec = 0;
  uint64_t simdIndicesUsec = 0;
  uint64_t loopIndicesUsec = 0;
  for (auto counter = 0; counter < 10; ++counter) {
    {
      MicrosecondTimer t(&simdUsec);
      EXPECT_EQ(
          -1,
          StringView::linearSearch(
              stringViews[11], stringViews.data(), nullptr, 10));
      for (auto testIndex = 0; testIndex < kSize; ++testIndex) {
        auto index = StringView::linearSearch(
            stringViews[testIndex], stringViews.data(), nullptr, kSize);
        // Every key is an element of the array, so it must be found. Also
        // guards the subscript below against -1.
        ASSERT_GE(index, 0);
        EXPECT_TRUE(stringViews[testIndex] == stringViews[index]);
      }
    }
    {
      MicrosecondTimer t(&loopUsec);
      EXPECT_EQ(
          -1,
          linearSearchSimple(stringViews[11], stringViews.data(), nullptr, 10));
      for (auto testIndex = 0; testIndex < kSize; ++testIndex) {
        auto index = linearSearchSimple(
            stringViews[testIndex], stringViews.data(), nullptr, kSize);
        ASSERT_GE(index, 0);
        EXPECT_TRUE(stringViews[testIndex] == stringViews[index]);
      }
    }

    {
      MicrosecondTimer t(&simdIndicesUsec);
      EXPECT_EQ(
          -1,
          StringView::linearSearch(
              stringViews[indices[11]],
              stringViews.data(),
              indices.data(),
              10));
      for (auto testIndex = 0; testIndex < kSize; ++testIndex) {
        auto index = StringView::linearSearch(
            stringViews[testIndex], stringViews.data(), indices.data(), kSize);
        ASSERT_GE(index, 0);
        EXPECT_TRUE(stringViews[testIndex] == stringViews[indices[index]]);
      }
    }
    {
      MicrosecondTimer t(&loopIndicesUsec);
      EXPECT_EQ(
          -1,
          linearSearchSimple(
              stringViews[indices[11]],
              stringViews.data(),
              indices.data(),
              10));
      for (auto testIndex = 0; testIndex < kSize; ++testIndex) {
        auto index = linearSearchSimple(
            stringViews[testIndex], stringViews.data(), indices.data(), kSize);
        ASSERT_GE(index, 0);
        EXPECT_TRUE(stringViews[testIndex] == stringViews[indices[index]]);
      }
    }
  }
  LOG(INFO) << "StringView search: SIMD: " << simdUsec << " / "
            << simdIndicesUsec << " scalar: " << loopUsec << " / "
            << loopIndicesUsec;
}

0 comments on commit 3aa3e94

Please sign in to comment.