Skip to content

Commit

Permalink
snapshots seg compressor: pattern extractor and patricia tree (#1783)
Browse files Browse the repository at this point in the history
  • Loading branch information
battlmonstr authored Feb 1, 2024
1 parent 669f014 commit 2b8c773
Show file tree
Hide file tree
Showing 25 changed files with 4,018 additions and 128 deletions.
1 change: 1 addition & 0 deletions silkworm/node/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ set(SILKWORM_NODE_PRIVATE_LIBS
cborcpp
evmone
magic_enum::magic_enum
sais_lite
silkworm_interfaces
)
# cmake-format: on
Expand Down
105 changes: 105 additions & 0 deletions silkworm/node/common/bit_count.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
Copyright 2024 The Silkworm Authors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#pragma once

#if (defined(_MSC_VER) && _MSC_VER >= 1928)
#include <intrin.h>

// MSVC replacements for the GCC/Clang bit-counting builtins, implemented on top
// of the <intrin.h> bit-scan and population-count intrinsics.
// A zero argument yields the operand width (32 or 64) from the clz/ctz shims
// and 0 from the ffs shims.

// The *l shims below simply forward to the 32-bit implementations.
static_assert(sizeof(unsigned long) == 4, "the *l shims assume a 32-bit long");

// Count of leading zero bits in a 32-bit value; 32 when x is zero.
inline int __builtin_clz(unsigned int x) {
    unsigned long index;
    return static_cast<int>(_BitScanReverse(&index, static_cast<unsigned long>(x)) ? 31 - index : 32);
}

// Count of leading zero bits in an unsigned long (handled as a 32-bit value).
inline int __builtin_clzl(unsigned long x) {
    return __builtin_clz(static_cast<unsigned int>(x));
}

#if defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64)
// Count of leading zero bits in a 64-bit value, composed from two 32-bit scans;
// 64 when x is zero.
inline int __builtin_clzll(unsigned long long x) {
    if (x == 0) {
        return 64;
    }
    const auto msb = static_cast<unsigned int>(x >> 32);
    const auto lsb = static_cast<unsigned int>(x);
    return (msb != 0) ? __builtin_clz(msb) : 32 + __builtin_clz(lsb);
}
#else
// Count of leading zero bits in a 64-bit value; 64 when x is zero.
inline int __builtin_clzll(unsigned long long x) {
    unsigned long index;
    return static_cast<int>(_BitScanReverse64(&index, x) ? 63 - index : 64);
}
#endif

// Count of trailing zero bits in a 32-bit value; 32 when x is zero.
inline int __builtin_ctz(unsigned int x) {
    unsigned long index;
    return static_cast<int>(_BitScanForward(&index, static_cast<unsigned long>(x)) ? index : 32);
}

// Count of trailing zero bits in an unsigned long (handled as a 32-bit value).
inline int __builtin_ctzl(unsigned long x) {
    return __builtin_ctz(static_cast<unsigned int>(x));
}

#if defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64)
// Count of trailing zero bits in a 64-bit value, composed from two 32-bit scans;
// 64 when x is zero.
inline int __builtin_ctzll(unsigned long long x) {
    unsigned long index;
    const auto msb = static_cast<unsigned int>(x >> 32);
    const auto lsb = static_cast<unsigned int>(x);
    if (lsb != 0) {
        return static_cast<int>(_BitScanForward(&index, lsb) ? index : 64);
    }
    return static_cast<int>(_BitScanForward(&index, msb) ? index + 32 : 64);
}
#else
// Count of trailing zero bits in a 64-bit value; 64 when x is zero.
inline int __builtin_ctzll(unsigned long long x) {
    unsigned long index;
    return static_cast<int>(_BitScanForward64(&index, x) ? index : 64);
}
#endif

// One-based position of the least significant set bit; 0 when x is zero.
inline int __builtin_ffs(int x) {
    unsigned long index;
    return static_cast<int>(_BitScanForward(&index, static_cast<unsigned long>(x)) ? index + 1 : 0);
}

// One-based position of the least significant set bit of a long (handled as 32-bit).
inline int __builtin_ffsl(long x) {
    return __builtin_ffs(static_cast<int>(x));
}

#if defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64)
// One-based position of the least significant set bit of a 64-bit value,
// derived from the split ctzll shim; 0 when x is zero.
inline int __builtin_ffsll(long long x) {
    const int ctzll = __builtin_ctzll(static_cast<unsigned long long>(x));
    return ctzll != 64 ? ctzll + 1 : 0;
}
#else
// One-based position of the least significant set bit of a 64-bit value; 0 when x is zero.
inline int __builtin_ffsll(long long x) {
    unsigned long index;
    return static_cast<int>(_BitScanForward64(&index, static_cast<unsigned long long>(x)) ? index + 1 : 0);
}
#endif

// The 32-bit population-count shims rely on __popcnt, which MSVC documents for
// x86/x64 targets, so they are provided everywhere except ARM/ARM64.
// They previously sat inside the #else arm of the ffsll conditional above and
// were therefore missing on 32-bit x86, where __builtin_popcountll is defined.
#if !defined(_M_ARM) && !defined(_M_ARM64)
// Number of set bits in a 32-bit value.
inline int __builtin_popcount(unsigned int x) {
    return static_cast<int>(__popcnt(x));
}

// Number of set bits in an unsigned long (handled as a 32-bit value).
inline int __builtin_popcountl(unsigned long x) {
    return static_cast<int>(__popcnt(x));
}
#endif

#if defined(_M_IX86)
// Number of set bits in a 64-bit value, summed over the two 32-bit halves.
inline int __builtin_popcountll(unsigned long long x) {
    return static_cast<int>(__popcnt(static_cast<unsigned int>(x >> 32))) +
           static_cast<int>(__popcnt(static_cast<unsigned int>(x)));
}
#elif defined(_M_X64)
// Number of set bits in a 64-bit value.
inline int __builtin_popcountll(unsigned long long x) {
    return static_cast<int>(__popcnt64(x));
}
#endif
#endif
10 changes: 5 additions & 5 deletions silkworm/node/snapshots/index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ using db::etl::kOptimalBufferSize;
void Index::build() {
SILK_TRACE << "Index::build path: " << segment_path_.path().string() << " start";

huffman::Decompressor decoder{segment_path_.path(), segment_region_};
seg::Decompressor decoder{segment_path_.path(), segment_region_};
decoder.open();

const SnapshotPath index_file = segment_path_.index_file();
Expand All @@ -57,7 +57,7 @@ void Index::build() {
do {
iterations++;
SILK_TRACE << "Process snapshot items to prepare index build for: " << segment_path_.path().string();
const bool read_ok = decoder.read_ahead([&](huffman::Decompressor::Iterator it) {
const bool read_ok = decoder.read_ahead([&](seg::Decompressor::Iterator it) {
Bytes word{};
word.reserve(kPageSize);
uint64_t i{0}, offset{0};
Expand Down Expand Up @@ -117,7 +117,7 @@ void TransactionIndex::build() {
const auto [first_tx_id, expected_tx_count] = bodies_snapshot.compute_txs_amount();
SILK_TRACE << "TransactionIndex::build first_tx_id: " << first_tx_id << " expected_tx_count: " << expected_tx_count;

huffman::Decompressor txs_decoder{segment_path_.path(), segment_region_};
seg::Decompressor txs_decoder{segment_path_.path(), segment_region_};
txs_decoder.open();

const auto tx_count = txs_decoder.words_count();
Expand Down Expand Up @@ -148,10 +148,10 @@ void TransactionIndex::build() {
.double_enum_index = false};
RecSplit8 tx_hash_to_block_rs{tx_hash_to_block_rs_settings, rec_split::seq_build_strategy(kOptimalBufferSize / 2)};

huffman::Decompressor bodies_decoder{bodies_segment_path.path()};
seg::Decompressor bodies_decoder{bodies_segment_path.path()};
bodies_decoder.open();

using DoubleReadAheadFunc = std::function<bool(huffman::Decompressor::Iterator, huffman::Decompressor::Iterator)>;
using DoubleReadAheadFunc = std::function<bool(seg::Decompressor::Iterator, seg::Decompressor::Iterator)>;
auto double_read_ahead = [&txs_decoder, &bodies_decoder](const DoubleReadAheadFunc& fn) -> bool {
return txs_decoder.read_ahead([fn, &bodies_decoder](auto tx_it) -> bool {
return bodies_decoder.read_ahead([fn, &tx_it](auto body_it) {
Expand Down
2 changes: 1 addition & 1 deletion silkworm/node/snapshots/index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
#include <memory>
#include <utility>

#include <silkworm/node/snapshots/huffman/decompressor.hpp>
#include <silkworm/node/snapshots/path.hpp>
#include <silkworm/node/snapshots/rec_split/rec_split.hpp>
#include <silkworm/node/snapshots/seg/decompressor.hpp>

namespace silkworm::snapshots {

Expand Down
89 changes: 1 addition & 88 deletions silkworm/node/snapshots/rec_split/common/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,94 +48,6 @@
#include <x86intrin.h>
#endif

#if (defined(_MSC_VER) && _MSC_VER >= 1928)
#include <intrin.h>

// MSVC replacements for the GCC/Clang bit-counting builtins, implemented on top
// of the <intrin.h> bit-scan and population-count intrinsics.
// A zero argument yields the operand width (32 or 64) from the clz/ctz shims
// and 0 from the ffs shims.

// The *l shims below simply forward to the 32-bit implementations.
static_assert(sizeof(unsigned long) == 4, "the *l shims assume a 32-bit long");

// Count of leading zero bits in a 32-bit value; 32 when x is zero.
inline int __builtin_clz(unsigned int x) {
    unsigned long index;
    return static_cast<int>(_BitScanReverse(&index, static_cast<unsigned long>(x)) ? 31 - index : 32);
}

// Count of leading zero bits in an unsigned long (handled as a 32-bit value).
inline int __builtin_clzl(unsigned long x) {
    return __builtin_clz(static_cast<unsigned int>(x));
}

#if defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64)
// Count of leading zero bits in a 64-bit value, composed from two 32-bit scans;
// 64 when x is zero.
inline int __builtin_clzll(unsigned long long x) {
    if (x == 0) {
        return 64;
    }
    const auto msb = static_cast<unsigned int>(x >> 32);
    const auto lsb = static_cast<unsigned int>(x);
    return (msb != 0) ? __builtin_clz(msb) : 32 + __builtin_clz(lsb);
}
#else
// Count of leading zero bits in a 64-bit value; 64 when x is zero.
inline int __builtin_clzll(unsigned long long x) {
    unsigned long index;
    return static_cast<int>(_BitScanReverse64(&index, x) ? 63 - index : 64);
}
#endif

// Count of trailing zero bits in a 32-bit value; 32 when x is zero.
inline int __builtin_ctz(unsigned int x) {
    unsigned long index;
    return static_cast<int>(_BitScanForward(&index, static_cast<unsigned long>(x)) ? index : 32);
}

// Count of trailing zero bits in an unsigned long (handled as a 32-bit value).
inline int __builtin_ctzl(unsigned long x) {
    return __builtin_ctz(static_cast<unsigned int>(x));
}

#if defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64)
// Count of trailing zero bits in a 64-bit value, composed from two 32-bit scans;
// 64 when x is zero.
inline int __builtin_ctzll(unsigned long long x) {
    unsigned long index;
    const auto msb = static_cast<unsigned int>(x >> 32);
    const auto lsb = static_cast<unsigned int>(x);
    if (lsb != 0) {
        return static_cast<int>(_BitScanForward(&index, lsb) ? index : 64);
    }
    return static_cast<int>(_BitScanForward(&index, msb) ? index + 32 : 64);
}
#else
// Count of trailing zero bits in a 64-bit value; 64 when x is zero.
inline int __builtin_ctzll(unsigned long long x) {
    unsigned long index;
    return static_cast<int>(_BitScanForward64(&index, x) ? index : 64);
}
#endif

// One-based position of the least significant set bit; 0 when x is zero.
inline int __builtin_ffs(int x) {
    unsigned long index;
    return static_cast<int>(_BitScanForward(&index, static_cast<unsigned long>(x)) ? index + 1 : 0);
}

// One-based position of the least significant set bit of a long (handled as 32-bit).
inline int __builtin_ffsl(long x) {
    return __builtin_ffs(static_cast<int>(x));
}

#if defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64)
// One-based position of the least significant set bit of a 64-bit value,
// derived from the split ctzll shim; 0 when x is zero.
inline int __builtin_ffsll(long long x) {
    const int ctzll = __builtin_ctzll(static_cast<unsigned long long>(x));
    return ctzll != 64 ? ctzll + 1 : 0;
}
#else
// One-based position of the least significant set bit of a 64-bit value; 0 when x is zero.
inline int __builtin_ffsll(long long x) {
    unsigned long index;
    return static_cast<int>(_BitScanForward64(&index, static_cast<unsigned long long>(x)) ? index + 1 : 0);
}
#endif

// The 32-bit population-count shims rely on __popcnt, which MSVC documents for
// x86/x64 targets, so they are provided everywhere except ARM/ARM64.
// They previously sat inside the #else arm of the ffsll conditional above and
// were therefore missing on 32-bit x86, where __builtin_popcountll is defined.
#if !defined(_M_ARM) && !defined(_M_ARM64)
// Number of set bits in a 32-bit value.
inline int __builtin_popcount(unsigned int x) {
    return static_cast<int>(__popcnt(x));
}

// Number of set bits in an unsigned long (handled as a 32-bit value).
inline int __builtin_popcountl(unsigned long x) {
    return static_cast<int>(__popcnt(x));
}
#endif

#if defined(_M_IX86)
// Number of set bits in a 64-bit value, summed over the two 32-bit halves.
inline int __builtin_popcountll(unsigned long long x) {
    return static_cast<int>(__popcnt(static_cast<unsigned int>(x >> 32))) +
           static_cast<int>(__popcnt(static_cast<unsigned int>(x)));
}
#elif defined(_M_X64)
// Number of set bits in a 64-bit value.
inline int __builtin_popcountll(unsigned long long x) {
    return static_cast<int>(__popcnt64(x));
}
#endif
#endif

#include <algorithm>
#include <cassert>
#include <cinttypes>
Expand All @@ -144,6 +56,7 @@ inline int __builtin_popcountll(unsigned long long x) {
#include <memory>

#include <silkworm/core/common/assert.hpp>
#include <silkworm/node/common/bit_count.hpp>

// Explicit branch predictions
#define likely(x) __builtin_expect(!!(x), 1)
Expand Down
32 changes: 32 additions & 0 deletions silkworm/node/snapshots/seg/compressor/lcp_kasai.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
Copyright 2024 The Silkworm Authors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#include "lcp_kasai.hpp"

namespace silkworm::snapshots::seg {

// Byte-buffer overload: forwards to the generic lcp_kasai template using a
// comparator that reports whether the bytes at two positions of `data` are equal.
void lcp_kasai(const uint8_t* data, const int* sa, const int* inv, int* lcp, int n) {
    // Adapter giving the template position-wise equality over the raw bytes.
    struct ByteEquality {
        const uint8_t* bytes;
        bool has_same_chars(int lhs, int rhs) const {
            return bytes[lhs] == bytes[rhs];
        }
    };

    const ByteEquality byte_equality{data};
    lcp_kasai<ByteEquality>(byte_equality, sa, inv, lcp, n);
}

} // namespace silkworm::snapshots::seg
65 changes: 65 additions & 0 deletions silkworm/node/snapshots/seg/compressor/lcp_kasai.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
Copyright 2024 The Silkworm Authors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#pragma once

#include <concepts>
#include <cstdint>

namespace silkworm::snapshots::seg {

//! Fills the LCP (longest common prefix) array following Kasai's algorithm.
//! \param data comparator answering whether the characters at two text positions are equal
//! \param sa suffix array: sa[r] is read as the start position of the suffix with rank r
//! \param inv rank of each suffix: inv[i] is read as the rank of the suffix starting at i
//!        (presumably the inverse permutation of sa - verify against the caller)
//! \param lcp output of n entries: lcp[r] receives the length of the common prefix of the
//!        suffixes ranked r and r + 1; the entry of the last-ranked suffix is set to zero
//! \param n length of the text, and of the sa, inv and lcp arrays
template <class TDataPosComparator>
    requires requires(const TDataPosComparator& data, int i, int j) {
        { data.has_same_chars(i, j) } -> std::same_as<bool>;
    }
void lcp_kasai(const TDataPosComparator& data, const int* sa, const int* inv, int* lcp, int n) {
    // Length of the match carried over from the previous text position.
    int k = 0;

    // Visit the suffixes in text order, starting from the first one.
    for (int i = 0; i < n; i++) {
        const int rank = inv[i];

        // The last-ranked suffix has no successor in the suffix array, so its
        // LCP is not defined: store zero explicitly rather than leaving whatever
        // the caller's buffer happened to contain.
        if (rank == n - 1) {
            lcp[rank] = 0;
            k = 0;
            continue;
        }

        // Start of the suffix that follows the current one in the suffix array.
        const int j = sa[rank + 1];

        // Resume matching at offset k: the first k characters are already known to match.
        while ((i + k < n) && (j + k < n) && data.has_same_chars(i + k, j + k)) {
            k++;
        }

        lcp[rank] = k;

        // Moving to position i + 1 drops the leading character of the match.
        if (k > 0) {
            k--;
        }
    }
}

void lcp_kasai(const uint8_t* data, const int* sa, const int* inv, int* lcp, int n);

} // namespace silkworm::snapshots::seg
Loading

0 comments on commit 2b8c773

Please sign in to comment.