-
Notifications
You must be signed in to change notification settings - Fork 65
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
snapshots seg compressor: pattern extractor and patricia tree (#1783)
- Loading branch information
1 parent
669f014
commit 2b8c773
Showing
25 changed files
with
4,018 additions
and
128 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
/* | ||
Copyright 2024 The Silkworm Authors | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#if (defined(_MSC_VER) && _MSC_VER >= 1928)
#include <intrin.h>

// Emulation of the GCC/Clang bit-manipulation builtins on top of MSVC intrinsics.
// The 32-bit helpers rely on `unsigned long` being as wide as `unsigned int`,
// which is what the casts between the two types below assume.
// Unlike the GCC builtins, whose result for a zero argument is undefined,
// the count-leading/trailing-zeros helpers here return the full bit width for zero.

// Number of leading zero bits in a 32-bit value; 32 when x == 0.
inline int __builtin_clz(unsigned int x) {
    unsigned long index;
    // _BitScanReverse reports the position of the highest set bit, or fails for zero.
    if (!_BitScanReverse(&index, static_cast<unsigned long>(x))) {
        return 32;
    }
    return 31 - static_cast<int>(index);
}

// `unsigned long` flavour: forwards to the 32-bit implementation.
inline int __builtin_clzl(unsigned long x) {
    return __builtin_clz(static_cast<unsigned int>(x));
}

#if defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64)
// Number of leading zero bits in a 64-bit value; 64 when x == 0.
// On these targets the value is examined as two 32-bit halves.
inline int __builtin_clzll(unsigned long long x) {
    if (x == 0) {
        return 64;
    }
    const unsigned int high = static_cast<unsigned int>(x >> 32);
    if (high != 0) {
        return __builtin_clz(high);
    }
    const unsigned int low = static_cast<unsigned int>(x);
    return 32 + __builtin_clz(low);
}
#else
// Number of leading zero bits in a 64-bit value; 64 when x == 0.
inline int __builtin_clzll(unsigned long long x) {
    unsigned long index;
    if (!_BitScanReverse64(&index, x)) {
        return 64;
    }
    return 63 - static_cast<int>(index);
}
#endif

// Number of trailing zero bits in a 32-bit value; 32 when x == 0.
inline int __builtin_ctz(unsigned int x) {
    unsigned long index;
    // _BitScanForward reports the position of the lowest set bit, or fails for zero.
    if (!_BitScanForward(&index, static_cast<unsigned long>(x))) {
        return 32;
    }
    return static_cast<int>(index);
}

// `unsigned long` flavour: forwards to the 32-bit implementation.
inline int __builtin_ctzl(unsigned long x) {
    return __builtin_ctz(static_cast<unsigned int>(x));
}

#if defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64)
// Number of trailing zero bits in a 64-bit value; 64 when x == 0.
// On these targets the low half is scanned first, then the high half.
inline int __builtin_ctzll(unsigned long long x) {
    unsigned long index;
    const unsigned int low = static_cast<unsigned int>(x);
    if (low != 0) {
        return static_cast<int>(_BitScanForward(&index, low) ? index : 64);
    }
    const unsigned int high = static_cast<unsigned int>(x >> 32);
    return static_cast<int>(_BitScanForward(&index, high) ? index + 32 : 64);
}
#else
// Number of trailing zero bits in a 64-bit value; 64 when x == 0.
inline int __builtin_ctzll(unsigned long long x) {
    unsigned long index;
    if (!_BitScanForward64(&index, x)) {
        return 64;
    }
    return static_cast<int>(index);
}
#endif

// One-based position of the lowest set bit; 0 when x == 0.
inline int __builtin_ffs(int x) {
    unsigned long index;
    if (!_BitScanForward(&index, static_cast<unsigned long>(x))) {
        return 0;
    }
    return static_cast<int>(index) + 1;
}

// `long` flavour: forwards to the `int` implementation.
inline int __builtin_ffsl(long x) {
    return __builtin_ffs(static_cast<int>(x));
}

#if defined(_M_IX86) || defined(_M_ARM) || defined(_M_ARM64)
// One-based position of the lowest set bit of a 64-bit value; 0 when x == 0.
// Derived from __builtin_ctzll, which reports 64 for a zero argument.
inline int __builtin_ffsll(long long x) {
    const int trailing_zeros = __builtin_ctzll(static_cast<unsigned long long>(x));
    return (trailing_zeros != 64) ? trailing_zeros + 1 : 0;
}
#else
// One-based position of the lowest set bit of a 64-bit value; 0 when x == 0.
inline int __builtin_ffsll(long long x) {
    unsigned long index;
    if (!_BitScanForward64(&index, static_cast<unsigned long long>(x))) {
        return 0;
    }
    return static_cast<int>(index) + 1;
}
#endif

// The 32-bit population count helpers are built on __popcnt, so they are
// provided on every non-ARM target. Keeping them in their own guard (rather
// than inside the 64-bit branch of the ffsll selection) makes them available
// on 32-bit x86 as well, where __builtin_popcountll is also defined.
#if !defined(_M_ARM) && !defined(_M_ARM64)
// Number of set bits in a 32-bit value.
inline int __builtin_popcount(unsigned int x) {
    return static_cast<int>(__popcnt(x));
}

// Number of set bits in an `unsigned long`, which must be 32 bits wide here.
inline int __builtin_popcountl(unsigned long x) {
    static_assert(sizeof(x) == 4, "");
    return static_cast<int>(__popcnt(x));
}
#endif

#if defined(_M_IX86)
// Number of set bits in a 64-bit value, summed over its two 32-bit halves.
inline int __builtin_popcountll(unsigned long long x) {
    const unsigned int high = static_cast<unsigned int>(x >> 32);
    const unsigned int low = static_cast<unsigned int>(x);
    return static_cast<int>(__popcnt(high)) + static_cast<int>(__popcnt(low));
}
#elif defined(_M_X64)
// Number of set bits in a 64-bit value.
inline int __builtin_popcountll(unsigned long long x) {
    return static_cast<int>(__popcnt64(x));
}
#endif
#endif
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
/* | ||
Copyright 2024 The Silkworm Authors | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
#include "lcp_kasai.hpp" | ||
|
||
namespace silkworm::snapshots::seg { | ||
|
||
void lcp_kasai(const uint8_t* data, const int* sa, const int* inv, int* lcp, int n) { | ||
struct DataPosComparator { | ||
const uint8_t* data; | ||
inline bool has_same_chars(int i, int j) const { | ||
return data[i] == data[j]; | ||
} | ||
} comparator{data}; | ||
|
||
lcp_kasai<DataPosComparator>(comparator, sa, inv, lcp, n); | ||
} | ||
|
||
} // namespace silkworm::snapshots::seg |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
/* | ||
Copyright 2024 The Silkworm Authors | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <concepts> | ||
#include <cstdint> | ||
|
||
namespace silkworm::snapshots::seg { | ||
|
||
template <class TDataPosComparator> | ||
requires requires(const TDataPosComparator& data, int i, int j) { | ||
{ data.has_same_chars(i, j) } -> std::same_as<bool>; | ||
} | ||
void lcp_kasai(const TDataPosComparator& data, const int* sa, const int* inv, int* lcp, int n) { | ||
int k = 0; | ||
|
||
// Process all suffixes one by one starting from | ||
// first suffix in txt[] | ||
for (int i = 0; i < n; i++) { | ||
// If the current suffix is at n-1, then we don’t | ||
// have next substring to consider. So lcp is not | ||
// defined for this substring, we put zero. | ||
if (inv[i] == n - 1) { | ||
k = 0; | ||
continue; | ||
} | ||
|
||
// j contains index of the next substring to | ||
// be considered to compare with the present | ||
// substring, i.e. next string in suffix array. | ||
int j = sa[inv[i] + 1]; | ||
|
||
// Directly start matching from k-th index as | ||
// at-least k-1 characters will match. | ||
while ((i + k < n) && (j + k < n) && data.has_same_chars(i + k, j + k)) { | ||
k++; | ||
} | ||
|
||
// lcp for the present suffix. | ||
lcp[inv[i]] = k; | ||
|
||
// Deleting the starting character from the string. | ||
if (k > 0) { | ||
k--; | ||
} | ||
} | ||
} | ||
|
||
void lcp_kasai(const uint8_t* data, const int* sa, const int* inv, int* lcp, int n); | ||
|
||
} // namespace silkworm::snapshots::seg |
Oops, something went wrong.