Skip to content

Commit

Permalink
Remove Parquet amalgamation (#6496)
Browse files Browse the repository at this point in the history
Summary:
Keep ParquetDecodeUtils and ByteBuffer needed by the BitPackDecoderBenchmark.cpp

Pull Request resolved: #6496

Reviewed By: Yuhta

Differential Revision: D49905219

Pulled By: mbasmanova

fbshipit-source-id: 67d9a8df637fb9678a7f5f16df4dbb705a9a4779
  • Loading branch information
majetideepak authored and facebook-github-bot committed Oct 4, 2023
1 parent 83d62a1 commit 04da9ea
Show file tree
Hide file tree
Showing 6 changed files with 183 additions and 51,625 deletions.
2 changes: 0 additions & 2 deletions velox/duckdb/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,11 @@ a new feature, first clone DuckDB git repository:
Then generate the amalgamated .cpp and .hpp files:

python3 scripts/amalgamation.py --extended --splits=8
python3 scripts/parquet_amalgamation.py

Then copy the generated files to velox/external/duckdb:

export VELOX_PATH="<path/to/velox>"
rsync -vrh src/amalgamation/duckdb* ${VELOX_PATH}/velox/external/duckdb/
rsync -vrh src/amalgamation/parquet* ${VELOX_PATH}/velox/external/duckdb/

We also maintain a copy of TPC-H dataset generators that need to be updated:

Expand Down
184 changes: 182 additions & 2 deletions velox/dwio/common/tests/BitPackDecoderBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,12 @@
#endif

#include "velox/external/duckdb/duckdb-fastpforlib.hpp"
#include "velox/external/duckdb/parquet-amalgamation.hpp"
#include "velox/vector/TypeAliases.h"

#include <arrow/util/rle_encoding.h>
#include <folly/Benchmark.h>
#include <folly/Random.h>
#include <folly/init/Init.h>
#include "velox/external/duckdb/duckdb.hpp"

using namespace folly;
using namespace facebook::velox;
Expand All @@ -38,6 +37,187 @@ using RowSet = folly::Range<const facebook::velox::vector_size_t*>;

// Compile-time constant equal to 1024768 * 8 = 8,198,144.
// NOTE(review): presumably the number of values the benchmarks generate and
// decode; the usages are outside this view - confirm. Also confirm that
// 1024768 (rather than 1024 * 1024 = 1048576) is intentional.
static const uint64_t kNumValues = 1024768 * 8;

namespace duckdb {

/// Non-owning read cursor over a contiguous range of bytes.
///
/// `ptr` points at the next unread byte and `len` is the number of bytes that
/// remain readable from `ptr`. The class never allocates or frees memory; the
/// storage belongs to whoever constructed the buffer. Every accessor checks the
/// requested size against `len` and throws std::runtime_error("Out of buffer")
/// when the request does not fit.
class ByteBuffer {
 public:
  /// Creates an empty buffer (`ptr == nullptr`, `len == 0`).
  ByteBuffer() = default;

  /// Wraps `len` bytes starting at `ptr`. The memory is not copied.
  ByteBuffer(char* ptr, uint64_t len) : ptr(ptr), len(len) {}

  char* ptr = nullptr;
  uint64_t len = 0;

 public:
  /// Advances the cursor by `increment` bytes.
  /// Throws std::runtime_error if fewer than `increment` bytes remain.
  void inc(uint64_t increment) {
    available(increment);
    len -= increment;
    ptr += increment;
  }

  /// Returns the next `sizeof(T)` bytes as a T and advances past them.
  /// Throws std::runtime_error if fewer than `sizeof(T)` bytes remain.
  template <class T>
  T read() {
    T val = get<T>();
    inc(sizeof(T));
    return val;
  }

  /// Returns the next `sizeof(T)` bytes as a T without advancing the cursor.
  /// Throws std::runtime_error if fewer than `sizeof(T)` bytes remain.
  template <class T>
  T get() const {
    available(sizeof(T));
    // memcpy keeps the load well-defined for unaligned addresses and avoids
    // casting the char pointer to an unrelated pointer type.
    T val;
    std::memcpy(&val, ptr, sizeof(T));
    return val;
  }

  /// Copies the next `len` bytes into `dest` without advancing the cursor.
  /// Note that the parameter `len` shadows the member of the same name; the
  /// bounds check compares the requested length against the remaining bytes.
  /// Throws std::runtime_error if fewer than `len` bytes remain.
  void copy_to(char* dest, uint64_t len) const {
    available(len);
    // Passing a null pointer to memcpy is undefined even for a zero length,
    // which is exactly the state of an empty buffer.
    if (len != 0) {
      std::memcpy(dest, ptr, len);
    }
  }

  /// Overwrites all remaining bytes with zero. No-op on an empty buffer.
  void zero() {
    // Guard for the same reason as in copy_to(): a default-constructed buffer
    // has ptr == nullptr and memset(nullptr, 0, 0) is undefined behaviour.
    if (len != 0) {
      std::memset(ptr, 0, len);
    }
  }

  /// Throws std::runtime_error("Out of buffer") unless at least `req_len`
  /// bytes remain.
  void available(uint64_t req_len) const {
    if (req_len > len) {
      throw std::runtime_error("Out of buffer");
    }
  }
};

/// Static helpers for decoding zigzag, bit-packed and varint encoded integers
/// from a ByteBuffer.
class ParquetDecodeUtils {
 public:
  /// BITPACK_MASKS[w] selects the w low bits of a value. Defined out of line.
  static const uint64_t BITPACK_MASKS[];
  /// Number of entries in BITPACK_MASKS.
  static const uint64_t BITPACK_MASKS_SIZE;
  /// Number of bits held by one packed unit (one byte of the input).
  static const uint8_t BITPACK_DLEN;

  /// Reverses zigzag encoding: the lowest bit of `n` carries the sign and the
  /// remaining bits carry the magnitude.
  template <class T>
  static T ZigzagToInt(const T& n) {
    const auto magnitude = n >> 1;
    const auto signBits = -(n & 1);
    return magnitude ^ signBits;
  }

  /// Unpacks `count` values of `width` bits each from `buffer` into `dest`.
  ///
  /// Bits are consumed from the least significant end of each byte.
  /// `bitpack_pos` is the bit offset inside the current byte at which the next
  /// value starts; it is updated in place so a later call can continue where
  /// this one stopped. The buffer only moves to the next byte once a value
  /// needs bits from it, so an offset equal to BITPACK_DLEN can be left behind.
  ///
  /// Returns `count`. Throws InvalidInputException when `width` is not a valid
  /// index into BITPACK_MASKS, and whatever ByteBuffer throws when the input
  /// runs out.
  template <typename T>
  static uint32_t BitUnpack(
      ByteBuffer& buffer,
      uint8_t& bitpack_pos,
      T* dest,
      uint32_t count,
      uint8_t width) {
    if (!(width < ParquetDecodeUtils::BITPACK_MASKS_SIZE)) {
      throw InvalidInputException(
          "The width (%d) of the bitpacked data exceeds the supported max width (%d), "
          "the file might be corrupted.",
          width,
          ParquetDecodeUtils::BITPACK_MASKS_SIZE);
    }
    const auto lowBits = BITPACK_MASKS[width];

    uint32_t produced = 0;
    while (produced < count) {
      // Start with whatever is left of the current byte.
      T value = (buffer.get<uint8_t>() >> bitpack_pos) & lowBits;
      bitpack_pos += width;
      // Pull in following bytes until all `width` bits have been gathered.
      while (bitpack_pos > BITPACK_DLEN) {
        buffer.inc(1);
        // Number of bits of this value that were already collected.
        const auto gathered = BITPACK_DLEN - (bitpack_pos - width);
        value |= (T(buffer.get<uint8_t>()) << T(gathered)) & lowBits;
        bitpack_pos -= BITPACK_DLEN;
      }
      dest[produced] = value;
      ++produced;
    }
    return count;
  }

  /// Decodes one variable-length integer: seven payload bits per byte, least
  /// significant group first, with the top bit of each byte set while more
  /// bytes follow.
  ///
  /// Throws std::runtime_error when the encoding runs past the bit width of T,
  /// and whatever ByteBuffer throws when the input runs out.
  template <class T>
  static T VarintDecode(ByteBuffer& buf) {
    T decoded = 0;
    for (uint8_t shift = 0;;) {
      const auto chunk = buf.read<uint8_t>();
      decoded |= T(chunk & 127) << shift;
      const bool hasMore = (chunk & 128) != 0;
      if (!hasMore) {
        return decoded;
      }
      shift += 7;
      if (shift > sizeof(T) * 8) {
        throw std::runtime_error("Varint-decoding found too large number");
      }
    }
  }
};
} // namespace duckdb

// BITPACK_MASKS[w] has exactly the w least significant bits set (2^w - 1), for
// every bit width w from 0 through 64. Written in hex so each entry visibly
// adds one bit to the previous one.
const uint64_t duckdb::ParquetDecodeUtils::BITPACK_MASKS[] = {
    // widths 0-7
    0x0,
    0x1,
    0x3,
    0x7,
    0xF,
    0x1F,
    0x3F,
    0x7F,
    // widths 8-15
    0xFF,
    0x1FF,
    0x3FF,
    0x7FF,
    0xFFF,
    0x1FFF,
    0x3FFF,
    0x7FFF,
    // widths 16-23
    0xFFFF,
    0x1FFFF,
    0x3FFFF,
    0x7FFFF,
    0xFFFFF,
    0x1FFFFF,
    0x3FFFFF,
    0x7FFFFF,
    // widths 24-31
    0xFFFFFF,
    0x1FFFFFF,
    0x3FFFFFF,
    0x7FFFFFF,
    0xFFFFFFF,
    0x1FFFFFFF,
    0x3FFFFFFF,
    0x7FFFFFFF,
    // widths 32-39
    0xFFFFFFFF,
    0x1FFFFFFFF,
    0x3FFFFFFFF,
    0x7FFFFFFFF,
    0xFFFFFFFFF,
    0x1FFFFFFFFF,
    0x3FFFFFFFFF,
    0x7FFFFFFFFF,
    // widths 40-47
    0xFFFFFFFFFF,
    0x1FFFFFFFFFF,
    0x3FFFFFFFFFF,
    0x7FFFFFFFFFF,
    0xFFFFFFFFFFF,
    0x1FFFFFFFFFFF,
    0x3FFFFFFFFFFF,
    0x7FFFFFFFFFFF,
    // widths 48-55
    0xFFFFFFFFFFFF,
    0x1FFFFFFFFFFFF,
    0x3FFFFFFFFFFFF,
    0x7FFFFFFFFFFFF,
    0xFFFFFFFFFFFFF,
    0x1FFFFFFFFFFFFF,
    0x3FFFFFFFFFFFFF,
    0x7FFFFFFFFFFFFF,
    // widths 56-63
    0xFFFFFFFFFFFFFF,
    0x1FFFFFFFFFFFFFF,
    0x3FFFFFFFFFFFFFF,
    0x7FFFFFFFFFFFFFF,
    0xFFFFFFFFFFFFFFF,
    0x1FFFFFFFFFFFFFFF,
    0x3FFFFFFFFFFFFFFF,
    0x7FFFFFFFFFFFFFFF,
    // width 64
    0xFFFFFFFFFFFFFFFF};

const uint64_t duckdb::ParquetDecodeUtils::BITPACK_MASKS_SIZE =
sizeof(ParquetDecodeUtils::BITPACK_MASKS) / sizeof(uint64_t);

const uint8_t duckdb::ParquetDecodeUtils::BITPACK_DLEN = 8;

// Array of bit packed representations of randomInts_u32. The array at index i
// is packed i bits wide and the values come from the low bits of
// randomInts_u32.
// NOTE(review): the original sentence was cut off after "low bits of"; the
// completion above is inferred from the first sentence - confirm.
std::vector<std::vector<uint64_t>> bitPackedData;
Expand Down
4 changes: 1 addition & 3 deletions velox/external/duckdb/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
add_subdirectory(tpch)

add_compile_definitions(DISABLE_DUCKDB_REMOTE_INSTALL)
add_compile_definitions(BUILD_PARQUET_EXTENSION)

add_library(
duckdb
Expand All @@ -30,8 +29,7 @@ add_library(
duckdb-libpg_query.cpp
duckdb-miniz.cpp
duckdb-re2.cpp
duckdb-utf8proc.cpp
parquet-amalgamation.cpp)
duckdb-utf8proc.cpp)

set_property(TARGET duckdb PROPERTY JOB_POOL_COMPILE high_memory_pool)

Expand Down
Loading

0 comments on commit 04da9ea

Please sign in to comment.