Skip to content

Commit

Permalink
Add support to read plain encoded INT96 timestamp from Parquet file
Browse files Browse the repository at this point in the history
  • Loading branch information
mskapilks authored and rui-mo committed Sep 13, 2024
1 parent 98bbb73 commit 4a67523
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 2 deletions.
12 changes: 11 additions & 1 deletion velox/dwio/common/DirectDecoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,17 @@ class DirectDecoder : public IntDecoder<isSigned> {
} else if constexpr (std::is_same_v<
typename Visitor::DataType,
int128_t>) {
toSkip = visitor.process(super::template readInt<int128_t>(), atEnd);
if (super::numBytes != 12) {
toSkip = visitor.process(super::template readInt<int128_t>(), atEnd);
} else {
// Reads INT96 timestamp as int128_t type and extracts the days and
// nanos.
const int128_t encoded = super::template readInt<int128_t>();
const int32_t days = encoded & ((1ULL << 32) - 1);
const uint64_t nanos = static_cast<uint64_t>(encoded >> 32);
auto ts = Timestamp::fromDaysAndNanos(days, nanos);
toSkip = visitor.process(reinterpret_cast<int128_t&>(ts), atEnd);
}
} else {
toSkip = visitor.process(super::template readInt<int64_t>(), atEnd);
}
Expand Down
36 changes: 35 additions & 1 deletion velox/dwio/common/IntDecoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,9 @@ class IntDecoder {
template <typename T>
T readInt();

// Reads Int96 timestamp composed of days and nanos as int128_t.
// Consumes 12 bytes: the first 8 bytes (nanos) are assembled
// least-significant byte first and placed above bit 31 of the result; the
// following 4 bytes (days) are assembled the same way and occupy the low 32
// bits, i.e. the result is (nanos << 32) | days.
int128_t readInt96();

template <typename T>
T readVInt();

Expand Down Expand Up @@ -438,12 +441,43 @@ inline T IntDecoder<isSigned>::readInt() {
return readLittleEndianFromBigEndian<T>();
} else {
if constexpr (std::is_same_v<T, int128_t>) {
VELOX_NYI();
if (numBytes == 12) {
VELOX_DCHECK(!useVInts, "Int96 should not be VInt encoded.");
return readInt96();
} else {
VELOX_NYI();
}
}
return readLongLE();
}
}

// Reads a 12-byte INT96 value and packs it into an int128_t.
//
// The first 8 bytes are assembled least-significant byte first into a 64-bit
// value and the following 4 bytes into a 32-bit value. The result is
// (first8 << 32) | last4, so the low 32 bits carry the raw bits of the
// trailing 4 bytes and the bits above them carry the leading 8 bytes. The
// INT96 handling in DirectDecoder reads the low 32 bits as days and the
// remaining bits as nanos.
//
// Both parts are accumulated in unsigned integers. Holding the trailing 4
// bytes in a signed 32-bit integer would sign-extend a negative value when it
// is widened to int128_t and overwrite the upper 96 bits of the result.
template <bool isSigned>
inline int128_t IntDecoder<isSigned>::readInt96() {
  // Leading 8 bytes, least-significant byte first.
  uint64_t high = 0;
  for (uint32_t shift = 0; shift < 64; shift += 8) {
    const unsigned char byte = readByte();
    // Widen before shifting so the shift is always performed on an unsigned
    // 64-bit operand, whatever the type of BASE_256_MASK is.
    high |= static_cast<uint64_t>(byte & BASE_256_MASK) << shift;
  }

  // Trailing 4 bytes, least-significant byte first. Kept as raw bits; callers
  // that need a signed value reinterpret the low 32 bits of the result.
  uint32_t low = 0;
  for (uint32_t shift = 0; shift < 32; shift += 8) {
    const unsigned char byte = readByte();
    low |= static_cast<uint32_t>(byte & BASE_256_MASK) << shift;
  }

  // 'low' is unsigned, so it is zero-extended and cannot disturb 'high'.
  const int128_t result = high;
  return (result << 32) | low;
}

template <bool isSigned>
template <typename T>
inline T IntDecoder<isSigned>::readVInt() {
Expand Down
Binary file not shown.
Binary file not shown.
14 changes: 14 additions & 0 deletions velox/dwio/parquet/tests/reader/E2EFilterTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,20 @@ TEST_F(E2EFilterTest, integerDictionary) {
20);
}

// Runs the filter test harness over two timestamp columns with dictionary
// encoding turned off and INT96 timestamp writing turned on.
TEST_F(E2EFilterTest, timestampDirect) {
  // Writer settings; the three assignments are independent of each other.
  options_.writeInt96AsTimestamp = true;
  options_.dataPageSize = 4 * 1024;
  options_.enableDictionary = false;

  // No per-batch customization of the generated data is needed.
  const auto noCustomization = [&]() {};

  testWithTypes(
      "timestamp_val_0:timestamp,"
      "timestamp_val_1:timestamp",
      noCustomization,
      true,
      {"timestamp_val_0", "timestamp_val_1"},
      20);
}

TEST_F(E2EFilterTest, timestampDictionary) {
options_.dataPageSize = 4 * 1024;
options_.writeInt96AsTimestamp = true;
Expand Down
28 changes: 28 additions & 0 deletions velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -836,6 +836,34 @@ TEST_F(ParquetTableScanTest, timestampPrecisionMicrosecond) {
assertEqualResults({expected}, result.second);
}


// Loads the dictionary-encoded and then the plain-encoded INT96 example files
// and checks that each one yields the same two timestamps as the DuckDB
// "expected" table.
TEST_F(ParquetTableScanTest, timestampINT96) {
  // Reference rows registered with DuckDB.
  auto expectedTimes =
      makeFlatVector<Timestamp>({Timestamp(1, 0), Timestamp(2, 0)});
  auto expected = makeRowVector({"time"}, {expectedTimes});
  createDuckDbTable("expected", {expected});

  // Vector passed to loadData alongside each example file.
  auto vector = makeArrayVector<Timestamp>({{}});

  // Same load-and-verify sequence for both encodings, dictionary file first.
  for (const char* fileName :
       {"timestamp_dict_int96.parquet", "timestamp_plain_int96.parquet"}) {
    loadData(
        getExampleFilePath(fileName),
        ROW({"time"}, {TIMESTAMP()}),
        makeRowVector({"time"}, {vector}));
    assertSelect({"time"}, "SELECT time from expected");
  }
}

int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
folly::Init init{&argc, &argv, false};
Expand Down

0 comments on commit 4a67523

Please sign in to comment.