Skip to content

Commit

Permalink
refactor(parquet): Move arrow levelComparison to common (#11711)
Browse files Browse the repository at this point in the history
Summary:
Both Parquet reader and writer depend on levelComparison.
It currently resides in parquet/writer/arrow, which forces the reader to depend on the writer.
Move it to parquet/common and rename its variables to follow the Velox coding convention.

Fixes: #11678

Pull Request resolved: #11711

Reviewed By: mbasmanova

Differential Revision: D66792888

Pulled By: bikramSingh91

fbshipit-source-id: 88b2cde1eb652762dcf5abc38508fa202aa143b1
  • Loading branch information
majetideepak authored and facebook-github-bot committed Dec 6, 2024
1 parent 3ead2f4 commit e983aac
Show file tree
Hide file tree
Showing 34 changed files with 656 additions and 854 deletions.
2 changes: 1 addition & 1 deletion velox/dwio/parquet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
# limitations under the License.

if(VELOX_ENABLE_PARQUET)
add_subdirectory(thrift)
add_subdirectory(common)
add_subdirectory(reader)
add_subdirectory(thrift)
add_subdirectory(writer)

if(${VELOX_BUILD_TESTING})
Expand Down
1 change: 1 addition & 0 deletions velox/dwio/parquet/common/BloomFilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include "velox/dwio/parquet/common/BloomFilter.h"
#include "velox/dwio/parquet/common/XxHasher.h"
#include "velox/dwio/parquet/thrift/ParquetThriftTypes.h"
#include "velox/dwio/parquet/thrift/ThriftTransport.h"

#include <thrift/protocol/TCompactProtocol.h>
Expand Down
1 change: 0 additions & 1 deletion velox/dwio/parquet/common/BloomFilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
#include "velox/dwio/common/BufferedInput.h"
#include "velox/dwio/common/OutputStream.h"
#include "velox/dwio/parquet/common/Hasher.h"
#include "velox/dwio/parquet/thrift/ParquetThriftTypes.h"

#include <cmath>
#include <cstdint>
Expand Down
14 changes: 10 additions & 4 deletions velox/dwio/parquet/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.

add_library(velox_dwio_native_parquet_common BloomFilter.cpp XxHasher.cpp)
velox_add_library(
velox_dwio_parquet_common
BloomFilter.cpp
XxHasher.cpp
LevelComparison.cpp
LevelConversion.cpp)

target_link_libraries(
velox_dwio_native_parquet_common
velox_link_libraries(
velox_dwio_parquet_common
velox_dwio_parquet_thrift
velox_type
velox_dwio_common
velox_dwio_common_compression
fmt::fmt
arrow
fmt::fmt
Folly::folly
Snappy::snappy
thrift
zstd::zstd)
56 changes: 56 additions & 0 deletions velox/dwio/parquet/common/LevelComparison.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// Adapted from Apache Arrow.

#include "velox/dwio/parquet/common/LevelComparison.h"

#include <limits>

#include "folly/lang/Bits.h"

namespace facebook::velox::parquet {
namespace {
/// Packs the outcome of 'predicate' over 'levels' into a 64-bit word: bit i is
/// set when predicate(levels[i]) is true, for i in [0, numLevels). The word is
/// passed through folly::Endian::little before being returned.
///
/// Each result is shifted by its index into a uint64_t, so callers must keep
/// numLevels at or below 64.
template <typename Predicate>
inline uint64_t
LevelsToBitmap(const int16_t* levels, int64_t numLevels, Predicate predicate) {
  // Kept as a plain counted loop: both clang and GCC can vectorize this
  // automatically with SSE4/AVX2.
  uint64_t bits = 0;
  for (int i = 0; i < numLevels; ++i) {
    const uint64_t bit = predicate(levels[i]) ? 1 : 0;
    bits |= bit << i;
  }
  return folly::Endian::little(bits);
}

} // namespace

/// Builds a bitmap in which bit i is set when levels[i] is strictly greater
/// than 'rhs', for the first 'numLevels' entries of 'levels'.
uint64_t
GreaterThanBitmap(const int16_t* levels, int64_t numLevels, int16_t rhs) {
  const auto exceedsRhs = [rhs](int16_t level) { return level > rhs; };
  return LevelsToBitmap(levels, numLevels, exceedsRhs);
}

/// Scans the first 'numLevels' entries of 'levels' and returns the smallest
/// and largest value found.
///
/// When numLevels is zero or negative nothing is read and the seed values are
/// returned unchanged, i.e. min == INT16_MAX and max == INT16_MIN (min > max).
MinMax FindMinMax(const int16_t* levels, int64_t numLevels) {
  MinMax out{
      std::numeric_limits<int16_t>::max(), std::numeric_limits<int16_t>::min()};
  // The index has the same width as numLevels so that a count above INT_MAX
  // cannot overflow the loop counter.
  for (int64_t x = 0; x < numLevels; ++x) {
    out.min = std::min(levels[x], out.min);
    out.max = std::max(levels[x], out.max);
  }
  return out;
}

} // namespace facebook::velox::parquet
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,20 @@

#pragma once

#include <algorithm>
#include <cstdint>

#include "velox/dwio/parquet/writer/arrow/Platform.h"

namespace facebook::velox::parquet::arrow::internal {
namespace facebook::velox::parquet {

/// Builds a bitmap where each set bit indicates the corresponding level is
/// greater than rhs.
uint64_t PARQUET_EXPORT
GreaterThanBitmap(const int16_t* levels, int64_t num_levels, int16_t rhs);
uint64_t
GreaterThanBitmap(const int16_t* levels, int64_t numLevels, int16_t rhs);

/// Smallest and largest level found by FindMinMax over a run of levels.
struct MinMax {
// Smallest level observed.
int16_t min;
// Largest level observed.
int16_t max;
};

MinMax FindMinMax(const int16_t* levels, int64_t num_levels);
MinMax FindMinMax(const int16_t* levels, int64_t numLevels);

} // namespace facebook::velox::parquet::arrow::internal
} // namespace facebook::velox::parquet
187 changes: 187 additions & 0 deletions velox/dwio/parquet/common/LevelConversion.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// Adapted from Apache Arrow.

#include "velox/dwio/parquet/common/LevelConversion.h"

#include <cassert> // Required for bitmap_writer.h below.
#include <limits>
#include <optional>

#include "arrow/util/bitmap_writer.h"

#include "velox/common/base/Exceptions.h"
#include "velox/dwio/parquet/common/LevelConversionUtil.h"

namespace facebook::velox::parquet {
namespace {

/// Walks the definition and repetition levels in lockstep for one list level
/// and produces cumulative list offsets and/or a validity bitmap.
///
/// @tparam OffsetType Integer type of the offsets buffer (instantiated in this
/// file with int32_t and int64_t).
/// @param defLevels Definition levels; 'numDefLevels' entries are read.
/// @param repLevels Repetition levels, indexed in parallel with 'defLevels'.
/// @param numDefLevels Number of level entries to process.
/// @param levelInfo Thresholds (repLevel, defLevel, repeatedAncestorDefLevel,
/// nullSlotUsage) describing the list being processed.
/// @param output Reads validBits, validBitsOffset and valuesReadUpperBound.
/// Increments nullCount for every null entry written. Sets valuesRead to the
/// number of offsets produced when 'offsets' is non-null, otherwise to the
/// bitmap writer position when validBits is set; leaves it untouched when both
/// are absent.
/// @param offsets Cumulative offsets buffer, or nullptr when offsets are not
/// needed. The first slot is read but never initialized here, so the caller
/// must have stored the starting offset in it.
///
/// Fails via VELOX_FAIL when an offset would overflow OffsetType, when more
/// entries than valuesReadUpperBound would be produced, or when nulls were
/// seen while levelInfo.nullSlotUsage > 1.
template <typename OffsetType>
void DefRepLevelsToListInfo(
    const int16_t* defLevels,
    const int16_t* repLevels,
    int64_t numDefLevels,
    LevelInfo levelInfo,
    ValidityBitmapInputOutput* output,
    OffsetType* offsets) {
  OffsetType* origPos = offsets;
  std::optional<::arrow::internal::FirstTimeBitmapWriter> validBitsWriter;
  if (output->validBits) {
    validBitsWriter.emplace(
        output->validBits,
        output->validBitsOffset,
        output->valuesReadUpperBound);
  }
  // The index has the same width as numDefLevels so that a count above INT_MAX
  // cannot overflow the loop counter.
  for (int64_t x = 0; x < numDefLevels; x++) {
    // Skip items that belong to empty or null ancestor lists and further nested
    // lists.
    if (defLevels[x] < levelInfo.repeatedAncestorDefLevel ||
        repLevels[x] > levelInfo.repLevel) {
      continue;
    }

    if (repLevels[x] == levelInfo.repLevel) {
      // A continuation of an existing list.
      // offsets can be null for structs with repeated children (we don't need
      // to know offsets until we get to the children).
      if (offsets != nullptr) {
        if (FOLLY_UNLIKELY(
                *offsets == std::numeric_limits<OffsetType>::max())) {
          VELOX_FAIL("List index overflow.");
        }
        *offsets += 1;
      }
    } else {
      // Check the bound before writing so that neither the bitmap nor the
      // offsets buffer is advanced past valuesReadUpperBound.
      if (FOLLY_UNLIKELY(
              (validBitsWriter.has_value() &&
               validBitsWriter->position() >= output->valuesReadUpperBound) ||
              (offsets - origPos) >= output->valuesReadUpperBound)) {
        VELOX_FAIL(
            "Definition levels exceeded upper bound: {}",
            output->valuesReadUpperBound);
      }

      // current_rep < list repLevel i.e. start of a list (ancestor empty lists
      // are filtered out above). offsets can be null for structs with repeated
      // children (we don't need to know offsets until we get to the children).
      if (offsets != nullptr) {
        ++offsets;
        // Use cumulative offsets because variable size lists are more common
        // than fixed size lists so it should be cheaper to make these
        // cumulative and subtract when validating fixed size lists.
        *offsets = *(offsets - 1);
        if (defLevels[x] >= levelInfo.defLevel) {
          if (FOLLY_UNLIKELY(
                  *offsets == std::numeric_limits<OffsetType>::max())) {
            VELOX_FAIL("List index overflow.");
          }
          *offsets += 1;
        }
      }

      if (validBitsWriter.has_value()) {
        // The levelInfo def level for lists reflects the element-present
        // level; the level just below it distinguishes an empty list (valid)
        // from a null one.
        if (defLevels[x] >= levelInfo.defLevel - 1) {
          validBitsWriter->Set();
        } else {
          output->nullCount++;
          validBitsWriter->Clear();
        }
        validBitsWriter->Next();
      }
    }
  }
  if (validBitsWriter.has_value()) {
    validBitsWriter->Finish();
  }
  // Prefer the offsets count; fall back to the bitmap position when no offsets
  // buffer was supplied.
  if (offsets != nullptr) {
    output->valuesRead = offsets - origPos;
  } else if (validBitsWriter.has_value()) {
    output->valuesRead = validBitsWriter->position();
  }
  if (output->nullCount > 0 && levelInfo.nullSlotUsage > 1) {
    VELOX_FAIL(
        "Null values with nullSlotUsage > 1 not supported."
        "(i.e. FixedSizeLists with null values are not supported)");
  }
}

} // namespace

/// Converts 'numDefLevels' definition levels into validity information in
/// 'output' by dispatching to DefLevelsToBitmapSimd, instantiated according to
/// whether levelInfo.repLevel is positive.
void DefLevelsToBitmap(
    const int16_t* defLevels,
    int64_t numDefLevels,
    LevelInfo levelInfo,
    ValidityBitmapInputOutput* output) {
  // It is simpler to rely on repLevel here until PARQUET-1899 is done and the
  // code is deleted in a follow-up release.
  const bool hasRepeatedParent = levelInfo.repLevel > 0;
  if (!hasRepeatedParent) {
    DefLevelsToBitmapSimd</*has_repeated_parent=*/false>(
        defLevels, numDefLevels, levelInfo, output);
    return;
  }
  DefLevelsToBitmapSimd</*has_repeated_parent=*/true>(
      defLevels, numDefLevels, levelInfo, output);
}

/// Test-only entry point that forwards both arguments to ExtractBitsSoftware
/// and returns its result.
uint64_t TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selectBitmap) {
  const uint64_t extracted = ExtractBitsSoftware(bitmap, selectBitmap);
  return extracted;
}

/// 32-bit offsets overload: forwards all arguments to DefRepLevelsToListInfo
/// with OffsetType deduced as int32_t from 'offsets'.
void DefRepLevelsToList(
    const int16_t* defLevels,
    const int16_t* repLevels,
    int64_t numDefLevels,
    LevelInfo levelInfo,
    ValidityBitmapInputOutput* output,
    int32_t* offsets) {
  // OffsetType is deduced from the pointer type of 'offsets'.
  DefRepLevelsToListInfo(
      defLevels, repLevels, numDefLevels, levelInfo, output, offsets);
}

/// 64-bit offsets overload: forwards all arguments to DefRepLevelsToListInfo
/// with OffsetType deduced as int64_t from 'offsets'.
void DefRepLevelsToList(
    const int16_t* defLevels,
    const int16_t* repLevels,
    int64_t numDefLevels,
    LevelInfo levelInfo,
    ValidityBitmapInputOutput* output,
    int64_t* offsets) {
  // OffsetType is deduced from the pointer type of 'offsets'.
  DefRepLevelsToListInfo(
      defLevels, repLevels, numDefLevels, levelInfo, output, offsets);
}

/// Produces only validity information in 'output' from the definition and
/// repetition levels by running DefRepLevelsToListInfo without an offsets
/// buffer.
void DefRepLevelsToBitmap(
    const int16_t* defLevels,
    const int16_t* repLevels,
    int64_t numDefLevels,
    LevelInfo levelInfo,
    ValidityBitmapInputOutput* output) {
  // DefRepLevelsToListInfo interprets levelInfo as describing the list itself,
  // while this method is for parent structs, so bump both the rep and def
  // level on the by-value copy before delegating.
  ++levelInfo.repLevel;
  ++levelInfo.defLevel;
  int32_t* const noOffsets = nullptr;
  DefRepLevelsToListInfo<int32_t>(
      defLevels, repLevels, numDefLevels, levelInfo, output, noOffsets);
}

} // namespace facebook::velox::parquet
Loading

0 comments on commit e983aac

Please sign in to comment.