From d1d4f1e1d528da18893b1630b49346ea637a16e2 Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Thu, 8 Feb 2024 21:39:01 -0800 Subject: [PATCH] Refactor Parquet Statistics (#8658) Summary: Remove Statistics.h/cpp and move contents to Metadata.cpp Pull Request resolved: https://github.com/facebookincubator/velox/pull/8658 Reviewed By: mbasmanova Differential Revision: D53542828 Pulled By: pedroerp fbshipit-source-id: 860984a2a79213766a1ed6a2a416221d94d9613f --- velox/dwio/parquet/reader/CMakeLists.txt | 1 - velox/dwio/parquet/reader/Metadata.cpp | 135 +++++++++++++++++- .../parquet/reader/ParquetColumnReader.cpp | 2 - velox/dwio/parquet/reader/ParquetData.cpp | 1 - velox/dwio/parquet/reader/Statistics.cpp | 113 --------------- velox/dwio/parquet/reader/Statistics.h | 84 ----------- 6 files changed, 133 insertions(+), 203 deletions(-) delete mode 100644 velox/dwio/parquet/reader/Statistics.cpp delete mode 100644 velox/dwio/parquet/reader/Statistics.h diff --git a/velox/dwio/parquet/reader/CMakeLists.txt b/velox/dwio/parquet/reader/CMakeLists.txt index 3fb5250b7e64..fbb38dd64eef 100644 --- a/velox/dwio/parquet/reader/CMakeLists.txt +++ b/velox/dwio/parquet/reader/CMakeLists.txt @@ -23,7 +23,6 @@ add_library( ParquetData.cpp RepeatedColumnReader.cpp RleBpDecoder.cpp - Statistics.cpp StructColumnReader.cpp StringColumnReader.cpp) diff --git a/velox/dwio/parquet/reader/Metadata.cpp b/velox/dwio/parquet/reader/Metadata.cpp index c0fa6ab7ca02..771e68e8a595 100644 --- a/velox/dwio/parquet/reader/Metadata.cpp +++ b/velox/dwio/parquet/reader/Metadata.cpp @@ -15,11 +15,142 @@ */ #include "velox/dwio/parquet/reader/Metadata.h" - -#include "velox/dwio/parquet/reader/Statistics.h" +#include "velox/dwio/parquet/thrift/ParquetThriftTypes.h" namespace facebook::velox::parquet { +template +inline const T load(const char* ptr) { + T ret; + std::memcpy(&ret, ptr, sizeof(ret)); + return ret; +} + +template +inline std::optional getMin(const thrift::Statistics& columnChunkStats) { + return columnChunkStats.__isset.min_value + ? load(columnChunkStats.min_value.data()) + : (columnChunkStats.__isset.min + ? std::optional(load(columnChunkStats.min.data())) + : std::nullopt); +} + +template +inline std::optional getMax(const thrift::Statistics& columnChunkStats) { + return columnChunkStats.__isset.max_value + ? std::optional(load(columnChunkStats.max_value.data())) + : (columnChunkStats.__isset.max + ? std::optional(load(columnChunkStats.max.data())) + : std::nullopt); +} + +template <> +inline std::optional getMin( + const thrift::Statistics& columnChunkStats) { + return columnChunkStats.__isset.min_value + ? std::optional(columnChunkStats.min_value) + : (columnChunkStats.__isset.min ? std::optional(columnChunkStats.min) + : std::nullopt); +} + +template <> +inline std::optional getMax( + const thrift::Statistics& columnChunkStats) { + return columnChunkStats.__isset.max_value + ? std::optional(columnChunkStats.max_value) + : (columnChunkStats.__isset.max ? std::optional(columnChunkStats.max) + : std::nullopt); +} + +std::unique_ptr buildColumnStatisticsFromThrift( + const thrift::Statistics& columnChunkStats, + const velox::Type& type, + uint64_t numRowsInRowGroup) { + std::optional nullCount = columnChunkStats.__isset.null_count + ? std::optional(columnChunkStats.null_count) + : std::nullopt; + std::optional valueCount = nullCount.has_value() + ? std::optional(numRowsInRowGroup - nullCount.value()) + : std::nullopt; + std::optional hasNull = columnChunkStats.__isset.null_count + ? std::optional(columnChunkStats.null_count > 0) + : std::nullopt; + + switch (type.kind()) { + case TypeKind::BOOLEAN: + return std::make_unique( + valueCount, hasNull, std::nullopt, std::nullopt, std::nullopt); + case TypeKind::TINYINT: + return std::make_unique( + valueCount, + hasNull, + std::nullopt, + std::nullopt, + getMin(columnChunkStats), + getMax(columnChunkStats), + std::nullopt); + case TypeKind::SMALLINT: + return std::make_unique( + valueCount, + hasNull, + std::nullopt, + std::nullopt, + getMin(columnChunkStats), + getMax(columnChunkStats), + std::nullopt); + case TypeKind::INTEGER: + return std::make_unique( + valueCount, + hasNull, + std::nullopt, + std::nullopt, + getMin(columnChunkStats), + getMax(columnChunkStats), + std::nullopt); + case TypeKind::BIGINT: + return std::make_unique( + valueCount, + hasNull, + std::nullopt, + std::nullopt, + getMin(columnChunkStats), + getMax(columnChunkStats), + std::nullopt); + case TypeKind::REAL: + return std::make_unique( + valueCount, + hasNull, + std::nullopt, + std::nullopt, + getMin(columnChunkStats), + getMax(columnChunkStats), + std::nullopt); + case TypeKind::DOUBLE: + return std::make_unique( + valueCount, + hasNull, + std::nullopt, + std::nullopt, + getMin(columnChunkStats), + getMax(columnChunkStats), + std::nullopt); + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: + return std::make_unique( + valueCount, + hasNull, + std::nullopt, + std::nullopt, + getMin(columnChunkStats), + getMax(columnChunkStats), + std::nullopt); + + default: + return std::make_unique( + valueCount, hasNull, std::nullopt, std::nullopt); + } +} + common::CompressionKind thriftCodecToCompressionKind( thrift::CompressionCodec::type codec) { switch (codec) { diff --git a/velox/dwio/parquet/reader/ParquetColumnReader.cpp b/velox/dwio/parquet/reader/ParquetColumnReader.cpp index ea3169ae727a..c3816c0e960a 100644 --- a/velox/dwio/parquet/reader/ParquetColumnReader.cpp +++ b/velox/dwio/parquet/reader/ParquetColumnReader.cpp @@ -25,10 +25,8 @@ #include "velox/dwio/parquet/reader/FloatingPointColumnReader.h" #include "velox/dwio/parquet/reader/IntegerColumnReader.h" #include "velox/dwio/parquet/reader/RepeatedColumnReader.h" -#include "velox/dwio/parquet/reader/Statistics.h" #include "velox/dwio/parquet/reader/StringColumnReader.h" #include "velox/dwio/parquet/reader/StructColumnReader.h" -#include "velox/dwio/parquet/thrift/ParquetThriftTypes.h" namespace facebook::velox::parquet { diff --git a/velox/dwio/parquet/reader/ParquetData.cpp b/velox/dwio/parquet/reader/ParquetData.cpp index 283190bbfb0a..a2688403ebcd 100644 --- a/velox/dwio/parquet/reader/ParquetData.cpp +++ b/velox/dwio/parquet/reader/ParquetData.cpp @@ -17,7 +17,6 @@ #include "velox/dwio/parquet/reader/ParquetData.h" #include "velox/dwio/common/BufferedInput.h" -#include "velox/dwio/parquet/reader/Statistics.h" namespace facebook::velox::parquet { diff --git a/velox/dwio/parquet/reader/Statistics.cpp b/velox/dwio/parquet/reader/Statistics.cpp deleted file mode 100644 index e7ee86a8b768..000000000000 --- a/velox/dwio/parquet/reader/Statistics.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "velox/dwio/parquet/reader/Statistics.h" - -#include "velox/dwio/common/Statistics.h" -#include "velox/type/Type.h" - -namespace facebook::velox::parquet { - -std::unique_ptr buildColumnStatisticsFromThrift( - const thrift::Statistics& columnChunkStats, - const velox::Type& type, - uint64_t numRowsInRowGroup) { - std::optional nullCount = columnChunkStats.__isset.null_count - ? std::optional(columnChunkStats.null_count) - : std::nullopt; - std::optional valueCount = nullCount.has_value() - ? std::optional(numRowsInRowGroup - nullCount.value()) - : std::nullopt; - std::optional hasNull = columnChunkStats.__isset.null_count - ? std::optional(columnChunkStats.null_count > 0) - : std::nullopt; - - switch (type.kind()) { - case TypeKind::BOOLEAN: - return std::make_unique( - valueCount, hasNull, std::nullopt, std::nullopt, std::nullopt); - case TypeKind::TINYINT: - return std::make_unique( - valueCount, - hasNull, - std::nullopt, - std::nullopt, - getMin(columnChunkStats), - getMax(columnChunkStats), - std::nullopt); - case TypeKind::SMALLINT: - return std::make_unique( - valueCount, - hasNull, - std::nullopt, - std::nullopt, - getMin(columnChunkStats), - getMax(columnChunkStats), - std::nullopt); - case TypeKind::INTEGER: - return std::make_unique( - valueCount, - hasNull, - std::nullopt, - std::nullopt, - getMin(columnChunkStats), - getMax(columnChunkStats), - std::nullopt); - case TypeKind::BIGINT: - return std::make_unique( - valueCount, - hasNull, - std::nullopt, - std::nullopt, - getMin(columnChunkStats), - getMax(columnChunkStats), - std::nullopt); - case TypeKind::REAL: - return std::make_unique( - valueCount, - hasNull, - std::nullopt, - std::nullopt, - getMin(columnChunkStats), - getMax(columnChunkStats), - std::nullopt); - case TypeKind::DOUBLE: - return std::make_unique( - valueCount, - hasNull, - std::nullopt, - std::nullopt, - getMin(columnChunkStats), - getMax(columnChunkStats), - std::nullopt); - case TypeKind::VARCHAR: - case TypeKind::VARBINARY: - return std::make_unique( - valueCount, - hasNull, - std::nullopt, - std::nullopt, - getMin(columnChunkStats), - getMax(columnChunkStats), - std::nullopt); - - default: - return std::make_unique( - valueCount, hasNull, std::nullopt, std::nullopt); - } -} - -} // namespace facebook::velox::parquet diff --git a/velox/dwio/parquet/reader/Statistics.h b/velox/dwio/parquet/reader/Statistics.h deleted file mode 100644 index 18f67d5b13b0..000000000000 --- a/velox/dwio/parquet/reader/Statistics.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "velox/dwio/parquet/thrift/ParquetThriftTypes.h" - -#include -#include - -namespace facebook::velox { -class Type; -} - -namespace facebook::velox::dwio::common { -class ColumnStatistics; -} - -namespace facebook::velox::parquet { - -// TODO: provide function to merge multiple Statistics into one - -template -inline const T load(const char* ptr) { - T ret; - std::memcpy(&ret, ptr, sizeof(ret)); - return ret; -} - -template -inline std::optional getMin(const thrift::Statistics& columnChunkStats) { - return columnChunkStats.__isset.min_value - ? load(columnChunkStats.min_value.data()) - : (columnChunkStats.__isset.min - ? std::optional(load(columnChunkStats.min.data())) - : std::nullopt); -} - -template -inline std::optional getMax(const thrift::Statistics& columnChunkStats) { - return columnChunkStats.__isset.max_value - ? std::optional(load(columnChunkStats.max_value.data())) - : (columnChunkStats.__isset.max - ? std::optional(load(columnChunkStats.max.data())) - : std::nullopt); -} - -template <> -inline std::optional getMin( - const thrift::Statistics& columnChunkStats) { - return columnChunkStats.__isset.min_value - ? std::optional(columnChunkStats.min_value) - : (columnChunkStats.__isset.min ? std::optional(columnChunkStats.min) - : std::nullopt); -} - -template <> -inline std::optional getMax( - const thrift::Statistics& columnChunkStats) { - return columnChunkStats.__isset.max_value - ? std::optional(columnChunkStats.max_value) - : (columnChunkStats.__isset.max ? std::optional(columnChunkStats.max) - : std::nullopt); -} - -std::unique_ptr buildColumnStatisticsFromThrift( - const thrift::Statistics& columnChunkStats, - const velox::Type& type, - uint64_t numRowsInRowGroup); - -} // namespace facebook::velox::parquet