From 4d2903d1a34187ec525798d224c93fd9795ba20c Mon Sep 17 00:00:00 2001 From: HarryLeeIBM Date: Wed, 23 Aug 2023 14:17:19 -0700 Subject: [PATCH] Fix parquet endian issue for s390x --- cpp/src/parquet/column_reader.cc | 2 +- cpp/src/parquet/encoding.cc | 22 ++++++++++++++++------ cpp/src/parquet/file_reader.cc | 4 ++-- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 3670af49fbfaa..10d97a49c97a0 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -117,7 +117,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level, if (data_size < 4) { throw ParquetException("Received invalid levels (corrupt data page?)"); } - num_bytes = ::arrow::util::SafeLoadAs(data); + num_bytes = ::arrow::bit_util::FromLittleEndian(::arrow::util::SafeLoadAs(data)); if (num_bytes < 0 || num_bytes > data_size - 4) { throw ParquetException("Received invalid number of bytes (corrupt data page?)"); } diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b9472d72aebbd..069ab52805307 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1019,7 +1019,7 @@ int PlainDecoder::DecodeArrow( VisitNullBitmapInline( valid_bits, valid_bits_offset, num_values, null_count, [&]() { - builder->UnsafeAppend(::arrow::util::SafeLoadAs(data_)); + builder->UnsafeAppend(::arrow::bit_util::FromLittleEndian(::arrow::util::SafeLoadAs(data_))); data_ += sizeof(value_type); }, [&]() { builder->UnsafeAppendNull(); }); @@ -1047,7 +1047,7 @@ int PlainDecoder::DecodeArrow( valid_bits, valid_bits_offset, num_values, null_count, [&]() { PARQUET_THROW_NOT_OK( - builder->Append(::arrow::util::SafeLoadAs(data_))); + builder->Append(::arrow::bit_util::FromLittleEndian(::arrow::util::SafeLoadAs(data_)))); data_ += sizeof(value_type); }, [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); }); @@ -1067,7 +1067,17 @@ inline int DecodePlain(const uint8_t* data, 
int64_t data_size, int num_values, } // If bytes_to_decode == 0, data could be null if (bytes_to_decode > 0) { +#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + for (int i = 0; i < num_values; ++i) + { + memcpy(out + i, data + sizeof(T) * i, sizeof(T)); + auto begin = reinterpret_cast(out + i); + auto end = begin + sizeof(T); + std::reverse(begin, end); + } +#else /* host not detected as big-endian: copy bytes unchanged */ memcpy(out, data, bytes_to_decode); +#endif } return static_cast(bytes_to_decode); } @@ -1090,7 +1100,7 @@ static inline int64_t ReadByteArray(const uint8_t* data, int64_t data_size, if (ARROW_PREDICT_FALSE(data_size < 4)) { ParquetException::EofException(); } - const int32_t len = ::arrow::util::SafeLoadAs(data); + const int32_t len = ::arrow::bit_util::FromLittleEndian(::arrow::util::SafeLoadAs(data)); if (len < 0) { throw ParquetException("Invalid BYTE_ARRAY value"); } @@ -1379,7 +1389,7 @@ class PlainByteArrayDecoder : public PlainDecoder, if (ARROW_PREDICT_FALSE(len_ < 4)) { ParquetException::EofException(); } - auto value_len = ::arrow::util::SafeLoadAs(data_); + auto value_len = ::arrow::bit_util::FromLittleEndian(::arrow::util::SafeLoadAs(data_)); if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); } @@ -1425,7 +1435,7 @@ class PlainByteArrayDecoder : public PlainDecoder, if (ARROW_PREDICT_FALSE(len_ < 4)) { ParquetException::EofException(); } - auto value_len = ::arrow::util::SafeLoadAs(data_); + auto value_len = ::arrow::bit_util::FromLittleEndian(::arrow::util::SafeLoadAs(data_)); if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) { return Status::Invalid("Invalid or corrupted value_len '", value_len, "'"); } @@ -2984,7 +2994,7 @@ int ByteStreamSplitDecoder::DecodeArrow( const size_t byte_index = b * num_values_in_buffer_ + offset; gathered_byte_data[b] = data[byte_index]; } - builder->UnsafeAppend(::arrow::util::SafeLoadAs(&gathered_byte_data[0])); + 
builder->UnsafeAppend(::arrow::bit_util::FromLittleEndian(::arrow::util::SafeLoadAs(&gathered_byte_data[0]))); ++offset; }, [&]() { builder->UnsafeAppendNull(); }); diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 520317539b565..aaddfb8e83c81 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -415,9 +415,9 @@ class SerializedFile : public ParquetFileReader::Contents { "is not a parquet file."); } // Both encrypted/unencrypted footers have the same footer length check. - uint32_t metadata_len = ::arrow::util::SafeLoadAs( + uint32_t metadata_len = ::arrow::bit_util::FromLittleEndian(::arrow::util::SafeLoadAs( reinterpret_cast(footer_buffer->data()) + footer_read_size - - kFooterSize); + kFooterSize)); if (metadata_len > source_size_ - kFooterSize) { throw ParquetInvalidOrCorruptedFileException( "Parquet file size is ", source_size_,