From 96abf91c3b92da95c99b5ff6c35967693c56dcf7 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Tue, 11 Jun 2024 07:51:35 +0000 Subject: [PATCH 01/26] GH-42102: [C++] Add binary that extracts a footer from a parquet file Usage from the file binary itself: ``` Usage: parquet-dump-footer -h|--help Print help and exit --no-scrub Do not scrub potentially PII metadata --json Output JSON instead of binary --in Input file: required --out Output file: defaults to stdout Dumps the footer of a Parquet file to stdout or a file, optionally with potentially PII metadata scrubbed. ``` --- cpp/tools/parquet/CMakeLists.txt | 3 +- cpp/tools/parquet/parquet_dump_footer.cc | 230 +++++++++++++++++++++++ 2 files changed, 232 insertions(+), 1 deletion(-) create mode 100644 cpp/tools/parquet/parquet_dump_footer.cc diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index 81ab49421d0f6..c60d84ecdcf73 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -16,7 +16,7 @@ # under the License. if(PARQUET_BUILD_EXECUTABLES) - set(PARQUET_TOOLS parquet-dump-schema parquet-reader parquet-scan) + set(PARQUET_TOOLS parquet-dump-schema parquet-reader parquet-scan parquet-dump-footer) foreach(TOOL ${PARQUET_TOOLS}) string(REGEX REPLACE "-" "_" TOOL_SOURCE ${TOOL}) @@ -31,6 +31,7 @@ if(PARQUET_BUILD_EXECUTABLES) install(TARGETS ${TOOL} ${INSTALL_IS_OPTIONAL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endforeach(TOOL) + target_link_libraries(parquet-dump-footer thrift) add_dependencies(parquet ${PARQUET_TOOLS}) endif() diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc new file mode 100644 index 0000000000000..c5ed687851046 --- /dev/null +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -0,0 +1,230 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "parquet_types.h" + +using apache::thrift::protocol::TCompactProtocol; +using apache::thrift::transport::TMemoryBuffer; +using apache::thrift::transport::TTransport; + +namespace { +int PrintHelp() { + std::cerr << R"( +Usage: parquet-dump-footer + -h|--help Print help and exit + --no-scrub Do not scrub potentially PII metadata + --json Output JSON instead of binary + --in Input file: required + --out Output file: defaults to stdout + + Dumps the footer of a Parquet file to stdout or a file, optionally with + potentially PII metadata scrubbed. +)"; + return 1; +} + +uint32_t ReadLE32(const void* p) { + auto* b = reinterpret_cast(p); + return uint32_t{b[3]} << 24 | uint32_t{b[2]} << 16 | uint32_t{b[1]} << 8 | + uint32_t{b[0]}; +} + +void AppendLE32(uint32_t v, std::string* out) { + out->push_back(v & 0xff); + out->push_back((v >> 8) & 0xff); + out->push_back((v >> 16) & 0xff); + out->push_back((v >> 24) & 0xff); +} + +std::pair MmapFile(const std::string& fname) { + int fd = open(fname.c_str(), O_RDONLY); + if (fd < 0) return {nullptr, 0}; + struct stat st; + if (fstat(fd, &st)) return {nullptr, 0}; + size_t sz = st.st_size; + void* map = mmap(nullptr, sz, PROT_READ, MAP_SHARED, fd, 0); + return {map == MAP_FAILED ? nullptr : static_cast(map), sz}; +} + +template +bool Deserialize(const char* data, size_t len, T* obj) { + TMemoryBuffer buf(reinterpret_cast(const_cast(data)), len); + TCompactProtocol proto(std::shared_ptr(&buf, [](auto*) {})); + try { + obj->read(&proto); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to deserialize: " << e.what() << "\n"; + return false; + } +} + +template +bool Serialize(const T& obj, std::string* out) { + TMemoryBuffer buf; + TCompactProtocol proto(std::shared_ptr(&buf, [](auto*) {})); + try { + obj.write(&proto); + uint8_t* data; + uint32_t len; + buf.getBuffer(&data, &len); + out->assign(reinterpret_cast(data), len); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to serialize: " << e.what() << "\n"; + return false; + } +} + +void Scrub(std::string* s) { + static char pool[4096]; + static std::mt19937 rng(std::random_device{}()); + static const bool kPoolInit = [] { + std::uniform_int_distribution<> caps(65, 90); + for (size_t i = 0; i < sizeof(pool); i++) pool[i] = caps(rng); + return true; + }(); + (void)kPoolInit; + + const size_t n = s->size(); + s->clear(); + while (s->size() < n) { + size_t m = std::min(n, sizeof(pool) / 2); + std::uniform_int_distribution<> start(0, sizeof(pool) / 2); + s->append(&pool[start(rng)], m); + } +} + +void Scrub(parquet::format::FileMetaData* md) { + for (auto& s : md->schema) { + Scrub(&s.name); + } + for (auto& r : md->row_groups) { + for (auto& c : r.columns) { + Scrub(&c.file_path); + if (c.__isset.meta_data) { + auto& m = c.meta_data; + for (auto& p : m.path_in_schema) Scrub(&p); + for (auto& kv : m.key_value_metadata) { + Scrub(&kv.key); + Scrub(&kv.value); + } + Scrub(&m.statistics.max_value); + Scrub(&m.statistics.min_value); + Scrub(&m.statistics.min); + Scrub(&m.statistics.max); + } + + if (c.crypto_metadata.__isset.ENCRYPTION_WITH_COLUMN_KEY) { + auto& m = c.crypto_metadata.ENCRYPTION_WITH_COLUMN_KEY; + for (auto& p : m.path_in_schema) Scrub(&p); + Scrub(&m.key_metadata); + } + Scrub(&c.encrypted_column_metadata); + } + } + for (auto& kv : md->key_value_metadata) { + Scrub(&kv.key); + Scrub(&kv.value); + } + Scrub(&md->footer_signing_key_metadata); +} +} // namespace + +int main(int argc, char** argv) { + bool help = false; + bool scrub = true; + bool json = false; + std::string in; + std::string out; + for (int i = 1; i < argc; i++) { + char* arg = argv[i]; + help |= !std::strcmp(arg, "-h") || !std::strcmp(arg, "--help"); + scrub &= std::strcmp(arg, "--no-scrub"); + json |= !std::strcmp(arg, "--json"); + if (!std::strcmp(arg, "--in")) { + if (i + 1 >= argc) return PrintHelp(); + in = argv[++i]; + } + if (!std::strcmp(arg, "--out")) { + if (i + 1 >= argc) return PrintHelp(); + out = argv[++i]; + } + } + if (help || in.empty()) return PrintHelp(); + auto [data, len] = MmapFile(in); + + if (len == 0) { + std::cerr << "Failed to read file: " << in << "\n"; + return 3; + } + if (len < 8 || ReadLE32(data + len - 4) != ReadLE32("PAR1")) { + std::cerr << "Not a Parquet file: " << in << "\n"; + return 4; + } + size_t footer_len = ReadLE32(data + len - 8); + if (footer_len > len - 8) { + std::cerr << "Invalid footer length: " << footer_len << "\n"; + return 5; + } + char* footer = data + len - 8 - footer_len; + parquet::format::FileMetaData md; + if (!Deserialize(footer, footer_len, &md)) return 5; + if (scrub) Scrub(&md); + + std::string ser; + if (json) { + if (out.empty()) { + md.printTo(std::cout); + } else { + std::fstream fout(out, std::ios::out); + md.printTo(fout); + } + } else { + int out_fd = out.empty() ? 1 : open(out.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (out_fd < 0) { + std::cerr << "Failed to open output file: " << out << "\n"; + return 2; + } + if (!Serialize(md, &ser)) return 6; + AppendLE32(ser.size(), &ser); + ser.append("PAR1", 4); + if (write(out_fd, ser.data(), ser.size()) != static_cast(ser.size())) { + std::cerr << "Failed to write to output file: " << out << "\n"; + return 7; + } + close(out_fd); + } + + return 0; +} From 20b7c8497e64bb7b7acbbcbccba08ff531b4b2c1 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Mon, 17 Jun 2024 14:40:40 +0000 Subject: [PATCH 02/26] use internal thrift serde --- cpp/tools/parquet/parquet_dump_footer.cc | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc index c5ed687851046..66a070c29de6b 100644 --- a/cpp/tools/parquet/parquet_dump_footer.cc +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -32,6 +32,7 @@ #include #include "parquet_types.h" +#include "parquet/thrift_internal.h" using apache::thrift::protocol::TCompactProtocol; using apache::thrift::transport::TMemoryBuffer; @@ -77,11 +78,10 @@ std::pair MmapFile(const std::string& fname) { } template -bool Deserialize(const char* data, size_t len, T* obj) { - TMemoryBuffer buf(reinterpret_cast(const_cast(data)), len); - TCompactProtocol proto(std::shared_ptr(&buf, [](auto*) {})); +bool Deserialize(const char* data, uint32_t len, T* obj) { + parquet::ThriftDeserializer des(10 << 20, 10 << 20); try { - obj->read(&proto); + des.DeserializeMessage(reinterpret_cast(data), &len, obj); return true; } catch (const std::exception& e) { std::cerr << "Failed to deserialize: " << e.what() << "\n"; @@ -91,14 +91,9 @@ bool Deserialize(const char* data, size_t len, T* obj) { template bool Serialize(const T& obj, std::string* out) { - TMemoryBuffer buf; - TCompactProtocol proto(std::shared_ptr(&buf, [](auto*) {})); + parquet::ThriftSerializer ser(10 << 20); try { - obj.write(&proto); - uint8_t* data; - uint32_t len; - buf.getBuffer(&data, &len); - out->assign(reinterpret_cast(data), len); + ser.SerializeToString(&obj, out); return true; } catch (const std::exception& e) { std::cerr << "Failed to serialize: " << e.what() << "\n"; From d52c89ab4df6f9975e5a5d1abc23434e9914877f Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Tue, 18 Jun 2024 08:16:13 +0000 Subject: [PATCH 03/26] use arrow filesystem --- cpp/tools/parquet/CMakeLists.txt | 2 +- cpp/tools/parquet/parquet_dump_footer.cc | 106 ++++++++++++++--------- 2 files changed, 64 insertions(+), 44 deletions(-) diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index c60d84ecdcf73..ee9dacac3bd34 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -31,7 +31,7 @@ if(PARQUET_BUILD_EXECUTABLES) install(TARGETS ${TOOL} ${INSTALL_IS_OPTIONAL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endforeach(TOOL) - target_link_libraries(parquet-dump-footer thrift) + target_link_libraries(parquet-dump-footer thrift::thrift arrow_static) add_dependencies(parquet ${PARQUET_TOOLS}) endif() diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc index 66a070c29de6b..8c7d1d8265369 100644 --- a/cpp/tools/parquet/parquet_dump_footer.cc +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -15,24 +15,21 @@ // specific language governing permissions and limitations // under the License. -#include -#include -#include -#include - #include #include #include #include #include +#include +#include #include -#include #include #include -#include "parquet_types.h" +#include "arrow/filesystem/filesystem.h" #include "parquet/thrift_internal.h" +#include "parquet_types.h" using apache::thrift::protocol::TCompactProtocol; using apache::thrift::transport::TMemoryBuffer; @@ -67,16 +64,6 @@ void AppendLE32(uint32_t v, std::string* out) { out->push_back((v >> 24) & 0xff); } -std::pair MmapFile(const std::string& fname) { - int fd = open(fname.c_str(), O_RDONLY); - if (fd < 0) return {nullptr, 0}; - struct stat st; - if (fstat(fd, &st)) return {nullptr, 0}; - size_t sz = st.st_size; - void* map = mmap(nullptr, sz, PROT_READ, MAP_SHARED, fd, 0); - return {map == MAP_FAILED ? nullptr : static_cast(map), sz}; -} - template bool Deserialize(const char* data, uint32_t len, T* obj) { parquet::ThriftDeserializer des(10 << 20, 10 << 20); @@ -154,6 +141,22 @@ void Scrub(parquet::format::FileMetaData* md) { } Scrub(&md->footer_signing_key_metadata); } + +// Returns: +// - 0 on success +// - -1 on error +// - the size of the footer if tail is too small +int64_t ParseFooter(const std::string& tail, parquet::format::FileMetaData* md) { + if (tail.size() > std::numeric_limits::max()) return -1; + + const char* p = tail.data(); + const int32_t n = static_cast(tail.size()); + int32_t len = ReadLE32(p + n - 8); + if (len > n - 8) return len; + + if (!Deserialize(tail.data() + n - 8 - len, len, md)) return -1; + return 0; +} } // namespace int main(int argc, char** argv) { @@ -165,7 +168,7 @@ int main(int argc, char** argv) { for (int i = 1; i < argc; i++) { char* arg = argv[i]; help |= !std::strcmp(arg, "-h") || !std::strcmp(arg, "--help"); - scrub &= std::strcmp(arg, "--no-scrub"); + scrub &= !!std::strcmp(arg, "--no-scrub"); json |= !std::strcmp(arg, "--json"); if (!std::strcmp(arg, "--in")) { if (i + 1 >= argc) return PrintHelp(); @@ -177,48 +180,65 @@ int main(int argc, char** argv) { } } if (help || in.empty()) return PrintHelp(); - auto [data, len] = MmapFile(in); - - if (len == 0) { - std::cerr << "Failed to read file: " << in << "\n"; + std::string path; + auto fs = arrow::fs::FileSystemFromUriOrPath(in, &path).ValueOrDie(); + auto file = fs->OpenInputFile(path).ValueOrDie(); + int64_t file_len = file->GetSize().ValueOrDie(); + if (file_len < 8) { + std::cerr << "File too short: " << in << "\n"; return 3; } - if (len < 8 || ReadLE32(data + len - 4) != ReadLE32("PAR1")) { + int64_t tail_len = std::min(file_len, int64_t{1} << 20); + std::string tail; + tail.resize(tail_len); + char* data = tail.data(); + file->ReadAt(file_len - tail_len, tail_len, data).ValueOrDie(); + if (ReadLE32(data + tail_len - 4) != ReadLE32("PAR1")) { std::cerr << "Not a Parquet file: " << in << "\n"; return 4; } - size_t footer_len = ReadLE32(data + len - 8); - if (footer_len > len - 8) { - std::cerr << "Invalid footer length: " << footer_len << "\n"; + parquet::format::FileMetaData md; + int64_t res = ParseFooter(tail, &md); + if (res < 0) { + std::cerr << "Failed to parse footer: " << in << "\n"; return 5; + } else if (res > 0) { + if (res > file_len) { + std::cerr << "File too short: " << in << "\n"; + return 6; + } + tail_len = res + 8; + tail.resize(tail_len); + data = tail.data(); + file->ReadAt(file_len - tail_len, tail_len, data).ValueOrDie(); } - char* footer = data + len - 8 - footer_len; - parquet::format::FileMetaData md; - if (!Deserialize(footer, footer_len, &md)) return 5; + if (ParseFooter(tail, &md) != 0) { + std::cerr << "Failed to parse footer: " << in << "\n"; + return 7; + } + if (scrub) Scrub(&md); - std::string ser; + std::optional fout; if (json) { - if (out.empty()) { - md.printTo(std::cout); - } else { - std::fstream fout(out, std::ios::out); - md.printTo(fout); - } + if (!out.empty()) fout.emplace(out, std::ios::out); + std::ostream& os = fout ? *fout : std::cout; + md.printTo(os); } else { - int out_fd = out.empty() ? 1 : open(out.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644); - if (out_fd < 0) { + if (!out.empty()) fout.emplace(out, std::ios::out | std::ios::binary); + std::ostream& os = fout ? *fout : std::cout; + if (!os) { std::cerr << "Failed to open output file: " << out << "\n"; - return 2; + return 8; } + std::string ser; if (!Serialize(md, &ser)) return 6; - AppendLE32(ser.size(), &ser); + AppendLE32(static_cast(ser.size()), &ser); ser.append("PAR1", 4); - if (write(out_fd, ser.data(), ser.size()) != static_cast(ser.size())) { + if (!os.write(ser.data(), ser.size())) { std::cerr << "Failed to write to output file: " << out << "\n"; - return 7; + return 9; } - close(out_fd); } return 0; From 0f06b4756d031aa21782b4006abc5b204609f092 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Wed, 19 Jun 2024 06:58:49 +0000 Subject: [PATCH 04/26] satisfy cpplint --- cpp/tools/parquet/parquet_dump_footer.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc index 8c7d1d8265369..e91a03e20fb1f 100644 --- a/cpp/tools/parquet/parquet_dump_footer.cc +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -29,7 +29,7 @@ #include "arrow/filesystem/filesystem.h" #include "parquet/thrift_internal.h" -#include "parquet_types.h" +#include "generated/parquet_types.h" using apache::thrift::protocol::TCompactProtocol; using apache::thrift::transport::TMemoryBuffer; @@ -90,7 +90,7 @@ bool Serialize(const T& obj, std::string* out) { void Scrub(std::string* s) { static char pool[4096]; - static std::mt19937 rng(std::random_device{}()); + static std::mt19937 rng(std::random_device {}()); static const bool kPoolInit = [] { std::uniform_int_distribution<> caps(65, 90); for (size_t i = 0; i < sizeof(pool); i++) pool[i] = caps(rng); From 9b50f6bacdb37f089961d58a770b9e4cf406ba53 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Tue, 25 Jun 2024 20:17:08 +0000 Subject: [PATCH 05/26] fixes --- cpp/tools/parquet/parquet_dump_footer.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc index e91a03e20fb1f..00647ecfbc7c4 100644 --- a/cpp/tools/parquet/parquet_dump_footer.cc +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -28,6 +28,7 @@ #include #include "arrow/filesystem/filesystem.h" +#include "arrow/util/endian.h" #include "parquet/thrift_internal.h" #include "generated/parquet_types.h" @@ -52,21 +53,20 @@ Usage: parquet-dump-footer } uint32_t ReadLE32(const void* p) { - auto* b = reinterpret_cast(p); - return uint32_t{b[3]} << 24 | uint32_t{b[2]} << 16 | uint32_t{b[1]} << 8 | - uint32_t{b[0]}; + uint32_t x; + memcpy(&x, p, sizeof(x)); + return arrow::bit_util::FromLittleEndian(x); } void AppendLE32(uint32_t v, std::string* out) { - out->push_back(v & 0xff); - out->push_back((v >> 8) & 0xff); - out->push_back((v >> 16) & 0xff); - out->push_back((v >> 24) & 0xff); + v = arrow::bit_util::ToLittleEndian(v); + out->append(reinterpret_cast(&v), sizeof(v)); } template bool Deserialize(const char* data, uint32_t len, T* obj) { - parquet::ThriftDeserializer des(10 << 20, 10 << 20); + parquet::ThriftDeserializer des(/*string_size_limit=*/10 << 20, + /*container_size_limit=*/10 << 20); try { des.DeserializeMessage(reinterpret_cast(data), &len, obj); return true; @@ -78,7 +78,7 @@ bool Deserialize(const char* data, uint32_t len, T* obj) { template bool Serialize(const T& obj, std::string* out) { - parquet::ThriftSerializer ser(10 << 20); + parquet::ThriftSerializer ser(/*initial_buffer_size=*/10 << 20); try { ser.SerializeToString(&obj, out); return true; @@ -88,6 +88,7 @@ bool Serialize(const T& obj, std::string* out) { } } +// Replace the contents of s with random data of the same length. void Scrub(std::string* s) { static char pool[4096]; static std::mt19937 rng(std::random_device {}()); From b7b7ee075000e1b622f580c85403848b2b189276 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Thu, 11 Jul 2024 07:42:14 +0000 Subject: [PATCH 06/26] . --- cpp/tools/parquet/parquet_dump_footer.cc | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc index 00647ecfbc7c4..11cf24ec0668d 100644 --- a/cpp/tools/parquet/parquet_dump_footer.cc +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -29,6 +29,7 @@ #include "arrow/filesystem/filesystem.h" #include "arrow/util/endian.h" +#include "arrow/util/ubsan.h" #include "parquet/thrift_internal.h" #include "generated/parquet_types.h" @@ -53,8 +54,7 @@ Usage: parquet-dump-footer } uint32_t ReadLE32(const void* p) { - uint32_t x; - memcpy(&x, p, sizeof(x)); + uint32_t x = arrow::util::SafeLoadAs(static_cast(p)); return arrow::bit_util::FromLittleEndian(x); } @@ -65,8 +65,8 @@ void AppendLE32(uint32_t v, std::string* out) { template bool Deserialize(const char* data, uint32_t len, T* obj) { - parquet::ThriftDeserializer des(/*string_size_limit=*/10 << 20, - /*container_size_limit=*/10 << 20); + parquet::ThriftDeserializer des(/*string_size_limit=*/1 << 30, + /*container_size_limit=*/1 << 30); try { des.DeserializeMessage(reinterpret_cast(data), &len, obj); return true; @@ -102,9 +102,11 @@ void Scrub(std::string* s) { const size_t n = s->size(); s->clear(); while (s->size() < n) { - size_t m = std::min(n, sizeof(pool) / 2); - std::uniform_int_distribution<> start(0, sizeof(pool) / 2); - s->append(&pool[start(rng)], m); + // To avoid repeating patterns we start somewhere up to halfway through the pool and + // append up to half the pool chars. + std::uniform_int_distribution half(0, sizeof(pool) / 2 - 1); + size_t m = std::min(n, half(rng) + 1 /* at least one */); + s->append(&pool[half(rng)], m); } } From 5193418460afb89505af8caf16db45495b4948b8 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Thu, 11 Jul 2024 07:59:22 +0000 Subject: [PATCH 07/26] . --- cpp/tools/parquet/CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index ee9dacac3bd34..f064c5d50679e 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -31,7 +31,12 @@ if(PARQUET_BUILD_EXECUTABLES) install(TARGETS ${TOOL} ${INSTALL_IS_OPTIONAL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endforeach(TOOL) - target_link_libraries(parquet-dump-footer thrift::thrift arrow_static) + target_link_libraries(parquet-dump-footer thrift::thrift) + if(ARROW_BUILD_SHARED) + target_link_libraries(parquet-dump-footer arrow_shared) + else() + target_link_libraries(parquet-dump-footer arrow_static) + endif() add_dependencies(parquet ${PARQUET_TOOLS}) endif() From 17068680a323857067aa225c230b1a29cee38b3e Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Thu, 11 Jul 2024 08:06:45 +0000 Subject: [PATCH 08/26] . --- cpp/tools/parquet/CMakeLists.txt | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index f064c5d50679e..5ca24d0adea40 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -31,12 +31,7 @@ if(PARQUET_BUILD_EXECUTABLES) install(TARGETS ${TOOL} ${INSTALL_IS_OPTIONAL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endforeach(TOOL) - target_link_libraries(parquet-dump-footer thrift::thrift) - if(ARROW_BUILD_SHARED) - target_link_libraries(parquet-dump-footer arrow_shared) - else() - target_link_libraries(parquet-dump-footer arrow_static) - endif() + target_link_libraries(parquet-dump-footer thrift::thrift ${ARROW_LIBRARIES}) add_dependencies(parquet ${PARQUET_TOOLS}) endif() From 4e8cd33af94dd96f97fdeac9d36c93723dd6bce0 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Wed, 17 Jul 2024 05:57:42 +0000 Subject: [PATCH 09/26] . --- cpp/tools/parquet/parquet_dump_footer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc index 11cf24ec0668d..ee2392c18b50d 100644 --- a/cpp/tools/parquet/parquet_dump_footer.cc +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -48,7 +48,7 @@ Usage: parquet-dump-footer --out Output file: defaults to stdout Dumps the footer of a Parquet file to stdout or a file, optionally with - potentially PII metadata scrubbed. + potentially user specific metadata scrubbed. )"; return 1; } From 515c4017f82f88ee5c11afd261206ce8bf212c37 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Wed, 17 Jul 2024 07:36:20 +0000 Subject: [PATCH 10/26] . --- cpp/tools/parquet/parquet_dump_footer.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc index ee2392c18b50d..95f22ab66ace3 100644 --- a/cpp/tools/parquet/parquet_dump_footer.cc +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -30,8 +30,8 @@ #include "arrow/filesystem/filesystem.h" #include "arrow/util/endian.h" #include "arrow/util/ubsan.h" -#include "parquet/thrift_internal.h" #include "generated/parquet_types.h" +#include "parquet/thrift_internal.h" using apache::thrift::protocol::TCompactProtocol; using apache::thrift::transport::TMemoryBuffer; @@ -91,7 +91,7 @@ bool Serialize(const T& obj, std::string* out) { // Replace the contents of s with random data of the same length. void Scrub(std::string* s) { static char pool[4096]; - static std::mt19937 rng(std::random_device {}()); + static std::mt19937 rng(std::random_device{}()); static const bool kPoolInit = [] { std::uniform_int_distribution<> caps(65, 90); for (size_t i = 0; i < sizeof(pool); i++) pool[i] = caps(rng); From 4de849d2532f33dbdde2458834a07126f315ba30 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Wed, 17 Jul 2024 07:36:24 +0000 Subject: [PATCH 11/26] Revert "." This reverts commit 7d896e49b0de076e2010fa50fff3b4c23788144b. --- cpp/tools/parquet/CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index 5ca24d0adea40..f064c5d50679e 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -31,7 +31,12 @@ if(PARQUET_BUILD_EXECUTABLES) install(TARGETS ${TOOL} ${INSTALL_IS_OPTIONAL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endforeach(TOOL) - target_link_libraries(parquet-dump-footer thrift::thrift ${ARROW_LIBRARIES}) + target_link_libraries(parquet-dump-footer thrift::thrift) + if(ARROW_BUILD_SHARED) + target_link_libraries(parquet-dump-footer arrow_shared) + else() + target_link_libraries(parquet-dump-footer arrow_static) + endif() add_dependencies(parquet ${PARQUET_TOOLS}) endif() From 164331981b54cae1b85435a1ac1acb97fcd51ce3 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Wed, 17 Jul 2024 08:17:48 +0000 Subject: [PATCH 12/26] . --- cpp/tools/parquet/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index f064c5d50679e..2060531aab41d 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -31,12 +31,12 @@ if(PARQUET_BUILD_EXECUTABLES) install(TARGETS ${TOOL} ${INSTALL_IS_OPTIONAL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endforeach(TOOL) - target_link_libraries(parquet-dump-footer thrift::thrift) if(ARROW_BUILD_SHARED) target_link_libraries(parquet-dump-footer arrow_shared) else() target_link_libraries(parquet-dump-footer arrow_static) endif() + target_link_libraries(parquet-dump-footer thrift::thrift) add_dependencies(parquet ${PARQUET_TOOLS}) endif() From a2d492906223762a0553f1873f4e90da90ae845f Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Thu, 18 Jul 2024 07:26:51 +0000 Subject: [PATCH 13/26] remove arrow linking --- cpp/tools/parquet/CMakeLists.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index 2060531aab41d..05edeaf148c86 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -31,11 +31,6 @@ if(PARQUET_BUILD_EXECUTABLES) install(TARGETS ${TOOL} ${INSTALL_IS_OPTIONAL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endforeach(TOOL) - if(ARROW_BUILD_SHARED) - target_link_libraries(parquet-dump-footer arrow_shared) - else() - target_link_libraries(parquet-dump-footer arrow_static) - endif() target_link_libraries(parquet-dump-footer thrift::thrift) add_dependencies(parquet ${PARQUET_TOOLS}) From 60ddd1110897035613eebe6c56d44df6a98f39e1 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Thu, 18 Jul 2024 09:24:52 +0000 Subject: [PATCH 14/26] only build footer dump tool when statically building --- cpp/tools/parquet/CMakeLists.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index 05edeaf148c86..6465824107702 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -16,7 +16,7 @@ # under the License. if(PARQUET_BUILD_EXECUTABLES) - set(PARQUET_TOOLS parquet-dump-schema parquet-reader parquet-scan parquet-dump-footer) + set(PARQUET_TOOLS parquet-dump-schema parquet-reader parquet-scan) foreach(TOOL ${PARQUET_TOOLS}) string(REGEX REPLACE "-" "_" TOOL_SOURCE ${TOOL}) @@ -31,7 +31,13 @@ if(PARQUET_BUILD_EXECUTABLES) install(TARGETS ${TOOL} ${INSTALL_IS_OPTIONAL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endforeach(TOOL) - target_link_libraries(parquet-dump-footer thrift::thrift) + + # Only build parquet-dump-footer when statically linking. + if(ARROW_BUILD_SHARED) + else() + add_executable(parquet-dump-footer parquet-dump-footer.cc) + target_link_libraries(parquet-dump-footer parquet_static thrift::thrift) + endif() add_dependencies(parquet ${PARQUET_TOOLS}) endif() From d89ce2cca3ffbcb74af43751f02fe0a14803c41f Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Thu, 18 Jul 2024 09:30:36 +0000 Subject: [PATCH 15/26] . --- cpp/tools/parquet/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index 6465824107702..f3da282191d21 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -37,6 +37,10 @@ if(PARQUET_BUILD_EXECUTABLES) else() add_executable(parquet-dump-footer parquet-dump-footer.cc) target_link_libraries(parquet-dump-footer parquet_static thrift::thrift) + set_target_properties(parquet-dump-footer PROPERTIES + INSTALL_RPATH_USE_LINK_PATH TRUE) + install(TARGETS parquet-dump-footer ${INSTALL_IS_OPTIONAL} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() add_dependencies(parquet ${PARQUET_TOOLS}) From deab8ceb92b7bfa7e9ed15d4b5f9bf4041590fad Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Thu, 18 Jul 2024 11:48:47 +0000 Subject: [PATCH 16/26] . --- cpp/tools/parquet/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index f3da282191d21..b56337cc36571 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -34,11 +34,11 @@ if(PARQUET_BUILD_EXECUTABLES) # Only build parquet-dump-footer when statically linking. if(ARROW_BUILD_SHARED) + else() add_executable(parquet-dump-footer parquet-dump-footer.cc) target_link_libraries(parquet-dump-footer parquet_static thrift::thrift) - set_target_properties(parquet-dump-footer PROPERTIES - INSTALL_RPATH_USE_LINK_PATH TRUE) + set_target_properties(parquet-dump-footer PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) install(TARGETS parquet-dump-footer ${INSTALL_IS_OPTIONAL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() From 630550b7307afac854e61fb49fb863776862a6ca Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Thu, 18 Jul 2024 14:51:02 +0000 Subject: [PATCH 17/26] . --- cpp/tools/parquet/parquet_dump_footer.cc | 106 +++++++++++------------ 1 file changed, 49 insertions(+), 57 deletions(-) diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc index 95f22ab66ace3..ac4351be3f3f4 100644 --- a/cpp/tools/parquet/parquet_dump_footer.cc +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -37,29 +37,15 @@ using apache::thrift::protocol::TCompactProtocol; using apache::thrift::transport::TMemoryBuffer; using apache::thrift::transport::TTransport; +namespace parquet { namespace { -int PrintHelp() { - std::cerr << R"( -Usage: parquet-dump-footer - -h|--help Print help and exit - --no-scrub Do not scrub potentially PII metadata - --json Output JSON instead of binary - --in Input file: required - --out Output file: defaults to stdout - - Dumps the footer of a Parquet file to stdout or a file, optionally with - potentially user specific metadata scrubbed. -)"; - return 1; -} - uint32_t ReadLE32(const void* p) { - uint32_t x = arrow::util::SafeLoadAs(static_cast(p)); - return arrow::bit_util::FromLittleEndian(x); + uint32_t x = ::arrow::util::SafeLoadAs(static_cast(p)); + return ::arrow::bit_util::FromLittleEndian(x); } void AppendLE32(uint32_t v, std::string* out) { - v = arrow::bit_util::ToLittleEndian(v); + v = ::arrow::bit_util::ToLittleEndian(v); out->append(reinterpret_cast(&v), sizeof(v)); } @@ -90,24 +76,9 @@ bool Serialize(const T& obj, std::string* out) { // Replace the contents of s with random data of the same length. void Scrub(std::string* s) { - static char pool[4096]; static std::mt19937 rng(std::random_device{}()); - static const bool kPoolInit = [] { - std::uniform_int_distribution<> caps(65, 90); - for (size_t i = 0; i < sizeof(pool); i++) pool[i] = caps(rng); - return true; - }(); - (void)kPoolInit; - - const size_t n = s->size(); - s->clear(); - while (s->size() < n) { - // To avoid repeating patterns we start somewhere up to halfway through the pool and - // append up to half the pool chars. - std::uniform_int_distribution half(0, sizeof(pool) / 2 - 1); - size_t m = std::min(n, half(rng) + 1 /* at least one */); - s->append(&pool[half(rng)], m); - } + std::uniform_int_distribution<> caps(65, 90); + for (auto& c : *s) c = caps(rng); } void Scrub(parquet::format::FileMetaData* md) { @@ -162,29 +133,9 @@ int64_t ParseFooter(const std::string& tail, parquet::format::FileMetaData* md) } } // namespace -int main(int argc, char** argv) { - bool help = false; - bool scrub = true; - bool json = false; - std::string in; - std::string out; - for (int i = 1; i < argc; i++) { - char* arg = argv[i]; - help |= !std::strcmp(arg, "-h") || !std::strcmp(arg, "--help"); - scrub &= !!std::strcmp(arg, "--no-scrub"); - json |= !std::strcmp(arg, "--json"); - if (!std::strcmp(arg, "--in")) { - if (i + 1 >= argc) return PrintHelp(); - in = argv[++i]; - } - if (!std::strcmp(arg, "--out")) { - if (i + 1 >= argc) return PrintHelp(); - out = argv[++i]; - } - } - if (help || in.empty()) return PrintHelp(); +int DoIt(std::string in, bool scrub, bool json, std::string out) { std::string path; - auto fs = arrow::fs::FileSystemFromUriOrPath(in, &path).ValueOrDie(); + auto fs = ::arrow::fs::FileSystemFromUriOrPath(in, &path).ValueOrDie(); auto file = fs->OpenInputFile(path).ValueOrDie(); int64_t file_len = file->GetSize().ValueOrDie(); if (file_len < 8) { @@ -246,3 +197,44 @@ int main(int argc, char** argv) { return 0; } +} // namespace parquet + +static int PrintHelp() { + std::cerr << R"( +Usage: parquet-dump-footer + -h|--help Print help and exit + --no-scrub Do not scrub potentially user specific metadata + --json Output JSON instead of binary + --in Input file: required + --out Output file: defaults to stdout + + Dumps the footer of a Parquet file to stdout or a file, optionally with + potentially user specific metadata scrubbed. +)"; + return 1; +} + +int main(int argc, char** argv) { + bool help = false; + bool scrub = true; + bool json = false; + std::string in; + std::string out; + for (int i = 1; i < argc; i++) { + char* arg = argv[i]; + help |= !std::strcmp(arg, "-h") || !std::strcmp(arg, "--help"); + scrub &= !!std::strcmp(arg, "--no-scrub"); + json |= !std::strcmp(arg, "--json"); + if (!std::strcmp(arg, "--in")) { + if (i + 1 >= argc) return PrintHelp(); + in = argv[++i]; + } + if (!std::strcmp(arg, "--out")) { + if (i + 1 >= argc) return PrintHelp(); + out = argv[++i]; + } + } + if (help || in.empty()) return PrintHelp(); + + return parquet::DoIt(in, scrub, json, out); +} From 292f64015f392042d46bdf53c286ad1549cfca56 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Thu, 18 Jul 2024 15:33:30 +0000 Subject: [PATCH 18/26] . --- cpp/tools/parquet/parquet_dump_footer.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc index ac4351be3f3f4..b90cdb83babc5 100644 --- a/cpp/tools/parquet/parquet_dump_footer.cc +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -203,13 +203,13 @@ static int PrintHelp() { std::cerr << R"( Usage: parquet-dump-footer -h|--help Print help and exit - --no-scrub Do not scrub potentially user specific metadata + --no-scrub Do not scrub potentially confidential metadata --json Output JSON instead of binary --in Input file: required --out Output file: defaults to stdout Dumps the footer of a Parquet file to stdout or a file, optionally with - potentially user specific metadata scrubbed. + potentially confidential metadata scrubbed. )"; return 1; } From 25ae725f8a7b223ca0bceaa688959ad50cc1c7c9 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Fri, 19 Jul 2024 11:34:58 +0000 Subject: [PATCH 19/26] move code into parquet API --- cpp/src/parquet/metadata.cc | 63 ++++++++++ cpp/src/parquet/metadata.h | 7 ++ cpp/tools/parquet/CMakeLists.txt | 13 +- cpp/tools/parquet/parquet_dump_footer.cc | 149 +++-------------------- 4 files changed, 89 insertions(+), 143 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index ee8391818962c..c01cb6a5e9f4b 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include #include @@ -29,6 +31,7 @@ #include "arrow/io/memory.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" +#include "arrow/util/pcg_random.h" #include "parquet/encryption/encryption_internal.h" #include "parquet/encryption/internal_file_decryptor.h" #include "parquet/exception.h" @@ -599,6 +602,47 @@ std::vector RowGroupMetaData::sorting_columns() const { return impl_->sorting_columns(); } +static void Scrub(std::string* s) { + static ::arrow::random::pcg64 rng; + std::uniform_int_distribution<> caps(65, 90); + for (auto& c : *s) c = caps(rng); +} + +static void Scrub(format::FileMetaData* md) { + for (auto& s : md->schema) { + Scrub(&s.name); + } + for (auto& r : md->row_groups) { + for (auto& c : r.columns) { + Scrub(&c.file_path); + if (c.__isset.meta_data) { + auto& m = c.meta_data; + for (auto& p : m.path_in_schema) Scrub(&p); + for (auto& kv : m.key_value_metadata) { + Scrub(&kv.key); + Scrub(&kv.value); + } + Scrub(&m.statistics.max_value); + Scrub(&m.statistics.min_value); + Scrub(&m.statistics.min); + Scrub(&m.statistics.max); + } + + if (c.crypto_metadata.__isset.ENCRYPTION_WITH_COLUMN_KEY) { + auto& m = c.crypto_metadata.ENCRYPTION_WITH_COLUMN_KEY; + for (auto& p : m.path_in_schema) Scrub(&p); + Scrub(&m.key_metadata); + } + Scrub(&c.encrypted_column_metadata); + } + } + for (auto& kv : md->key_value_metadata) { + Scrub(&kv.key); + Scrub(&kv.value); + } + Scrub(&md->footer_signing_key_metadata); +} + // file metadata class FileMetaData::FileMetaDataImpl { public: @@ -821,6 +865,21 @@ class FileMetaData::FileMetaDataImpl { return out; } + std::string SerializeUnencrypted(bool scrub, bool json) const { + auto md = *metadata_; + if (scrub) Scrub(&md); + if (json) { + std::ostringstream ss; + md.printTo(ss); + return ss.str(); + } else { + ThriftSerializer serializer; + std::string out; + serializer.SerializeToString(&md, &out); + return out; + } + } + void set_file_decryptor(std::shared_ptr file_decryptor) { file_decryptor_ = std::move(file_decryptor); } @@ -992,6 +1051,10 @@ std::shared_ptr FileMetaData::Subset( return impl_->Subset(row_groups); } +std::string FileMetaData::SerializeUnencrypted(bool scrub, bool json) const { + return impl_->SerializeUnencrypted(scrub, json); +} + void FileMetaData::WriteTo(::arrow::io::OutputStream* dst, const std::shared_ptr& encryptor) const { return impl_->WriteTo(dst, encryptor); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 9fc30df58e0d3..47b74ac1719a1 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -396,6 +396,13 @@ class PARQUET_EXPORT FileMetaData { /// FileMetaData. std::shared_ptr Subset(const std::vector& row_groups) const; + /// \brief Serializes metadata unencrypted to a string. + /// + /// \param[in] scrub removes sensitive information from the metadata. + /// \param[in] json indicates if the metadata should be serialized as JSON, otherwise + /// thrift. + std::string SerializeUnencrypted(bool scrub, bool json) const; + private: friend FileMetaDataBuilder; friend class SerializedFile; diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index b56337cc36571..4c4f740069a67 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -16,7 +16,7 @@ # under the License. if(PARQUET_BUILD_EXECUTABLES) - set(PARQUET_TOOLS parquet-dump-schema parquet-reader parquet-scan) + set(PARQUET_TOOLS parquet-dump-schema parquet-reader parquet-scan parquet-dump-footer) foreach(TOOL ${PARQUET_TOOLS}) string(REGEX REPLACE "-" "_" TOOL_SOURCE ${TOOL}) @@ -32,16 +32,5 @@ if(PARQUET_BUILD_EXECUTABLES) RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endforeach(TOOL) - # Only build parquet-dump-footer when statically linking. - if(ARROW_BUILD_SHARED) - - else() - add_executable(parquet-dump-footer parquet-dump-footer.cc) - target_link_libraries(parquet-dump-footer parquet_static thrift::thrift) - set_target_properties(parquet-dump-footer PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) - install(TARGETS parquet-dump-footer ${INSTALL_IS_OPTIONAL} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) - endif() - add_dependencies(parquet ${PARQUET_TOOLS}) endif() diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc index b90cdb83babc5..749b6ec03816b 100644 --- a/cpp/tools/parquet/parquet_dump_footer.cc +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -17,25 +17,14 @@ #include #include -#include #include #include -#include #include -#include - -#include -#include #include "arrow/filesystem/filesystem.h" #include "arrow/util/endian.h" #include "arrow/util/ubsan.h" -#include "generated/parquet_types.h" -#include "parquet/thrift_internal.h" - -using apache::thrift::protocol::TCompactProtocol; -using apache::thrift::transport::TMemoryBuffer; -using apache::thrift::transport::TTransport; +#include "parquet/metadata.h" namespace parquet { namespace { @@ -49,90 +38,6 @@ void AppendLE32(uint32_t v, std::string* out) { out->append(reinterpret_cast(&v), sizeof(v)); } -template -bool Deserialize(const char* data, uint32_t len, T* obj) { - parquet::ThriftDeserializer des(/*string_size_limit=*/1 << 30, - /*container_size_limit=*/1 << 30); - try { - des.DeserializeMessage(reinterpret_cast(data), &len, obj); - return true; - } catch (const std::exception& e) { - std::cerr << "Failed to deserialize: " << e.what() << "\n"; - return false; - } -} - -template -bool Serialize(const T& obj, std::string* out) { - parquet::ThriftSerializer ser(/*initial_buffer_size=*/10 << 20); - try { - ser.SerializeToString(&obj, out); - return true; - } catch (const std::exception& e) { - std::cerr << "Failed to serialize: " << e.what() << "\n"; - return false; - } -} - -// Replace the contents of s with random data of the same length. -void Scrub(std::string* s) { - static std::mt19937 rng(std::random_device{}()); - std::uniform_int_distribution<> caps(65, 90); - for (auto& c : *s) c = caps(rng); -} - -void Scrub(parquet::format::FileMetaData* md) { - for (auto& s : md->schema) { - Scrub(&s.name); - } - for (auto& r : md->row_groups) { - for (auto& c : r.columns) { - Scrub(&c.file_path); - if (c.__isset.meta_data) { - auto& m = c.meta_data; - for (auto& p : m.path_in_schema) Scrub(&p); - for (auto& kv : m.key_value_metadata) { - Scrub(&kv.key); - Scrub(&kv.value); - } - Scrub(&m.statistics.max_value); - Scrub(&m.statistics.min_value); - Scrub(&m.statistics.min); - Scrub(&m.statistics.max); - } - - if (c.crypto_metadata.__isset.ENCRYPTION_WITH_COLUMN_KEY) { - auto& m = c.crypto_metadata.ENCRYPTION_WITH_COLUMN_KEY; - for (auto& p : m.path_in_schema) Scrub(&p); - Scrub(&m.key_metadata); - } - Scrub(&c.encrypted_column_metadata); - } - } - for (auto& kv : md->key_value_metadata) { - Scrub(&kv.key); - Scrub(&kv.value); - } - Scrub(&md->footer_signing_key_metadata); -} - -// Returns: -// - 0 on success -// - -1 on error -// - the size of the footer if tail is too small -int64_t ParseFooter(const std::string& tail, parquet::format::FileMetaData* md) { - if (tail.size() > std::numeric_limits::max()) return -1; - - const char* p = tail.data(); - const int32_t n = static_cast(tail.size()); - int32_t len = ReadLE32(p + n - 8); - if (len > n - 8) return len; - - if (!Deserialize(tail.data() + n - 8 - len, len, md)) return -1; - return 0; -} -} // namespace - int DoIt(std::string in, bool scrub, bool json, std::string out) { std::string path; auto fs = ::arrow::fs::FileSystemFromUriOrPath(in, &path).ValueOrDie(); @@ -147,56 +52,38 @@ int DoIt(std::string in, bool scrub, bool json, std::string out) { tail.resize(tail_len); char* data = tail.data(); file->ReadAt(file_len - tail_len, tail_len, data).ValueOrDie(); - if (ReadLE32(data + tail_len - 4) != ReadLE32("PAR1")) { + if (auto magic = ReadLE32(data + tail_len - 4); magic != ReadLE32("PAR1")) { std::cerr << "Not a Parquet file: " << in << "\n"; return 4; } - parquet::format::FileMetaData md; - int64_t res = ParseFooter(tail, &md); - if (res < 0) { - std::cerr << "Failed to parse footer: " << in << "\n"; - return 5; - } else if (res > 0) { - if (res > file_len) { + uint32_t metadata_len = ReadLE32(data + tail_len - 8); + if (metadata_len > tail_len - 8) { + if (metadata_len > file_len) { std::cerr << "File too short: " << in << "\n"; - return 6; + return 5; } - tail_len = res + 8; + tail_len = metadata_len + 8; tail.resize(tail_len); data = tail.data(); file->ReadAt(file_len - tail_len, tail_len, data).ValueOrDie(); } - if (ParseFooter(tail, &md) != 0) { - std::cerr << "Failed to parse footer: " << in << "\n"; - return 7; - } - - if (scrub) Scrub(&md); - - std::optional fout; - if (json) { - if (!out.empty()) fout.emplace(out, std::ios::out); - std::ostream& os = fout ? *fout : std::cout; - md.printTo(os); - } else { - if (!out.empty()) fout.emplace(out, std::ios::out | std::ios::binary); - std::ostream& os = fout ? *fout : std::cout; - if (!os) { - std::cerr << "Failed to open output file: " << out << "\n"; - return 8; - } - std::string ser; - if (!Serialize(md, &ser)) return 6; + auto md = FileMetaData::Make(tail.data(), &metadata_len); + std::string ser = md->SerializeUnencrypted(scrub, json); + if (!json) { AppendLE32(static_cast(ser.size()), &ser); ser.append("PAR1", 4); - if (!os.write(ser.data(), ser.size())) { - std::cerr << "Failed to write to output file: " << out << "\n"; - return 9; - } + } + std::optional fout; + if (!out.empty()) fout.emplace(out, std::ios::out); + std::ostream& os = fout ? *fout : std::cout; + if (!os.write(ser.data(), ser.size())) { + std::cerr << "Failed to write to output file: " << out << "\n"; + return 6; } return 0; } +} // namespace } // namespace parquet static int PrintHelp() { From 0b13e98993d877b2be9f53d8377e21cb545d0366 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Fri, 19 Jul 2024 12:11:34 +0000 Subject: [PATCH 20/26] . --- cpp/tools/parquet/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index 4c4f740069a67..0b63bbfb4ea53 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -31,6 +31,7 @@ if(PARQUET_BUILD_EXECUTABLES) install(TARGETS ${TOOL} ${INSTALL_IS_OPTIONAL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endforeach(TOOL) + target_link_libraries(parquet-dump-footer ${ARROW_LIBRARIES}) add_dependencies(parquet ${PARQUET_TOOLS}) endif() From 644672109089d4bd9b506b8a3347784e043d1faa Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 22 Jul 2024 14:55:58 +0200 Subject: [PATCH 21/26] Fix reading small footers, improve option parsing --- cpp/tools/parquet/parquet_dump_footer.cc | 31 +++++++++++++++--------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc index 749b6ec03816b..fa5124e897b07 100644 --- a/cpp/tools/parquet/parquet_dump_footer.cc +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -47,6 +47,7 @@ int DoIt(std::string in, bool scrub, bool json, std::string out) { std::cerr << "File too short: " << in << "\n"; return 3; } + // Do a first opportunistic read of up to 1 MiB to try and get the entire footer int64_t tail_len = std::min(file_len, int64_t{1} << 20); std::string tail; tail.resize(tail_len); @@ -58,6 +59,7 @@ int DoIt(std::string in, bool scrub, bool json, std::string out) { } uint32_t metadata_len = ReadLE32(data + tail_len - 8); if (metadata_len > tail_len - 8) { + // The footer is larger than the initial read, read again the exact size if (metadata_len > file_len) { std::cerr << "File too short: " << in << "\n"; return 5; @@ -66,6 +68,9 @@ int DoIt(std::string in, bool scrub, bool json, std::string out) { tail.resize(tail_len); data = tail.data(); file->ReadAt(file_len - tail_len, tail_len, data).ValueOrDie(); + } else { + // Keep the footer + the magic bytes + tail = tail.substr(tail_len - (metadata_len + 8)); } auto md = FileMetaData::Make(tail.data(), &metadata_len); std::string ser = md->SerializeUnencrypted(scrub, json); @@ -92,36 +97,40 @@ Usage: parquet-dump-footer -h|--help Print help and exit --no-scrub Do not scrub potentially confidential metadata --json Output JSON instead of binary - --in Input file: required - --out Output file: defaults to stdout + --in Input file (required): must be an URI or an absolute local path + --out Output file (optional, default stdout) - Dumps the footer of a Parquet file to stdout or a file, optionally with + Dump the footer of a Parquet file to stdout or to a file, optionally with potentially confidential metadata scrubbed. )"; return 1; } int main(int argc, char** argv) { - bool help = false; bool scrub = true; bool json = false; std::string in; std::string out; for (int i = 1; i < argc; i++) { char* arg = argv[i]; - help |= !std::strcmp(arg, "-h") || !std::strcmp(arg, "--help"); - scrub &= !!std::strcmp(arg, "--no-scrub"); - json |= !std::strcmp(arg, "--json"); - if (!std::strcmp(arg, "--in")) { + if (!std::strcmp(arg, "-h") || !std::strcmp(arg, "--help")) { + return PrintHelp(); + } else if (!std::strcmp(arg, "--no-scrub")) { + scrub = false; + } else if (!std::strcmp(arg, "--json")) { + json = true; + } else if (!std::strcmp(arg, "--in")) { if (i + 1 >= argc) return PrintHelp(); in = argv[++i]; - } - if (!std::strcmp(arg, "--out")) { + } else if (!std::strcmp(arg, "--out")) { if (i + 1 >= argc) return PrintHelp(); out = argv[++i]; + } else { + // Unknown option + return PrintHelp(); } } - if (help || in.empty()) return PrintHelp(); + if (in.empty()) return PrintHelp(); return parquet::DoIt(in, scrub, json, out); } From f6862f900cab39860429d6a0b6816956348e7452 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 22 Jul 2024 15:03:44 +0200 Subject: [PATCH 22/26] Actually generate JSON --- cpp/src/parquet/metadata.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index c01cb6a5e9f4b..e0230b729b363 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -39,6 +39,9 @@ #include "parquet/schema_internal.h" #include "parquet/thrift_internal.h" +// Include this after thrift_internal.h +#include + namespace parquet { const ApplicationVersion& ApplicationVersion::PARQUET_251_FIXED_VERSION() { @@ -869,9 +872,7 @@ class FileMetaData::FileMetaDataImpl { auto md = *metadata_; if (scrub) Scrub(&md); if (json) { - std::ostringstream ss; - md.printTo(ss); - return ss.str(); + return apache::thrift::ThriftJSONString(md); } else { ThriftSerializer serializer; std::string out; From 2bdd2358a950a7bd2ec324f1ccb1bf108e082b4c Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 22 Jul 2024 15:28:09 +0200 Subject: [PATCH 23/26] Minor nits --- cpp/src/parquet/metadata.cc | 2 ++ cpp/src/parquet/metadata.h | 8 ++++---- cpp/tools/parquet/CMakeLists.txt | 2 +- cpp/tools/parquet/parquet_dump_footer.cc | 3 +-- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index e0230b729b363..30d8790433db6 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -605,12 +605,14 @@ std::vector RowGroupMetaData::sorting_columns() const { return impl_->sorting_columns(); } +// Replace string data with random-generated uppercase characters static void Scrub(std::string* s) { static ::arrow::random::pcg64 rng; std::uniform_int_distribution<> caps(65, 90); for (auto& c : *s) c = caps(rng); } +// Replace potentially sensitive metadata with random data static void Scrub(format::FileMetaData* md) { for (auto& s : md->schema) { Scrub(&s.name); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 47b74ac1719a1..ae00688f2bdcc 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -396,11 +396,11 @@ class PARQUET_EXPORT FileMetaData { /// FileMetaData. std::shared_ptr Subset(const std::vector& row_groups) const; - /// \brief Serializes metadata unencrypted to a string. + /// \brief Serialize metadata unencrypted as string /// - /// \param[in] scrub removes sensitive information from the metadata. - /// \param[in] json indicates if the metadata should be serialized as JSON, otherwise - /// thrift. + /// \param[in] scrub whether to remove sensitive information from the metadata. + /// \param[in] json whether to serialize the metadata as JSON (if true), otherwise + /// as Thrift (if false). std::string SerializeUnencrypted(bool scrub, bool json) const; private: diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index 0b63bbfb4ea53..e05645da28a0e 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -16,7 +16,7 @@ # under the License. if(PARQUET_BUILD_EXECUTABLES) - set(PARQUET_TOOLS parquet-dump-schema parquet-reader parquet-scan parquet-dump-footer) + set(PARQUET_TOOLS parquet-dump-footer parquet-dump-schema parquet-reader parquet-scan) foreach(TOOL ${PARQUET_TOOLS}) string(REGEX REPLACE "-" "_" TOOL_SOURCE ${TOOL}) diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc index fa5124e897b07..4a7cc5b4e63b0 100644 --- a/cpp/tools/parquet/parquet_dump_footer.cc +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -92,8 +92,7 @@ int DoIt(std::string in, bool scrub, bool json, std::string out) { } // namespace parquet static int PrintHelp() { - std::cerr << R"( -Usage: parquet-dump-footer + std::cerr << R"(Usage: parquet-dump-footer -h|--help Print help and exit --no-scrub Do not scrub potentially confidential metadata --json Output JSON instead of binary From 4619a85bff91b12eaade2e4c21d338393d422dac Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Mon, 22 Jul 2024 13:52:04 +0000 Subject: [PATCH 24/26] Revert "Actually generate JSON" This reverts commit f6862f900cab39860429d6a0b6816956348e7452. --- cpp/src/parquet/metadata.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 30d8790433db6..e9023f75a7b1f 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -39,9 +39,6 @@ #include "parquet/schema_internal.h" #include "parquet/thrift_internal.h" -// Include this after thrift_internal.h -#include - namespace parquet { const ApplicationVersion& ApplicationVersion::PARQUET_251_FIXED_VERSION() { @@ -874,7 +871,9 @@ class FileMetaData::FileMetaDataImpl { auto md = *metadata_; if (scrub) Scrub(&md); if (json) { - return apache::thrift::ThriftJSONString(md); + std::ostringstream ss; + md.printTo(ss); + return ss.str(); } else { ThriftSerializer serializer; std::string out; From 44f9768e8619c4011cc99ca859989fcd2fbba360 Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Mon, 22 Jul 2024 13:57:31 +0000 Subject: [PATCH 25/26] name the option to dump text as --debug instead of --json --- cpp/src/parquet/metadata.cc | 4 ++-- cpp/src/parquet/metadata.h | 6 +++--- cpp/tools/parquet/parquet_dump_footer.cc | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index e9023f75a7b1f..7bab9104619ce 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -867,10 +867,10 @@ class FileMetaData::FileMetaDataImpl { return out; } - std::string SerializeUnencrypted(bool scrub, bool json) const { + std::string SerializeUnencrypted(bool scrub, bool debug) const { auto md = *metadata_; if (scrub) Scrub(&md); - if (json) { + if (debug) { std::ostringstream ss; md.printTo(ss); return ss.str(); diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index ae00688f2bdcc..e02d2e7c852f0 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -399,9 +399,9 @@ class PARQUET_EXPORT FileMetaData { /// \brief Serialize metadata unencrypted as string /// /// \param[in] scrub whether to remove sensitive information from the metadata. - /// \param[in] json whether to serialize the metadata as JSON (if true), otherwise - /// as Thrift (if false). - std::string SerializeUnencrypted(bool scrub, bool json) const; + /// \param[in] debug whether to serialize the metadata as Thrift (if false) or + /// debug text (if true). + std::string SerializeUnencrypted(bool scrub, bool debug) const; private: friend FileMetaDataBuilder; diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc index 4a7cc5b4e63b0..fe95541c35ad4 100644 --- a/cpp/tools/parquet/parquet_dump_footer.cc +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -95,7 +95,7 @@ static int PrintHelp() { std::cerr << R"(Usage: parquet-dump-footer -h|--help Print help and exit --no-scrub Do not scrub potentially confidential metadata - --json Output JSON instead of binary + --debug Output text represenation of footer for inspection --in Input file (required): must be an URI or an absolute local path --out Output file (optional, default stdout) From c8ec99bcd16c68e9de337e524c8f2f8bb44bb77b Mon Sep 17 00:00:00 2001 From: Alkis Evlogimenos Date: Mon, 22 Jul 2024 14:05:46 +0000 Subject: [PATCH 26/26] invert the condition to make it more readable and ends comments with fullstop --- cpp/tools/parquet/parquet_dump_footer.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/tools/parquet/parquet_dump_footer.cc b/cpp/tools/parquet/parquet_dump_footer.cc index fe95541c35ad4..c7a4b78fdd823 100644 --- a/cpp/tools/parquet/parquet_dump_footer.cc +++ b/cpp/tools/parquet/parquet_dump_footer.cc @@ -47,7 +47,7 @@ int DoIt(std::string in, bool scrub, bool json, std::string out) { std::cerr << "File too short: " << in << "\n"; return 3; } - // Do a first opportunistic read of up to 1 MiB to try and get the entire footer + // First do an opportunistic read of up to 1 MiB to try and get the entire footer. int64_t tail_len = std::min(file_len, int64_t{1} << 20); std::string tail; tail.resize(tail_len); @@ -58,8 +58,11 @@ int DoIt(std::string in, bool scrub, bool json, std::string out) { return 4; } uint32_t metadata_len = ReadLE32(data + tail_len - 8); - if (metadata_len > tail_len - 8) { - // The footer is larger than the initial read, read again the exact size + if (tail_len >= metadata_len + 8) { + // The footer is entirely in the initial read. Trim to size. + tail = tail.substr(tail_len - (metadata_len + 8)); + } else { + // The footer is larger than the initial read, read again the exact size. if (metadata_len > file_len) { std::cerr << "File too short: " << in << "\n"; return 5; @@ -68,9 +71,6 @@ int DoIt(std::string in, bool scrub, bool json, std::string out) { tail.resize(tail_len); data = tail.data(); file->ReadAt(file_len - tail_len, tail_len, data).ValueOrDie(); - } else { - // Keep the footer + the magic bytes - tail = tail.substr(tail_len - (metadata_len + 8)); } auto md = FileMetaData::Make(tail.data(), &metadata_len); std::string ser = md->SerializeUnencrypted(scrub, json); @@ -125,7 +125,7 @@ int main(int argc, char** argv) { if (i + 1 >= argc) return PrintHelp(); out = argv[++i]; } else { - // Unknown option + // Unknown option. return PrintHelp(); } }