From 3c1f0ad1c44b53d6c17fd7c1d578ec61616b5661 Mon Sep 17 00:00:00 2001 From: Devin Gibson Date: Wed, 3 Jul 2024 12:03:59 -0400 Subject: [PATCH] clp-s: Correctly report uncompressed size of archives during archive-splitting (fixes #469). (#463) --- components/core/src/clp_s/JsonFileIterator.cpp | 10 ++++++++++ components/core/src/clp_s/JsonFileIterator.hpp | 8 ++++++++ components/core/src/clp_s/JsonParser.cpp | 12 +++++++----- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp index 5fffcc8f9..ad6d16cd0 100644 --- a/components/core/src/clp_s/JsonFileIterator.cpp +++ b/components/core/src/clp_s/JsonFileIterator.cpp @@ -156,4 +156,14 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i } while (read_new_json()); return false; } + +size_t JsonFileIterator::get_num_bytes_consumed() { + // If there are more documents left in the current buffer, account for how much of the + // buffer has been consumed; otherwise report the total number of bytes read so that we + // capture trailing whitespace. + if (m_doc_it != m_stream.end()) { + return m_bytes_read - (m_buf_occupied - m_next_document_position); + } + return m_bytes_read; +} } // namespace clp_s diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp index 51422963a..b8db3f4f2 100644 --- a/components/core/src/clp_s/JsonFileIterator.hpp +++ b/components/core/src/clp_s/JsonFileIterator.hpp @@ -51,6 +51,14 @@ class JsonFileIterator { */ [[nodiscard]] size_t get_num_bytes_read() const { return m_bytes_read; } + /** + * Note: this method cannot be const because checking if a simdjson iterator is at the end + * of a document stream is non-const. 
+ * + * @return total number of bytes consumed from the file via get_json + */ + [[nodiscard]] size_t get_num_bytes_consumed(); + /** * @return the last error code encountered when iterating over the json file */ diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index d73643a64..26ec3d7b4 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -439,7 +439,7 @@ bool JsonParser::parse() { simdjson::ondemand::document_stream::iterator json_it; m_num_messages = 0; - size_t last_num_bytes_read = 0; + size_t last_num_bytes_consumed = 0; while (json_file_iterator.get_json(json_it)) { m_current_schema.clear(); @@ -463,9 +463,11 @@ bool JsonParser::parse() { ->append_message(current_schema_id, m_current_schema, m_current_parsed_message); if (m_archive_writer->get_data_size() >= m_target_encoded_size) { - size_t num_bytes_read = json_file_iterator.get_num_bytes_read(); - m_archive_writer->increment_uncompressed_size(num_bytes_read - last_num_bytes_read); - last_num_bytes_read = num_bytes_read; + size_t num_bytes_read = json_file_iterator.get_num_bytes_consumed(); + m_archive_writer->increment_uncompressed_size( + num_bytes_read - last_num_bytes_consumed + ); + last_num_bytes_consumed = num_bytes_read; split_archive(); } @@ -473,7 +475,7 @@ bool JsonParser::parse() { } m_archive_writer->increment_uncompressed_size( - json_file_iterator.get_num_bytes_read() - last_num_bytes_read + json_file_iterator.get_num_bytes_read() - last_num_bytes_consumed ); if (simdjson::error_code::SUCCESS != json_file_iterator.get_error()) {