From 9506088c0bd8db92ab69a31dea499c2bdc786290 Mon Sep 17 00:00:00 2001
From: Abigail Matthews <matthea@clarkson.edu>
Date: Mon, 16 Sep 2024 11:52:14 -0400
Subject: [PATCH 01/15] ir->archive->json working

---
 components/core/CMakeLists.txt                |   6 +
 components/core/src/clp_s/CMakeLists.txt      |  28 ++
 .../core/src/clp_s/CommandLineArguments.cpp   | 272 +++++++++++++-
 .../core/src/clp_s/CommandLineArguments.hpp   |   8 +-
 components/core/src/clp_s/JsonParser.cpp      | 332 ++++++++++++++++++
 components/core/src/clp_s/JsonParser.hpp      |  51 +++
 components/core/src/clp_s/clp-s.cpp           |  68 +++-
 7 files changed, 761 insertions(+), 4 deletions(-)

diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt
index c4f84570c..2be40ac75 100644
--- a/components/core/CMakeLists.txt
+++ b/components/core/CMakeLists.txt
@@ -266,6 +266,12 @@ set(SOURCE_FILES_clp_s_unitTest
     src/clp_s/TimestampPattern.hpp
     src/clp_s/Utils.cpp
     src/clp_s/Utils.hpp
+    src/clp_s/ZstdCompressor.hpp
+    src/clp_s/ZstdCompressor.cpp
+    src/clp_s/ZstdDecompressor.hpp
+    src/clp_s/ZstdDecompressor.cpp
+    src/clp_s/FileWriter.cpp
+    src/clp_s/FileReader.cpp
 )
 
 set(SOURCE_FILES_unitTest
diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt
index c8cf08b22..456f53c20 100644
--- a/components/core/src/clp_s/CMakeLists.txt
+++ b/components/core/src/clp_s/CMakeLists.txt
@@ -28,6 +28,34 @@ set(
         ../clp/TraceableException.hpp
         ../clp/WriterInterface.cpp
         ../clp/WriterInterface.hpp
+        ../clp/ffi/ir_stream/Deserializer.hpp
+        ../clp/ffi/ir_stream/Deserializer.cpp
+        ../clp/BufferReader.hpp
+        ../clp/BufferReader.cpp
+        ../clp/type_utils.hpp
+        ../clp/ffi/Value.hpp
+        ../clp/ErrorCode.hpp
+        ../clp/ir/EncodedTextAst.hpp
+        ../clp/ir/EncodedTextAst.cpp
+        ../clp/ir/types.hpp
+        ../clp/ReaderInterface.hpp
+        ../clp/ReaderInterface.cpp
+        ../clp/time_types.hpp
+        ../clp/type_utils.hpp
+        ../clp/ffi/KeyValuePairLogEvent.hpp
+        ../clp/ffi/KeyValuePairLogEvent.cpp
+        ../clp/ffi/SchemaTree.hpp
+        ../clp/ffi/SchemaTree.cpp
+        ../clp/ffi/SchemaTreeNode.hpp
+        ../clp/ffi/Value.hpp
+        ../clp/ffi/ir_stream/decoding_methods.hpp
+        ../clp/ffi/ir_stream/protocol_constants.hpp
+        ../clp/ffi/ir_stream/utils.hpp
+        ../clp/ffi/ir_stream/decoding_methods.cpp
+        ../clp/ffi/utils.hpp
+        ../clp/ffi/utils.cpp
+        ../clp/utf8_utils.hpp
+        ../clp/utf8_utils.cpp
 )
 
 set(
diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp
index 4cfe017ac..06c319057 100644
--- a/components/core/src/clp_s/CommandLineArguments.cpp
+++ b/components/core/src/clp_s/CommandLineArguments.cpp
@@ -106,11 +106,15 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
                 std::cerr << "  c - compress" << std::endl;
                 std::cerr << "  x - decompress" << std::endl;
                 std::cerr << "  s - search" << std::endl;
+                std::cerr << "  r - JSON to IR Format" << std::endl;
+                std::cerr << "  i - compress IR format" << std::endl;
                 std::cerr << std::endl;
                 std::cerr << "Try "
                           << " c --help OR"
                           << " x --help OR"
-                          << " s --help for command-specific details." << std::endl;
+                          << " s --help OR"
+                          << " r --help OR"
+                          << " i --help for command-specific details." << std::endl;
 
                 po::options_description visible_options;
                 visible_options.add(general_options);
@@ -125,6 +129,8 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
             case (char)Command::Compress:
             case (char)Command::Extract:
             case (char)Command::Search:
+            case (char)Command::Json_To_IR:
+            case (char)Command::IR_Compress:
                 m_command = (Command)command_input;
                 break;
             default:
@@ -264,7 +270,259 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
 
                 m_metadata_db_config = std::move(metadata_db_config);
             }
-        } else if ((char)Command::Extract == command_input) {
+        } else if (Command::IR_Compress == m_command) {
+            po::options_description compression_positional_options;
+            // clang-format off
+             compression_positional_options.add_options()(
+                     "archives-dir",
+                     po::value<std::string>(&m_archives_dir)->value_name("DIR"),
+                     "output directory"
+             )(
+                     "input-paths",
+                     po::value<std::vector<std::string>>(&m_file_paths)->value_name("PATHS"),
+                     "input paths"
+             );
+            // clang-format on
+
+            po::options_description compression_options("Compression options");
+            std::string metadata_db_config_file_path;
+            std::string input_path_list_file_path;
+            // clang-format off
+            compression_options.add_options()(
+                    "compression-level",
+                    po::value<int>(&m_compression_level)->value_name("LEVEL")->
+                        default_value(m_compression_level),
+                    "1 (fast/low compression) to 9 (slow/high compression)."
+            )(
+                    "target-encoded-size",
+                    po::value<size_t>(&m_target_encoded_size)->value_name("TARGET_ENCODED_SIZE")->
+                        default_value(m_target_encoded_size),
+                    "Target size (B) for the dictionaries and encoded messages before a new "
+                    "archive is created."
+            )(
+                    "max-document-size",
+                    po::value<size_t>(&m_max_document_size)->value_name("DOC_SIZE")->
+                        default_value(m_max_document_size),
+                    "Maximum allowed size (B) for a single document before compression fails."
+            )(
+                    "timestamp-key",
+                    po::value<std::string>(&m_timestamp_key)->value_name("TIMESTAMP_COLUMN_KEY")->
+                        default_value(m_timestamp_key),
+                    "Path (e.g. x.y) for the field containing the log event's timestamp."
+            )(
+                    "db-config-file",
+                    po::value<std::string>(&metadata_db_config_file_path)->value_name("FILE")->
+                    default_value(metadata_db_config_file_path),
+                    "Global metadata DB YAML config"
+            )(
+                    "files-from,f",
+                    po::value<std::string>(&input_path_list_file_path)
+                            ->value_name("FILE")
+                            ->default_value(input_path_list_file_path),
+                    "Compress files specified in FILE"
+            )(
+                    "print-archive-stats",
+                    po::bool_switch(&m_print_archive_stats),
+                    "Print statistics (json) about the archive after it's compressed."
+            )/*(
+                    "structurize-arrays",
+                    po::bool_switch(&m_structurize_arrays),
+                    "Structurize arrays instead of compressing them as clp strings."
+            ) */;
+            // clang-format on
+
+            po::positional_options_description positional_options;
+            positional_options.add("archives-dir", 1);
+            positional_options.add("input-paths", -1);
+
+            po::options_description all_compression_options;
+            all_compression_options.add(compression_options);
+            all_compression_options.add(compression_positional_options);
+
+            std::vector<std::string> unrecognized_options
+                    = po::collect_unrecognized(parsed.options, po::include_positional);
+            unrecognized_options.erase(unrecognized_options.begin());
+            po::store(
+                    po::command_line_parser(unrecognized_options)
+                            .options(all_compression_options)
+                            .positional(positional_options)
+                            .run(),
+                    parsed_command_line_options
+            );
+            po::notify(parsed_command_line_options);
+
+            if (parsed_command_line_options.count("help")) {
+                print_IR_compression_usage();
+
+                /* std::cerr << "Examples:" << std::endl;
+                std::cerr << "  # Compress file1.json and dir1 into archives-dir" << std::endl;
+                std::cerr << "  " << m_program_name << " c archives-dir file1.json dir1"
+                          << std::endl; */
+
+                po::options_description visible_options;
+                visible_options.add(general_options);
+                visible_options.add(compression_options);
+                std::cerr << visible_options << '\n';
+                return ParsingResult::InfoCommand;
+            }
+
+            if (m_archives_dir.empty()) {
+                throw std::invalid_argument("No archives directory specified.");
+            }
+
+            if (false == input_path_list_file_path.empty()) {
+                if (false == read_paths_from_file(input_path_list_file_path, m_file_paths)) {
+                    SPDLOG_ERROR("Failed to read paths from {}", input_path_list_file_path);
+                    return ParsingResult::Failure;
+                }
+            }
+
+            if (m_file_paths.empty()) {
+                throw std::invalid_argument("No input paths specified.");
+            }
+
+            // Parse and validate global metadata DB config
+            if (false == metadata_db_config_file_path.empty()) {
+                clp::GlobalMetadataDBConfig metadata_db_config;
+                try {
+                    metadata_db_config.parse_config_file(metadata_db_config_file_path);
+                } catch (std::exception& e) {
+                    SPDLOG_ERROR("Failed to validate metadata database config - {}.", e.what());
+                    return ParsingResult::Failure;
+                }
+
+                if (clp::GlobalMetadataDBConfig::MetadataDBType::MySQL
+                    != metadata_db_config.get_metadata_db_type())
+                {
+                    SPDLOG_ERROR(
+                            "Invalid metadata database type for {}; only supported type is MySQL.",
+                            m_program_name
+                    );
+                    return ParsingResult::Failure;
+                }
+
+                m_metadata_db_config = std::move(metadata_db_config);
+            }
+        }else if ((char)Command::Json_To_IR == command_input) {
+            po::options_description compression_positional_options;
+            // clang-format off
+             compression_positional_options.add_options()(
+                     "ir-dir",
+                     po::value<std::string>(&m_archives_dir)->value_name("DIR"),
+                     "output directory"
+             )(
+                     "input-paths",
+                     po::value<std::vector<std::string>>(&m_file_paths)->value_name("PATHS"),
+                     "input paths"
+             );
+            // clang-format on
+
+            po::options_description compression_options("Compression options");
+            std::string metadata_db_config_file_path;
+            std::string input_path_list_file_path;
+            // clang-format off
+            compression_options.add_options()(
+                    "compression-level",
+                    po::value<int>(&m_compression_level)->value_name("LEVEL")->
+                        default_value(m_compression_level),
+                    "1 (fast/low compression) to 9 (slow/high compression)."
+            )(
+                    "max-document-size",
+                    po::value<size_t>(&m_max_document_size)->value_name("DOC_SIZE")->
+                        default_value(m_max_document_size),
+                    "Maximum allowed size (B) for a single document before ir generation fails."
+            )(
+                    "timestamp-key",
+                    po::value<std::string>(&m_timestamp_key)->value_name("TIMESTAMP_COLUMN_KEY")->
+                        default_value(m_timestamp_key),
+                    "Path (e.g. x.y) for the field containing the log event's timestamp."
+            )(
+                    "db-config-file",
+                    po::value<std::string>(&metadata_db_config_file_path)->value_name("FILE")->
+                    default_value(metadata_db_config_file_path),
+                    "Global metadata DB YAML config"
+            )(
+                    "files-from,f",
+                    po::value<std::string>(&input_path_list_file_path)
+                            ->value_name("FILE")
+                            ->default_value(input_path_list_file_path),
+                    "Compress files specified in FILE"
+            );
+            // clang-format on
+
+            po::positional_options_description positional_options;
+            positional_options.add("ir-dir", 1);
+            positional_options.add("input-paths", -1);
+
+            po::options_description all_compression_options;
+            all_compression_options.add(compression_options);
+            all_compression_options.add(compression_positional_options);
+
+            std::vector<std::string> unrecognized_options
+                    = po::collect_unrecognized(parsed.options, po::include_positional);
+            unrecognized_options.erase(unrecognized_options.begin());
+            po::store(
+                    po::command_line_parser(unrecognized_options)
+                            .options(all_compression_options)
+                            .positional(positional_options)
+                            .run(),
+                    parsed_command_line_options
+            );
+            po::notify(parsed_command_line_options);
+
+            if (parsed_command_line_options.count("help")) {
+                print_json_to_IR_usage();
+
+                /* std::cerr << "Examples:" << std::endl;
+                std::cerr << "  # Compress file1.json and dir1 into archives-dir" << std::endl;
+                std::cerr << "  " << m_program_name << " c archives-dir file1.json dir1"
+                          << std::endl; */
+
+                po::options_description visible_options;
+                visible_options.add(general_options);
+                visible_options.add(compression_options);
+                std::cerr << visible_options << '\n';
+                return ParsingResult::InfoCommand;
+            }
+
+            if (m_archives_dir.empty()) {
+                throw std::invalid_argument("No IRs directory specified.");
+            }
+
+            if (false == input_path_list_file_path.empty()) {
+                if (false == read_paths_from_file(input_path_list_file_path, m_file_paths)) {
+                    SPDLOG_ERROR("Failed to read paths from {}", input_path_list_file_path);
+                    return ParsingResult::Failure;
+                }
+            }
+
+            if (m_file_paths.empty()) {
+                throw std::invalid_argument("No input paths specified.");
+            }
+
+            // Parse and validate global metadata DB config
+            if (false == metadata_db_config_file_path.empty()) {
+                clp::GlobalMetadataDBConfig metadata_db_config;
+                try {
+                    metadata_db_config.parse_config_file(metadata_db_config_file_path);
+                } catch (std::exception& e) {
+                    SPDLOG_ERROR("Failed to validate metadata database config - {}.", e.what());
+                    return ParsingResult::Failure;
+                }
+
+                if (clp::GlobalMetadataDBConfig::MetadataDBType::MySQL
+                    != metadata_db_config.get_metadata_db_type())
+                {
+                    SPDLOG_ERROR(
+                            "Invalid metadata database type for {}; only supported type is MySQL.",
+                            m_program_name
+                    );
+                    return ParsingResult::Failure;
+                }
+
+                m_metadata_db_config = std::move(metadata_db_config);
+            }
+        }else if ((char)Command::Extract == command_input) {
             po::options_description extraction_options;
             // clang-format off
             extraction_options.add_options()(
@@ -786,4 +1044,14 @@ void CommandLineArguments::print_search_usage() const {
                  " [OUTPUT_HANDLER [OUTPUT_HANDLER_OPTIONS]]"
               << std::endl;
 }
+
+void CommandLineArguments::print_json_to_IR_usage() const {
+    // The output positional of the `r` command is "ir-dir", not an archives directory
+    std::cerr << "Usage: " << m_program_name << " r [OPTIONS] IR_DIR [FILE/DIR ...]" << std::endl;
+}
+
+void CommandLineArguments::print_IR_compression_usage() const {  // Usage for the `i` command
+    constexpr char const* cSynopsis = " i [OPTIONS] ARCHIVES_DIR [FILE/DIR ...]";
+    std::cerr << "Usage: " << m_program_name << cSynopsis << std::endl;
+}
 }  // namespace clp_s
diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp
index 0f3d8c556..60c480987 100644
--- a/components/core/src/clp_s/CommandLineArguments.hpp
+++ b/components/core/src/clp_s/CommandLineArguments.hpp
@@ -26,7 +26,9 @@ class CommandLineArguments {
     enum class Command : char {
         Compress = 'c',
         Extract = 'x',
-        Search = 's'
+        Search = 's',
+        Json_To_IR = 'r',  // "JSON to IR Format" in the command help
+        IR_Compress = 'i'  // "compress IR format" in the command help
     };
 
     enum class OutputHandlerType : uint8_t {
@@ -157,6 +159,10 @@ class CommandLineArguments {
 
     void print_search_usage() const;
 
+    void print_json_to_IR_usage() const;
+
+    void print_IR_compression_usage() const;
+
     // Variables
     std::string m_program_name;
     Command m_command;
diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp
index a68062958..a56387bfd 100644
--- a/components/core/src/clp_s/JsonParser.cpp
+++ b/components/core/src/clp_s/JsonParser.cpp
@@ -1,6 +1,7 @@
 #include "JsonParser.hpp"
 
 #include <iostream>
+#include <fstream>
 #include <stack>
 
 #include <simdjson.h>
@@ -37,6 +38,26 @@ JsonParser::JsonParser(JsonParserOption const& option)
     m_archive_writer->open(m_archive_options);
 }
 
+JsonParser::JsonParser(JsonToIRParserOption const& option)
+        : m_num_messages(0),
+          m_max_document_size(option.max_document_size) {
+    if (false == FileUtils::validate_path(option.file_paths)) {
+        exit(1);
+    }
+    // Expand every input path into the list of files to ingest
+    for (auto const& input_path : option.file_paths) {
+        FileUtils::find_all_files(input_path, m_file_paths);
+    }
+    // `irs_dir` is used as the archive output directory. print_archive_stats is left at its
+    // default, as JsonToIRParserOption has no such field.
+    m_archive_options.archives_dir = option.irs_dir;
+    m_archive_options.compression_level = option.compression_level;
+    m_archive_options.id = m_generator();
+
+    m_archive_writer = std::make_unique<ArchiveWriter>(option.metadata_db);
+    m_archive_writer->open(m_archive_options);
+}
+
 void JsonParser::parse_obj_in_array(ondemand::object line, int32_t parent_node_id) {
     ondemand::object_iterator it = line.begin();
     if (it == line.end()) {
@@ -520,6 +541,317 @@ bool JsonParser::parse() {
     return true;
 }
 
+/**
+ * Maps an IR schema-tree node's type, together with the value the current log event holds for
+ * that node, to the archive's NodeType:
+ *   - Int -> Integer
+ *   - Float -> Float
+ *   - Bool -> Boolean
+ *   - UnstructuredArray -> UnstructuredArray
+ *   - Str -> VarString when the value is a std::string, ClpString otherwise
+ *   - Obj -> NullValue when a value is present and is null, Object otherwise
+ *   - any other type -> Unknown
+ * @param node the IR schema-tree node
+ * @param p the (IR node id, optional value) pair; only `p.second` is inspected
+ * @return the archive node type for the pair
+ * @throws std::bad_optional_access if the node is a Str and `p.second` is empty
+ */
+NodeType get_archive_node_type(clp::ffi::SchemaTreeNode const& node, std::pair<clp::ffi::SchemaTreeNode::id_t, std::optional<clp::ffi::Value>> p){
+    auto const& value = p.second;
+
+    switch (node.get_type()) {
+        case clp::ffi::SchemaTreeNode::Type::Int:
+            return NodeType::Integer;
+        case clp::ffi::SchemaTreeNode::Type::Float:
+            return NodeType::Float;
+        case clp::ffi::SchemaTreeNode::Type::Bool:
+            return NodeType::Boolean;
+        case clp::ffi::SchemaTreeNode::Type::UnstructuredArray:
+            return NodeType::UnstructuredArray;
+        case clp::ffi::SchemaTreeNode::Type::Str:
+            // `value()` throws std::bad_optional_access when the pair carries no value
+            // TODO: maybe special case for date string
+            if (value.value().is<std::string>()) {
+                return NodeType::VarString;
+            }
+            return NodeType::ClpString;
+        case clp::ffi::SchemaTreeNode::Type::Obj:
+            if (false == value.has_value()) {
+                return NodeType::Object;
+            }
+            // An Obj node that holds an explicit null value maps to NullValue; any other Obj
+            // value maps to Object.
+            if (value.value().is_null()) {
+                return NodeType::NullValue;
+            }
+            return NodeType::Object;
+        default:
+            // TODO: determine whether structured arrays need handling
+            return NodeType::Unknown;
+    }
+}
+
+/**
+ * Returns the archive node id that corresponds to an IR node used with a given archive node
+ * type. On a cache miss the node is added to the archive (after resolving its parent, recursively,
+ * as an Object node) and the new id is cached.
+ * @param cache map from (IR node id, archive node type) to archive node id
+ * @param irNodeID id of the node in `irTree`
+ * @param archiveNodeType archive type to register the node under
+ * @param irTree the IR schema tree containing `irNodeID`
+ * @return the archive node id
+ */
+int JsonParser::get_archive_node_id(std::map< std::tuple<int, NodeType>, int>& cache, int irNodeID, NodeType archiveNodeType, clp::ffi::SchemaTree const& irTree){
+    std::tuple<int, NodeType> const key(irNodeID, archiveNodeType);
+    // One lookup on a hit, rather than find() followed by operator[]
+    if (auto const cached = cache.find(key); cache.end() != cached) {
+        return cached->second;
+    }
+    auto const& ir_node = irTree.get_node(irNodeID);
+    // A node whose IR parent id is 0 is attached to archive node 0
+    auto const ir_parent_id = ir_node.get_parent_id();
+    int const parent_node_id
+            = (0 == ir_parent_id) ? 0 : get_archive_node_id(cache, ir_parent_id, NodeType::Object, irTree);
+    // `value()` throws std::bad_optional_access when validation/escaping yields no string
+    std::string nodeKey = clp::ffi::validate_and_escape_utf8_string(ir_node.get_key_name()).value();
+    int const archive_node_id = m_archive_writer->add_node(parent_node_id, archiveNodeType, nodeKey);
+    cache.insert_or_assign(key, archive_node_id);
+    return archive_node_id;
+}
+
+void print_kv_log_event(KeyValuePairLogEvent const& kv){
+    // Debugging aid: prints each (node id, value) pair of `kv` to stdout (type line, then value)
+    auto const& id_value_pairs = kv.get_node_id_value_pairs();
+    std::cout << "number of kv pairs: " << id_value_pairs.size() << std::endl;
+    auto const& tree = kv.get_schema_tree();
+    for (auto const& [node_id, optional_value] : id_value_pairs) {
+        char const* type_name = "???";
+        switch (tree.get_node(node_id).get_type()) {
+            case clp::ffi::SchemaTreeNode::Type::Int: type_name = "Int"; break;
+            case clp::ffi::SchemaTreeNode::Type::Float: type_name = "Float"; break;
+            case clp::ffi::SchemaTreeNode::Type::Bool: type_name = "Bool"; break;
+            case clp::ffi::SchemaTreeNode::Type::Str: type_name = "Str"; break;
+            case clp::ffi::SchemaTreeNode::Type::UnstructuredArray: type_name = "UArray"; break;
+            case clp::ffi::SchemaTreeNode::Type::Obj: type_name = "Obj"; break;
+            default: break;
+        }
+        std::cout << type_name << std::endl;
+        if (false == optional_value.has_value()) {
+            std::cout << "{??:\t" << node_id << ": Node doesn't have Value ... EMPTY OBJ}\n";
+            continue;
+        }
+        auto const& value = optional_value.value();
+        if (value.is<clp::ffi::value_int_t>()) {
+            std::cout << "{INT:\t" << node_id << ": " << value.get_immutable_view<clp::ffi::value_int_t>() << "}\n";
+        } else if (value.is<clp::ffi::value_float_t>()) {
+            std::cout << "{FLOAT:\t" << node_id << ": " << value.get_immutable_view<clp::ffi::value_float_t>() << "}\n";
+        } else if (value.is<clp::ffi::value_bool_t>()) {
+            std::cout << "{BOOL:\t" << node_id << ": " << value.get_immutable_view<clp::ffi::value_bool_t>() << "}\n";
+        } else if (value.is<std::string>()) {
+            std::cout << "{STRING:\t" << node_id << ": " << value.get_immutable_view<std::string>() << "}\n";
+        } else if (value.is<clp::ir::EightByteEncodedTextAst>()) {
+            std::cout << "{EIGHTByte:\t" << node_id << ": \n";
+            auto const decoded = value.get_immutable_view<clp::ir::EightByteEncodedTextAst>().decode_and_unparse();
+            if (decoded.has_value()) {
+                std::cout << "\t Decoded & Unparsed: " << decoded.value() << std::endl;
+            } else {
+                std::cout << "\tNULL\n";
+            }
+            std::cout << "}\n";
+        } else if (value.is<clp::ir::FourByteEncodedTextAst>()) {
+            std::cout << "{FOURByte:\t" << node_id << ": \n";
+            auto const decoded = value.get_immutable_view<clp::ir::FourByteEncodedTextAst>().decode_and_unparse();
+            if (decoded.has_value()) {
+                std::cout << "\tDecoded & Unparsed: " << decoded.value() << std::endl;
+            } else {
+                std::cout << "\tNULL\n";
+            }
+            std::cout << "}\n";
+        } else {
+            std::cout << "Unknown Type:\t" << node_id << "\n";
+        }
+    }
+    std::cout << "after for loop\n\n\n";
+}
+
+void JsonParser::parse_kv_log_event(KeyValuePairLogEvent const& kv, std::map<std::tuple<int, NodeType>,  int>& cache){
+    // Adds every (IR node id, value) pair of `kv` to m_current_parsed_message and
+    // m_current_schema, creating archive nodes through `cache` as needed, then appends the
+    // resulting message to the archive.
+
+    // Decodes an encoded text AST value back into text. `value()` throws
+    // std::bad_optional_access when decode_and_unparse() yields no string.
+    auto const decode_text_ast = [](clp::ffi::Value const& value) -> std::string {
+        if (value.is<clp::ir::EightByteEncodedTextAst>()) {
+            return value.get_immutable_view<clp::ir::EightByteEncodedTextAst>().decode_and_unparse().value();
+        }
+        return value.get_immutable_view<clp::ir::FourByteEncodedTextAst>().decode_and_unparse().value();
+    };
+
+    clp::ffi::SchemaTree const& tree = kv.get_schema_tree();
+    for (auto const& pair : kv.get_node_id_value_pairs()) {
+        clp::ffi::SchemaTreeNode const& tree_node = tree.get_node(pair.first);
+        NodeType const archiveNodeType = get_archive_node_type(tree_node, pair);
+        int const node_id = get_archive_node_id(cache, pair.first, archiveNodeType, tree);
+        switch (archiveNodeType) {
+            case NodeType::Integer: {
+                int64_t i64_value = pair.second.value().get_immutable_view<clp::ffi::value_int_t>();
+                m_current_parsed_message.add_value(node_id, i64_value);
+                break;
+            }
+            case NodeType::Float: {
+                double d_value = pair.second.value().get_immutable_view<clp::ffi::value_float_t>();
+                m_current_parsed_message.add_value(node_id, d_value);
+                break;
+            }
+            case NodeType::Boolean: {
+                bool b_value = pair.second.value().get_immutable_view<clp::ffi::value_bool_t>();
+                m_current_parsed_message.add_value(node_id, b_value);
+                break;
+            }
+            case NodeType::VarString: {
+                std::string str = clp::ffi::validate_and_escape_utf8_string(pair.second.value().get_immutable_view<std::string>()).value();
+                m_current_parsed_message.add_value(node_id, str);
+                break;
+            }
+            case NodeType::ClpString: {
+                // Pass the std::string itself: going through c_str() would cut the text off at
+                // the first embedded '\0'.
+                std::string const decoded_value = decode_text_ast(pair.second.value());
+                std::string encoded_str
+                        = clp::ffi::validate_and_escape_utf8_string(decoded_value).value();
+                m_current_parsed_message.add_value(node_id, encoded_str);
+                break;
+            }
+            case NodeType::UnstructuredArray: {
+                // Arrays are stored as decoded, without the UTF-8 escaping applied to strings
+                std::string array_str = decode_text_ast(pair.second.value());
+                m_current_parsed_message.add_value(node_id, array_str);
+                break;
+            }
+            default:
+                // No value is recorded for the remaining node types (e.g. Object, NullValue)
+                break;
+        }
+        m_current_schema.insert_ordered(node_id);
+    }
+
+    int32_t const current_schema_id = m_archive_writer->add_schema(m_current_schema);
+    m_current_parsed_message.set_id(current_schema_id);
+    m_archive_writer->append_message(current_schema_id, m_current_schema, m_current_parsed_message);
+}
+
+namespace {
+/**
+ * Reads a Zstd-compressed file and decompresses all of it into memory.
+ * @param file_path path of the compressed file
+ * @param buf receives the decompressed bytes (appended to any existing content)
+ * @return false if the file couldn't be opened or is empty, true otherwise
+ */
+bool read_zstd_file(std::string const& file_path, std::vector<char>& buf) {
+    // Number of bytes requested from the decompressor per read
+    constexpr size_t cReadChunkSize = 1000;
+
+    FileReader infile;
+    infile.open(file_path);
+    if (false == infile.is_open()) {
+        return false;
+    }
+    // Keep the size at full width; an `int` can't represent sizes of 2 GiB or more
+    auto const fsize = std::filesystem::file_size(file_path);
+    if (0 == fsize) {
+        return false;
+    }
+
+    char chunk[cReadChunkSize];
+    ZstdDecompressor zd;
+    zd.open(infile, fsize);
+    size_t num_bytes_read = 0;
+    do {
+        num_bytes_read = 0;
+        // NOTE: try_read's return value isn't inspected; a read that yields fewer bytes than
+        // requested is treated as the end of the stream.
+        zd.try_read(chunk, cReadChunkSize, num_bytes_read);
+        if (0 != num_bytes_read) {
+            buf.insert(buf.end(), chunk, chunk + num_bytes_read);
+        }
+    } while (cReadChunkSize == num_bytes_read);
+    zd.close();
+    infile.close();
+    return true;
+}
+}  // namespace
+
+bool JsonParser::parse_from_IR() {
+    // Ingests every file in m_file_paths as a Zstd-compressed key-value IR stream: the stream is
+    // decompressed into memory, deserialized one log event at a time, and each event is appended
+    // to the archive. On failure the archive writer is closed and false is returned.
+    // Add the archive-side root node (parent id -1)
+    m_archive_writer->add_node(-1, NodeType::Unknown, "root");
+
+    for (auto const& file_path : m_file_paths) {
+        // IR node ids are assigned per stream, so ids cached for one file must not be reused for
+        // the next; each file starts with an empty (IR node id, type) -> archive node id cache.
+        // NOTE(review): relies on add_node returning the existing id when a node is re-added -
+        // confirm against ArchiveWriter/SchemaTree.
+        std::map<std::tuple<int, NodeType>,  int> id_conversion_cache;
+
+        std::vector<char> ir_buf;
+        if (false == read_zstd_file(file_path, ir_buf)) {
+            m_archive_writer->close();
+            return false;
+        }
+
+        BufferReader reader{size_checked_pointer_cast<char>(ir_buf.data()), ir_buf.size()};
+        auto deserializer_result = Deserializer::create(reader);
+        if (deserializer_result.has_error()) {
+            m_archive_writer->close();
+            return false;
+        }
+        auto& deserializer = deserializer_result.value();
+
+        m_num_messages = 0;
+        while (true) {
+            auto const kv_log_event_result = deserializer.deserialize_to_next_log_event(reader);
+            if (kv_log_event_result.has_error()) {
+                auto const error = kv_log_event_result.error();
+                if (error == std::errc::no_message_available
+                    || error == std::errc::result_out_of_range)
+                {
+                    // No further log events can be read from this stream
+                    break;
+                }
+                // Any other error is fatal: falling through would call value() on a failed
+                // result, so bail out the same way the other failure paths do.
+                m_archive_writer->close();
+                return false;
+            }
+
+            m_current_schema.clear();
+            auto const& kv_log_event = kv_log_event_result.value();
+            parse_kv_log_event(kv_log_event, id_conversion_cache);
+            m_num_messages++;
+
+            // TODO: Implement archive splitting and size tracking
+            /* bytes_consumed_up_to_prev_record = json_file_iterator.get_num_bytes_consumed();
+            if (m_archive_writer->get_data_size() >= m_target_encoded_size) {
+                m_archive_writer->increment_uncompressed_size(
+                        bytes_consumed_up_to_prev_record - bytes_consumed_up_to_prev_archive
+                );
+                bytes_consumed_up_to_prev_archive = bytes_consumed_up_to_prev_record;
+                split_archive();
+            } */
+
+            m_current_parsed_message.clear();
+        }
+    }
+    return true;
+}
+
+bool JsonParser::parse_to_IR(){  // Placeholder: performs no work and always returns true
+    return true;
+}
+
 void JsonParser::store() {
     m_archive_writer->close();
 }
diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp
index 84aa27fef..90d3bef39 100644
--- a/components/core/src/clp_s/JsonParser.hpp
+++ b/components/core/src/clp_s/JsonParser.hpp
@@ -22,6 +22,19 @@
 #include "TimestampDictionaryWriter.hpp"
 #include "Utils.hpp"
 #include "ZstdCompressor.hpp"
+#include "../clp/ffi/ir_stream/Deserializer.hpp"
+#include "../clp/BufferReader.hpp"
+#include "../clp/type_utils.hpp"
+#include "../clp/ffi/Value.hpp"
+#include "../clp/ffi/KeyValuePairLogEvent.hpp"
+#include "../clp/ffi/SchemaTree.hpp"
+#include "../clp/ffi/SchemaTreeNode.hpp"
+#include "../clp/ffi/utils.hpp"
+
+using clp::size_checked_pointer_cast;
+using clp::BufferReader;
+using clp::ffi::ir_stream::Deserializer;
+using clp::ffi::KeyValuePairLogEvent;
 
 using namespace simdjson;
 
@@ -38,6 +51,14 @@ struct JsonParserOption {
     std::shared_ptr<clp::GlobalMySQLMetadataDB> metadata_db;
 };
 
+struct JsonToIRParserOption {
+    std::vector<std::string> file_paths;
+    std::string irs_dir;
+    size_t max_document_size;
+    int compression_level;
+    std::shared_ptr<clp::GlobalMySQLMetadataDB> metadata_db;
+};
+
 class JsonParser {
 public:
     class OperationFailed : public TraceableException {
@@ -50,6 +71,8 @@ class JsonParser {
     // Constructor
     explicit JsonParser(JsonParserOption const& option);
 
+    JsonParser(JsonToIRParserOption const& option);
+
     // Destructor
     ~JsonParser() = default;
 
@@ -59,6 +82,18 @@ class JsonParser {
      */
     [[nodiscard]] bool parse();
 
+    /**
+     * Parses the Key Value IR Stream and stores the data in the archive.
+     * @return whether the IR Stream was parsed succesfully
+     */
+    [[nodiscard]] bool parse_from_IR();
+
+    /**
+     * Parses the JSON log messages to the Key Value IR Stream format.
+     * @return whether the JSON was parsed succesfully
+     */
+    [[nodiscard]] bool parse_to_IR();
+
     /**
      * Writes the metadata and archive data to disk.
      */
@@ -74,6 +109,22 @@ class JsonParser {
      */
     void parse_line(ondemand::value line, int32_t parent_node_id, std::string const& key);
 
+    /**
+     * Parses a Key Value Log Event
+     * @param kv the key value log event
+     * @param cache cache of node id conversions between deserializer schema tree nodes and archive schema tree nodes
+     */
+    void parse_kv_log_event(KeyValuePairLogEvent const& kv, std::map<std::tuple<int, NodeType>, int>& cache);
+
+    /**
+     * Get archive node id for ir node
+     * @param cache cache of node id conversions between deserializer schema tree nodes and archive schema tree nodes
+     * @param irNodeID
+     * @param irType
+     * @param irTree
+     */
+    int get_archive_node_id(std::map < std::tuple<int, NodeType>,  int>& cache, int irNodeID, NodeType archiveNodeType, clp::ffi::SchemaTree const& irTree);
+
     /**
      * Parses an array within a JSON line
      * @param line the JSON array
diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp
index 0e0401ad1..5a7da79db 100644
--- a/components/core/src/clp_s/clp-s.cpp
+++ b/components/core/src/clp_s/clp-s.cpp
@@ -48,6 +48,14 @@ namespace {
  */
 bool compress(CommandLineArguments const& command_line_arguments);
 
+/**
+ * Compresses the input IR files specified by the command line arguments into an archive.
+ * @param command_line_arguments
+ * @return Whether compression was successful
+ */
+bool IR_compress(CommandLineArguments const& command_line_arguments);
+
+
 /**
  * Decompresses the archive specified by the given JsonConstructorOption.
  * @param json_constructor_option
@@ -116,6 +124,58 @@ bool compress(CommandLineArguments const& command_line_arguments) {
     return true;
 }
 
+bool IR_compress(CommandLineArguments const& command_line_arguments) {
+    auto archives_dir = std::filesystem::path(command_line_arguments.get_archives_dir());
+
+    // Create output directory in case it doesn't exist
+    try {
+        std::filesystem::create_directory(archives_dir.string());
+    } catch (std::exception& e) {
+        SPDLOG_ERROR(
+                "Failed to create archives directory {} - {}",
+                archives_dir.string(),
+                e.what()
+        );
+        return false;
+    }
+
+    clp_s::JsonParserOption option{};
+    option.file_paths = command_line_arguments.get_file_paths();
+    option.archives_dir = archives_dir.string();
+    option.target_encoded_size = command_line_arguments.get_target_encoded_size();
+    //Do I need max_document_size()
+    option.max_document_size = command_line_arguments.get_max_document_size();
+    option.compression_level = command_line_arguments.get_compression_level();
+    option.timestamp_key = command_line_arguments.get_timestamp_key();
+    option.print_archive_stats = command_line_arguments.print_archive_stats();
+    //Is this an option they can make after IR or is that made before and has to be what is in the IR stream already
+    //option.structurize_arrays = command_line_arguments.get_structurize_arrays();
+
+    auto const& db_config_container = command_line_arguments.get_metadata_db_config();
+    if (db_config_container.has_value()) {
+        auto const& db_config = db_config_container.value();
+        option.metadata_db = std::make_shared<clp::GlobalMySQLMetadataDB>(
+                db_config.get_metadata_db_host(),
+                db_config.get_metadata_db_port(),
+                db_config.get_metadata_db_username(),
+                db_config.get_metadata_db_password(),
+                db_config.get_metadata_db_name(),
+                db_config.get_metadata_table_prefix()
+        );
+    }
+
+    clp_s::JsonParser parser(option);
+    if (false == parser.parse_from_IR()) {
+        SPDLOG_ERROR("Encountered error while parsing input");
+        return false;
+    }else{
+        std::cout << "Got True Back\n";
+    }
+    parser.store();
+    std::cout << "stored the archive\n";
+    return true;
+}
+
 void decompress_archive(clp_s::JsonConstructorOption const& json_constructor_option) {
     clp_s::JsonConstructor constructor(json_constructor_option);
     constructor.store();
@@ -263,7 +323,13 @@ int main(int argc, char const* argv[]) {
         if (false == compress(command_line_arguments)) {
             return 1;
         }
-    } else if (CommandLineArguments::Command::Extract == command_line_arguments.get_command()) {
+    } else if (CommandLineArguments::Command::IR_Compress == command_line_arguments.get_command()) {
+        if (false == IR_compress(command_line_arguments)) {
+            return 1;
+        }
+    } else if (CommandLineArguments::Command::Json_To_IR == command_line_arguments.get_command()) {
+        return 1;
+    }else if (CommandLineArguments::Command::Extract == command_line_arguments.get_command()) {
         auto const& archives_dir = command_line_arguments.get_archives_dir();
         if (false == std::filesystem::is_directory(archives_dir)) {
             SPDLOG_ERROR("'{}' is not a directory.", archives_dir);

From b631e98c9e4e9cdbae441e132a4149d48d25fe9a Mon Sep 17 00:00:00 2001
From: AVMatthews <abigail.v.matthews@gmail.com>
Date: Fri, 20 Sep 2024 21:32:22 -0400
Subject: [PATCH 02/15] JSON -> IRV2 functionality exposed

---
 components/core/src/clp_s/CMakeLists.txt      |  11 +-
 .../core/src/clp_s/CommandLineArguments.cpp   |   8 +-
 .../core/src/clp_s/CommandLineArguments.hpp   |   4 +-
 components/core/src/clp_s/JsonParser.cpp      |  55 +++----
 components/core/src/clp_s/JsonParser.hpp      |   9 +-
 components/core/src/clp_s/clp-s.cpp           | 143 +++++++++++++++++-
 6 files changed, 173 insertions(+), 57 deletions(-)

diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt
index 456f53c20..7b684632c 100644
--- a/components/core/src/clp_s/CMakeLists.txt
+++ b/components/core/src/clp_s/CMakeLists.txt
@@ -30,6 +30,10 @@ set(
         ../clp/WriterInterface.hpp
         ../clp/ffi/ir_stream/Deserializer.hpp
         ../clp/ffi/ir_stream/Deserializer.cpp
+        ../clp/ffi/ir_stream/Serializer.hpp
+        ../clp/ffi/ir_stream/Serializer.cpp
+        ../clp/ffi/ir_stream/utils.hpp
+        ../clp/ffi/ir_stream/utils.cpp
         ../clp/BufferReader.hpp
         ../clp/BufferReader.cpp
         ../clp/type_utils.hpp
@@ -49,9 +53,12 @@ set(
         ../clp/ffi/SchemaTreeNode.hpp
         ../clp/ffi/Value.hpp
         ../clp/ffi/ir_stream/decoding_methods.hpp
-        ../clp/ffi/ir_stream/protocol_constants.hpp
-        ../clp/ffi/ir_stream/utils.hpp
         ../clp/ffi/ir_stream/decoding_methods.cpp
+        ../clp/ffi/ir_stream/encoding_methods.hpp
+        ../clp/ffi/ir_stream/encoding_methods.cpp
+        ../clp/ir/parsing.hpp
+        ../clp/ir/parsing.cpp
+        ../clp/ffi/ir_stream/protocol_constants.hpp
         ../clp/ffi/utils.hpp
         ../clp/ffi/utils.cpp
         ../clp/utf8_utils.hpp
diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp
index 06c319057..25e858d62 100644
--- a/components/core/src/clp_s/CommandLineArguments.cpp
+++ b/components/core/src/clp_s/CommandLineArguments.cpp
@@ -432,10 +432,10 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
                         default_value(m_max_document_size),
                     "Maximum allowed size (B) for a single document before ir generation fails."
             )(
-                    "timestamp-key",
-                    po::value<std::string>(&m_timestamp_key)->value_name("TIMESTAMP_COLUMN_KEY")->
-                        default_value(m_timestamp_key),
-                    "Path (e.g. x.y) for the field containing the log event's timestamp."
+                    "encoding-type",
+                    po::value<int>(&m_encoding_type)->value_name("ENCODING_TYPE")->
+                        default_value(m_encoding_type),
+                    "4 (four byte encoding) or 8 (eight byte encoding)"
             )(
                     "db-config-file",
                     po::value<std::string>(&metadata_db_config_file_path)->value_name("FILE")->
diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp
index 60c480987..dedd3bd59 100644
--- a/components/core/src/clp_s/CommandLineArguments.hpp
+++ b/components/core/src/clp_s/CommandLineArguments.hpp
@@ -62,6 +62,8 @@ class CommandLineArguments {
 
     size_t get_max_document_size() const { return m_max_document_size; }
 
+    int get_encoding_type() const { return m_encoding_type; }
+
     [[nodiscard]] bool print_archive_stats() const { return m_print_archive_stats; }
 
     std::string const& get_mongodb_uri() const { return m_mongodb_uri; }
@@ -179,7 +181,7 @@ class CommandLineArguments {
     bool m_structurize_arrays{false};
     bool m_ordered_decompression{false};
     size_t m_ordered_chunk_size{0};
-
+    int m_encoding_type{8};
     // Metadata db variables
     std::optional<clp::GlobalMetadataDBConfig> m_metadata_db_config;
 
diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp
index a56387bfd..7c9ead88a 100644
--- a/components/core/src/clp_s/JsonParser.cpp
+++ b/components/core/src/clp_s/JsonParser.cpp
@@ -38,26 +38,6 @@ JsonParser::JsonParser(JsonParserOption const& option)
     m_archive_writer->open(m_archive_options);
 }
 
-JsonParser::JsonParser(JsonToIRParserOption const& option)
-        : m_num_messages(0),
-          m_max_document_size(option.max_document_size) {
-    if (false == FileUtils::validate_path(option.file_paths)) {
-        exit(1);
-    }
-
-    for (auto& file_path : option.file_paths) {
-        FileUtils::find_all_files(file_path, m_file_paths);
-    }
-
-    m_archive_options.archives_dir = option.irs_dir;
-    m_archive_options.compression_level = option.compression_level;
-    //m_archive_options.print_archive_stats = option.print_archive_stats;
-    m_archive_options.id = m_generator();
-
-    m_archive_writer = std::make_unique<ArchiveWriter>(option.metadata_db);
-    m_archive_writer->open(m_archive_options);
-}
-
 void JsonParser::parse_obj_in_array(ondemand::object line, int32_t parent_node_id) {
     ondemand::object_iterator it = line.begin();
     if (it == line.end()) {
@@ -744,11 +724,12 @@ void JsonParser::parse_kv_log_event(KeyValuePairLogEvent const& kv, std::map<std
 bool JsonParser::parse_from_IR() {
     std::map<std::tuple<int, NodeType>,  int> id_conversion_cache;
     m_archive_writer->add_node(-1, NodeType::Unknown, "root");
-
+    //int fileNum = 0;
     for (auto& file_path : m_file_paths) {
+        std::cout << file_path << std::endl;
         std::vector<char> ir_buf;
         //Make function from reading in this file
-        char temp_ir_buf[1000];
+        char temp_ir_buf[10000];
         //char* new_ir_buf = (char *) malloc(ir_buf.size());
         FileReader infile;
         infile.open(file_path);
@@ -766,11 +747,11 @@ bool JsonParser::parse_from_IR() {
         size_t num_bytes_read = 0;
         do{
             num_bytes_read = 0;
-            zd.try_read(temp_ir_buf, 1000, num_bytes_read);
+            zd.try_read(temp_ir_buf, 10000, num_bytes_read);
             if (num_bytes_read != 0){
                 ir_buf.insert(ir_buf.end(), temp_ir_buf, temp_ir_buf+num_bytes_read);
             }
-        }while (num_bytes_read == 1000);
+        }while (num_bytes_read == 10000);
         zd.close();
         infile.close(); 
         /* std::cout << "IR BUFFER\n";
@@ -799,9 +780,9 @@ bool JsonParser::parse_from_IR() {
         m_num_messages = 0;
         //size_t bytes_consumed_up_to_prev_archive = 0;
         //size_t bytes_consumed_up_to_prev_record = 0;
-        int iterations = 2;
+        //int iterations = 2;
         do{
-            iterations--;
+            //iterations--;
             //std::cerr << "In do while loop\n";
             auto const kv_log_event_result = deserializer.deserialize_to_next_log_event(reader);
             //std::cerr << "After deserialize\n";
@@ -825,33 +806,31 @@ bool JsonParser::parse_from_IR() {
             auto const& kv_log_event = kv_log_event_result.value();
 
             //print_kv_log_event(kv_log_event);
-            //std::cerr << "before parse\n";
+            /*if (fileNum > 0){
+                std::cout << "before parse\n";
+                print_kv_log_event(kv_log_event);
+            }*/
             parse_kv_log_event(kv_log_event, id_conversion_cache);
             //std::cerr << "After parse\n";
             m_num_messages++;
             //Implement archive splitting and size tracking
-            /* bytes_consumed_up_to_prev_record = json_file_iterator.get_num_bytes_consumed();
             if (m_archive_writer->get_data_size() >= m_target_encoded_size) {
-                m_archive_writer->increment_uncompressed_size(
-                        bytes_consumed_up_to_prev_record - bytes_consumed_up_to_prev_archive
-                );
-                bytes_consumed_up_to_prev_archive = bytes_consumed_up_to_prev_record;
+                std::cerr << "Splitting Archive\n\n";
+                id_conversion_cache.clear();
+                m_archive_writer->add_node(-1, NodeType::Unknown, "root");
                 split_archive();
-            } */
+            }
 
             m_current_parsed_message.clear();
 
         } while(1);//while(iterations > 0);
         //std::cout << "Out of do while loop\n";
-
+        id_conversion_cache.clear();
+        //fileNum++;
     }
     return true;
 }
 
-bool JsonParser::parse_to_IR(){
-    return true;
-}
-
 void JsonParser::store() {
     m_archive_writer->close();
 }
diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp
index 90d3bef39..6927a178d 100644
--- a/components/core/src/clp_s/JsonParser.hpp
+++ b/components/core/src/clp_s/JsonParser.hpp
@@ -30,6 +30,7 @@
 #include "../clp/ffi/SchemaTree.hpp"
 #include "../clp/ffi/SchemaTreeNode.hpp"
 #include "../clp/ffi/utils.hpp"
+#include "../clp/ir/types.hpp"
 
 using clp::size_checked_pointer_cast;
 using clp::BufferReader;
@@ -56,7 +57,7 @@ struct JsonToIRParserOption {
     std::string irs_dir;
     size_t max_document_size;
     int compression_level;
-    std::shared_ptr<clp::GlobalMySQLMetadataDB> metadata_db;
+    int encoding;
 };
 
 class JsonParser {
@@ -88,12 +89,6 @@ class JsonParser {
      */
     [[nodiscard]] bool parse_from_IR();
 
-    /**
-     * Parses the JSON log messages to the Key Value IR Stream format.
-     * @return whether the JSON was parsed succesfully
-     */
-    [[nodiscard]] bool parse_to_IR();
-
     /**
      * Writes the metadata and archive data to disk.
      */
diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp
index 5a7da79db..f4c6a5b95 100644
--- a/components/core/src/clp_s/clp-s.cpp
+++ b/components/core/src/clp_s/clp-s.cpp
@@ -5,6 +5,7 @@
 #include <sstream>
 #include <string>
 #include <utility>
+#include <fstream>
 
 #include <json/single_include/nlohmann/json.hpp>
 #include <mongocxx/instance.hpp>
@@ -33,12 +34,16 @@
 #include "TimestampPattern.hpp"
 #include "TraceableException.hpp"
 #include "Utils.hpp"
+#include "FileWriter.hpp"
+#include "ZstdCompressor.hpp"
+#include "../clp/ffi/ir_stream/Serializer.hpp"
 
 using namespace clp_s::search;
 using clp_s::cArchiveFormatDevelopmentVersionFlag;
 using clp_s::cEpochTimeMax;
 using clp_s::cEpochTimeMin;
 using clp_s::CommandLineArguments;
+using clp::ffi::ir_stream::Serializer;
 
 namespace {
 /**
@@ -55,7 +60,6 @@ bool compress(CommandLineArguments const& command_line_arguments);
  */
 bool IR_compress(CommandLineArguments const& command_line_arguments);
 
-
 /**
  * Decompresses the archive specified by the given JsonConstructorOption.
  * @param json_constructor_option
@@ -124,6 +128,136 @@ bool compress(CommandLineArguments const& command_line_arguments) {
     return true;
 }
 
+template <typename encoded_variable_t>
+auto flush_and_clear_serializer_buffer(
+        Serializer<encoded_variable_t>& serializer,
+        std::vector<int8_t>& byte_buf
+) -> void {
+    auto const view{serializer.get_ir_buf_view()};
+    byte_buf.insert(byte_buf.cend(), view.begin(), view.end());
+    serializer.clear_ir_buf();
+}
+
+template <typename encoded_variable_t>
+auto unpack_and_serialize_msgpack_bytes(
+        std::vector<uint8_t> const& msgpack_bytes,
+        Serializer<encoded_variable_t>& serializer
+) -> bool {
+    auto const msgpack_obj_handle{msgpack::unpack(
+            clp::size_checked_pointer_cast<char const>(msgpack_bytes.data()),
+            msgpack_bytes.size()
+    )};
+    auto const msgpack_obj{msgpack_obj_handle.get()};
+    if (msgpack::type::MAP != msgpack_obj.type) {
+        return false;
+    }
+    return serializer.serialize_msgpack_map(msgpack_obj.via.map);
+}
+
+template <typename T>
+auto run_serializer(clp_s::JsonToIRParserOption option, std::string path){
+    //std::cout << "Running Serializer\n";
+    auto result{Serializer<T>::create()};
+    if (result.has_error()){
+        SPDLOG_ERROR("Failed to create Serializer");
+        return false;
+    }
+    auto& serializer{result.value()};
+    std::vector<int8_t> ir_buf;
+    flush_and_clear_serializer_buffer(serializer, ir_buf);
+
+    std::ifstream inFile;
+    inFile.open(path, std::ifstream::in); 
+    //std::cout << "Opened Input file\n";
+
+    std::string outPath = "";
+    int index = path.find_last_of('/');
+    if(std::string::npos == index){
+        outPath = option.irs_dir + "/" + path + ".ir";
+    }else{
+        outPath = option.irs_dir + "/" + path.substr(index, path.length()-index) + ".ir";
+    }
+    clp_s::FileWriter outFile;
+    //std::cout << outPath << "\n";
+    outFile.open(outPath, clp_s::FileWriter::OpenMode::CreateForWriting);
+    clp_s::ZstdCompressor zc;
+    zc.open(outFile, option.compression_level);
+
+    std::string line; 
+    size_t totalSize = 0;
+
+    if (inFile.is_open()) { 
+        while (getline(inFile, line)) { 
+                auto j_obj = nlohmann::json::parse(line);
+                unpack_and_serialize_msgpack_bytes(nlohmann::json::to_msgpack(j_obj), serializer);
+                flush_and_clear_serializer_buffer(serializer, ir_buf);
+                if(ir_buf.size() >= 1000000000){
+                        totalSize = totalSize + ir_buf.size();
+                        zc.write(reinterpret_cast<char*>(ir_buf.data()), ir_buf.size());
+                        zc.flush();
+                        ir_buf.clear();
+                }
+        }
+        totalSize = totalSize + ir_buf.size(); 
+        zc.write(reinterpret_cast<char*>(ir_buf.data()), ir_buf.size());
+        zc.flush();
+        ir_buf.clear();
+        inFile.close(); 
+        zc.close();
+        outFile.close();
+    } 
+
+    return true;
+}
+
+bool generate_IR(CommandLineArguments const& command_line_arguments){
+    auto irs_dir = std::filesystem::path(command_line_arguments.get_archives_dir());
+
+    // Create output directory in case it doesn't exist
+    try {
+        std::filesystem::create_directory(irs_dir.string());
+    } catch (std::exception& e) {
+        SPDLOG_ERROR(
+                "Failed to create archives directory {} - {}",
+                irs_dir.string(),
+                e.what()
+        );
+        return false;
+    }
+    clp_s::JsonToIRParserOption option{};
+    option.file_paths = command_line_arguments.get_file_paths();
+    option.irs_dir = irs_dir.string();
+    //std::cout << "IRs dir: " << option.irs_dir << std::endl;
+    option.max_document_size = command_line_arguments.get_max_document_size();
+    option.compression_level = command_line_arguments.get_compression_level();
+    option.encoding = command_line_arguments.get_encoding_type();
+    //std::cout << "encoding type: " << static_cast<int>(option.encoding) << std::endl;
+
+    if (false == clp_s::FileUtils::validate_path(option.file_paths)) {
+        exit(1);
+    }
+
+    std::vector<std::string> all_file_paths;
+    for (auto& file_path : option.file_paths) {
+        clp_s::FileUtils::find_all_files(file_path, all_file_paths);
+    }
+
+    for (auto& path : all_file_paths) {
+        bool success;
+        if (option.encoding == 4){
+            //std::cout << "four byte\n";
+            success = run_serializer<int32_t>(option, path);
+        }else{
+            //std::cout << "eight byte\n";
+            success = run_serializer<int64_t>(option, path);
+        }
+        if (false == success){
+            return false;
+        }
+    }
+    return true;
+}
+
 bool IR_compress(CommandLineArguments const& command_line_arguments) {
     auto archives_dir = std::filesystem::path(command_line_arguments.get_archives_dir());
 
@@ -168,11 +302,8 @@ bool IR_compress(CommandLineArguments const& command_line_arguments) {
     if (false == parser.parse_from_IR()) {
         SPDLOG_ERROR("Encountered error while parsing input");
         return false;
-    }else{
-        std::cout << "Got True Back\n";
     }
     parser.store();
-    std::cout << "stored the archive\n";
     return true;
 }
 
@@ -328,7 +459,9 @@ int main(int argc, char const* argv[]) {
             return 1;
         }
     } else if (CommandLineArguments::Command::Json_To_IR == command_line_arguments.get_command()) {
-        return 1;
+        if (false == generate_IR(command_line_arguments)) {
+            return 1;
+        }
     }else if (CommandLineArguments::Command::Extract == command_line_arguments.get_command()) {
         auto const& archives_dir = command_line_arguments.get_archives_dir();
         if (false == std::filesystem::is_directory(archives_dir)) {

From 83084c004cdde44ae13da2174347681652c9e315 Mon Sep 17 00:00:00 2001
From: AVMatthews <abigail.v.matthews@gmail.com>
Date: Sat, 21 Sep 2024 06:27:45 -0400
Subject: [PATCH 03/15] Linting and some code clean up

---
 .../core/src/clp_s/CommandLineArguments.cpp   |  28 +-
 components/core/src/clp_s/JsonParser.cpp      | 393 +++++++++---------
 components/core/src/clp_s/JsonParser.hpp      |  38 +-
 components/core/src/clp_s/clp-s.cpp           |  87 ++--
 4 files changed, 275 insertions(+), 271 deletions(-)

diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp
index 25e858d62..0435cd3f6 100644
--- a/components/core/src/clp_s/CommandLineArguments.cpp
+++ b/components/core/src/clp_s/CommandLineArguments.cpp
@@ -324,11 +324,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
                     "print-archive-stats",
                     po::bool_switch(&m_print_archive_stats),
                     "Print statistics (json) about the archive after it's compressed."
-            )/*(
-                    "structurize-arrays",
-                    po::bool_switch(&m_structurize_arrays),
-                    "Structurize arrays instead of compressing them as clp strings."
-            ) */;
+            );
             // clang-format on
 
             po::positional_options_description positional_options;
@@ -354,10 +350,10 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
             if (parsed_command_line_options.count("help")) {
                 print_IR_compression_usage();
 
-                /* std::cerr << "Examples:" << std::endl;
-                std::cerr << "  # Compress file1.json and dir1 into archives-dir" << std::endl;
-                std::cerr << "  " << m_program_name << " c archives-dir file1.json dir1"
-                          << std::endl; */
+                std::cerr << "Examples:" << std::endl;
+                std::cerr << "  # Compress file1.ir and dir1 into archives-dir" << std::endl;
+                std::cerr << "  " << m_program_name << " i archives-dir file1.ir dir1"
+                          << std::endl;
 
                 po::options_description visible_options;
                 visible_options.add(general_options);
@@ -403,7 +399,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
 
                 m_metadata_db_config = std::move(metadata_db_config);
             }
-        }else if ((char)Command::Json_To_IR == command_input) {
+        } else if ((char)Command::Json_To_IR == command_input) {
             po::options_description compression_positional_options;
             // clang-format off
              compression_positional_options.add_options()(
@@ -473,10 +469,10 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
             if (parsed_command_line_options.count("help")) {
                 print_json_to_IR_usage();
 
-                /* std::cerr << "Examples:" << std::endl;
-                std::cerr << "  # Compress file1.json and dir1 into archives-dir" << std::endl;
-                std::cerr << "  " << m_program_name << " c archives-dir file1.json dir1"
-                          << std::endl; */
+                std::cerr << "Examples:" << std::endl;
+                std::cerr << "  # Parse file1.json and dir1 into irs-dir" << std::endl;
+                std::cerr << "  " << m_program_name << " r irs-dir file1.json dir1"
+                          << std::endl;
 
                 po::options_description visible_options;
                 visible_options.add(general_options);
@@ -522,7 +518,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
 
                 m_metadata_db_config = std::move(metadata_db_config);
             }
-        }else if ((char)Command::Extract == command_input) {
+        } else if ((char)Command::Extract == command_input) {
             po::options_description extraction_options;
             // clang-format off
             extraction_options.add_options()(
@@ -1046,7 +1042,7 @@ void CommandLineArguments::print_search_usage() const {
 }
 
 void CommandLineArguments::print_json_to_IR_usage() const {
-    std::cerr << "Usage: " << m_program_name << " r [OPTIONS] ARCHIVES_DIR [FILE/DIR ...]"
+    std::cerr << "Usage: " << m_program_name << " r [OPTIONS] IRS_DIR [FILE/DIR ...]"
               << std::endl;
 }
 
diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp
index 7c9ead88a..6647d3d7a 100644
--- a/components/core/src/clp_s/JsonParser.cpp
+++ b/components/core/src/clp_s/JsonParser.cpp
@@ -1,7 +1,7 @@
 #include "JsonParser.hpp"
 
-#include <iostream>
 #include <fstream>
+#include <iostream>
 #include <stack>
 
 #include <simdjson.h>
@@ -521,199 +521,233 @@ bool JsonParser::parse() {
     return true;
 }
 
-NodeType get_archive_node_type(clp::ffi::SchemaTreeNode const& node, std::pair<clp::ffi::SchemaTreeNode::id_t, std::optional<clp::ffi::Value>> p){
-    //std::cerr << "In get_archive_node_type\n";
+NodeType get_archive_node_type(
+        clp::ffi::SchemaTreeNode const& node,
+        std::pair<clp::ffi::SchemaTreeNode::id_t, std::optional<clp::ffi::Value>> p
+) {
     auto const node_type = node.get_type();
-    //std::cerr << "got ir type\n";
-    //figure out what type the node is in archive node type
+    // figure out what type the node is in archive node type
     NodeType archiveNodeType;
-    switch(node_type){
-            case clp::ffi::SchemaTreeNode::Type::Int : 
-                archiveNodeType = NodeType::Integer;
-                break;
-            case clp::ffi::SchemaTreeNode::Type::Float : 
-                archiveNodeType = NodeType::Float;
-                break;
-            case clp::ffi::SchemaTreeNode::Type::Bool : 
-                archiveNodeType = NodeType::Boolean;
-                break;
-            case clp::ffi::SchemaTreeNode::Type::UnstructuredArray : 
-                archiveNodeType = NodeType::UnstructuredArray;
-                break;
-            case clp::ffi::SchemaTreeNode::Type::Str :
-                //std::cerr << "In str\n";
-                if(p.second.value().is<std::string>()){
-                    //maybe special case for date string
-                    archiveNodeType = NodeType::VarString;
-                }else{
-                    archiveNodeType = NodeType::ClpString;
-                }
-                break;
-            case clp::ffi::SchemaTreeNode::Type::Obj :
-                //std::cerr << "In obj\n"; 
-                if(p.second.has_value()){
-                    if(p.second.value().is_null()){
-                        //std::cout << "Found Null\n";
-                        archiveNodeType = NodeType::NullValue;
-                    }else{
-                        archiveNodeType = NodeType::Object;
-                    }
-                }else{
+    switch (node_type) {
+        case clp::ffi::SchemaTreeNode::Type::Int:
+            archiveNodeType = NodeType::Integer;
+            break;
+        case clp::ffi::SchemaTreeNode::Type::Float:
+            archiveNodeType = NodeType::Float;
+            break;
+        case clp::ffi::SchemaTreeNode::Type::Bool:
+            archiveNodeType = NodeType::Boolean;
+            break;
+        case clp::ffi::SchemaTreeNode::Type::UnstructuredArray:
+            archiveNodeType = NodeType::UnstructuredArray;
+            break;
+        case clp::ffi::SchemaTreeNode::Type::Str:
+            // std::cerr << "In str\n";
+            if (p.second.value().is<std::string>()) {
+                // maybe special case for date string
+                archiveNodeType = NodeType::VarString;
+            } else {
+                archiveNodeType = NodeType::ClpString;
+            }
+            break;
+        case clp::ffi::SchemaTreeNode::Type::Obj:
+            // std::cerr << "In obj\n";
+            if (p.second.has_value()) {
+                if (p.second.value().is_null()) {
+                    // std::cout << "Found Null\n";
+                    archiveNodeType = NodeType::NullValue;
+                } else {
                     archiveNodeType = NodeType::Object;
                 }
-                break;
-            default : 
-                archiveNodeType = NodeType::Unknown;
-                break;
-            //Do I need to do anything for structured arrays
+            } else {
+                archiveNodeType = NodeType::Object;
+            }
+            break;
+        default:
+            archiveNodeType = NodeType::Unknown;
+            break;
     }
-    //std::cerr << "After Switch\n";
     return archiveNodeType;
 }
 
 //
-int JsonParser::get_archive_node_id(std::map< std::tuple<int, NodeType>, int>& cache, int irNodeID, NodeType archiveNodeType, clp::ffi::SchemaTree const& irTree){
-    //std::cerr << "In get archive node id\n";
-    std::tuple<int, NodeType> key (irNodeID, archiveNodeType);
-    if(cache.find(key) != cache.end()){
-        //std::cerr << "Found value\n";
+int JsonParser::get_archive_node_id(
+        std::map<std::tuple<int, NodeType>, int>& cache,
+        int irNodeID,
+        NodeType archiveNodeType,
+        clp::ffi::SchemaTree const& irTree
+) {
+    std::tuple<int, NodeType> key(irNodeID, archiveNodeType);
+    if (cache.find(key) != cache.end()) {
         return cache[key];
     }
-    auto& currNode =  irTree.get_node(irNodeID);
-    //std::cerr << "Got node\n";
+    auto& currNode = irTree.get_node(irNodeID);
     int parent_node_id;
-    if(currNode.get_parent_id() == 0){
-        //std::cout << "Hit the root\n";
+    // Found the root
+    if (currNode.get_parent_id() == 0) {
         parent_node_id = 0;
-    }else{
-        //std::cerr << "Look for parent id\n";
-        parent_node_id = get_archive_node_id(cache, currNode.get_parent_id(), NodeType::Object, irTree);
-        //std::cerr << "Got parent id\n";
+    } else {
+        parent_node_id
+                = get_archive_node_id(cache, currNode.get_parent_id(), NodeType::Object, irTree);
     }
-    std::string nodeKey = clp::ffi::validate_and_escape_utf8_string(currNode.get_key_name()).value();
-    //std::string nodeKey = static_cast<std::string>(validated_key);
+    std::string nodeKey
+            = clp::ffi::validate_and_escape_utf8_string(currNode.get_key_name()).value();
     int curr_node_archive_id = m_archive_writer->add_node(parent_node_id, archiveNodeType, nodeKey);
-    //std::cerr << "Added node to archive\n";
     cache[key] = curr_node_archive_id;
-    //std::cerr << "Added to cache\n";
     return curr_node_archive_id;
 }
 
-void print_kv_log_event(KeyValuePairLogEvent const& kv){
+void print_kv_log_event(KeyValuePairLogEvent const& kv) {
     auto const num_kv_pairs = kv.get_node_id_value_pairs().size();
     std::cout << "number of kv pairs: " << num_kv_pairs << std::endl;
     auto const& tree = kv.get_schema_tree();
-    for (auto const &pair: kv.get_node_id_value_pairs()){
+    for (auto const& pair : kv.get_node_id_value_pairs()) {
         auto const& tree_node = tree.get_node(pair.first);
         auto const node_type = tree_node.get_type();
-        switch(node_type){
-                case clp::ffi::SchemaTreeNode::Type::Int : std::cout << "Int" << std::endl; break;
-                case clp::ffi::SchemaTreeNode::Type::Float : std::cout << "Float" << std::endl; break;
-                case clp::ffi::SchemaTreeNode::Type::Bool : std::cout << "Bool" << std::endl; break;
-                case clp::ffi::SchemaTreeNode::Type::Str : std::cout << "Str" << std::endl; break;
-                case clp::ffi::SchemaTreeNode::Type::UnstructuredArray : std::cout << "UArray" << std::endl; break;
-                case clp::ffi::SchemaTreeNode::Type::Obj : std::cout << "Obj" << std::endl; break;
-                default : std::cout << "???" << std::endl; break;
+        switch (node_type) {
+            case clp::ffi::SchemaTreeNode::Type::Int:
+                std::cout << "Int" << std::endl;
+                break;
+            case clp::ffi::SchemaTreeNode::Type::Float:
+                std::cout << "Float" << std::endl;
+                break;
+            case clp::ffi::SchemaTreeNode::Type::Bool:
+                std::cout << "Bool" << std::endl;
+                break;
+            case clp::ffi::SchemaTreeNode::Type::Str:
+                std::cout << "Str" << std::endl;
+                break;
+            case clp::ffi::SchemaTreeNode::Type::UnstructuredArray:
+                std::cout << "UArray" << std::endl;
+                break;
+            case clp::ffi::SchemaTreeNode::Type::Obj:
+                std::cout << "Obj" << std::endl;
+                break;
+            default:
+                std::cout << "???" << std::endl;
+                break;
         }
-        
-        if(!pair.second.has_value()){
-                std::cout << "{??:\t" << pair.first << ": Node doesn't have Value ... EMPTY OBJ}\n";
-                continue;
+
+        if (!pair.second.has_value()) {
+            std::cout << "{??:\t" << pair.first << ": Node doesn't have Value ... EMPTY OBJ}\n";
+            continue;
         }
-        if(pair.second.value().is<clp::ffi::value_int_t>()){
-                std::cout << "{INT:\t" << pair.first << ": " << pair.second.value().get_immutable_view<clp::ffi::value_int_t>() << "}\n";
-        }else if(pair.second.value().is<clp::ffi::value_float_t>()){
-                std::cout << "{FLOAT:\t" << pair.first << ": " << pair.second.value().get_immutable_view<clp::ffi::value_float_t>() << "}\n";
-        }else if(pair.second.value().is<clp::ffi::value_bool_t>()){
-                std::cout << "{BOOL:\t" << pair.first << ": " << pair.second.value().get_immutable_view<clp::ffi::value_bool_t>() << "}\n";
-        }else if(pair.second.value().is<std::string>()){
-                std::cout << "{STRING:\t" << pair.first << ": " << pair.second.value().get_immutable_view<std::string>() << "}\n";
-        }else if(pair.second.value().is<clp::ir::EightByteEncodedTextAst>()){
-                std::cout << "{EIGHTByte:\t" << pair.first << ": \n";
-                auto decoded = pair.second.value().get_immutable_view<clp::ir::EightByteEncodedTextAst>().decode_and_unparse();
-                if(std::nullopt != decoded){
-                        std:: cout << "\t Decoded & Unparsed: "<< decoded.value()<< std::endl;
-                }else{
-                        std::cout << "\tNULL\n";
-                }
-                std::cout << "}\n";
-        }else if(pair.second.value().is<clp::ir::FourByteEncodedTextAst>()){
-                std::cout << "{FOURByte:\t" << pair.first << ": \n";
-                auto decoded = pair.second.value().get_immutable_view<clp::ir::FourByteEncodedTextAst>().decode_and_unparse();
-                if(std::nullopt != decoded){
-                        std:: cout << "\tDecoded & Unparsed: "<< decoded.value() << std::endl;
-                }else{
-                        std::cout << "\tNULL\n";
-                }
-                std::cout << "}\n";
-        }else{
-                std::cout << "Unknown Type:\t" << pair.first << "\n";
+        if (pair.second.value().is<clp::ffi::value_int_t>()) {
+            std::cout << "{INT:\t" << pair.first << ": "
+                      << pair.second.value().get_immutable_view<clp::ffi::value_int_t>() << "}\n";
+        } else if (pair.second.value().is<clp::ffi::value_float_t>()) {
+            std::cout << "{FLOAT:\t" << pair.first << ": "
+                      << pair.second.value().get_immutable_view<clp::ffi::value_float_t>() << "}\n";
+        } else if (pair.second.value().is<clp::ffi::value_bool_t>()) {
+            std::cout << "{BOOL:\t" << pair.first << ": "
+                      << pair.second.value().get_immutable_view<clp::ffi::value_bool_t>() << "}\n";
+        } else if (pair.second.value().is<std::string>()) {
+            std::cout << "{STRING:\t" << pair.first << ": "
+                      << pair.second.value().get_immutable_view<std::string>() << "}\n";
+        } else if (pair.second.value().is<clp::ir::EightByteEncodedTextAst>()) {
+            std::cout << "{EIGHTByte:\t" << pair.first << ": \n";
+            auto decoded = pair.second.value()
+                                   .get_immutable_view<clp::ir::EightByteEncodedTextAst>()
+                                   .decode_and_unparse();
+            if (std::nullopt != decoded) {
+                std::cout << "\t Decoded & Unparsed: " << decoded.value() << std::endl;
+            } else {
+                std::cout << "\tNULL\n";
+            }
+            std::cout << "}\n";
+        } else if (pair.second.value().is<clp::ir::FourByteEncodedTextAst>()) {
+            std::cout << "{FOURByte:\t" << pair.first << ": \n";
+            auto decoded = pair.second.value()
+                                   .get_immutable_view<clp::ir::FourByteEncodedTextAst>()
+                                   .decode_and_unparse();
+            if (std::nullopt != decoded) {
+                std::cout << "\tDecoded & Unparsed: " << decoded.value() << std::endl;
+            } else {
+                std::cout << "\tNULL\n";
+            }
+            std::cout << "}\n";
+        } else {
+            std::cout << "Unknown Type:\t" << pair.first << "\n";
         }
-
     }
     std::cout << "after for loop\n\n\n";
 }
 
-void JsonParser::parse_kv_log_event(KeyValuePairLogEvent const& kv, std::map<std::tuple<int, NodeType>,  int>& cache){
+void JsonParser::parse_kv_log_event(
+        KeyValuePairLogEvent const& kv,
+        std::map<std::tuple<int, NodeType>, int>& cache
+) {
     auto const num_kv_pairs = kv.get_node_id_value_pairs().size();
     clp::ffi::SchemaTree const& tree = kv.get_schema_tree();
-    //std::cerr << "In parse\n";
-    for (auto const& pair: kv.get_node_id_value_pairs()){
-        //std::cerr << "In for loop\n";
+
+    for (auto const& pair : kv.get_node_id_value_pairs()) {
         clp::ffi::SchemaTreeNode const& tree_node = tree.get_node(pair.first);
-        //std::cerr << "After get node\n";
         NodeType archiveNodeType = get_archive_node_type(tree_node, pair);
-        //std::cerr << "After get archive node type\n";
         int node_id = get_archive_node_id(cache, pair.first, archiveNodeType, tree);
-        //std::cerr << "After get_archive_node_id\n";
-        //std::cerr << node_id << std::endl;
-        switch(archiveNodeType){
-            case NodeType::Integer :{
+
+        switch (archiveNodeType) {
+            case NodeType::Integer: {
                 int64_t i64_value = pair.second.value().get_immutable_view<clp::ffi::value_int_t>();
                 m_current_parsed_message.add_value(node_id, i64_value);
-            }break;
-            case NodeType::Float :{
+            } break;
+            case NodeType::Float: {
                 double d_value = pair.second.value().get_immutable_view<clp::ffi::value_float_t>();
                 m_current_parsed_message.add_value(node_id, d_value);
-            }break;
-            case NodeType::Boolean :{
+            } break;
+            case NodeType::Boolean: {
                 bool b_value = pair.second.value().get_immutable_view<clp::ffi::value_bool_t>();
                 m_current_parsed_message.add_value(node_id, b_value);
-            }break; 
-            case NodeType::VarString :{
-                std::string str = clp::ffi::validate_and_escape_utf8_string(pair.second.value().get_immutable_view<std::string>()).value();
+            } break;
+            case NodeType::VarString: {
+                std::string str = clp::ffi::validate_and_escape_utf8_string(
+                                          pair.second.value().get_immutable_view<std::string>()
+                )
+                                          .value();
                 m_current_parsed_message.add_value(node_id, str);
-            }break;
-            case NodeType::ClpString :{
-                //auto const node_type = tree_node.get_type();
+            } break;
+            case NodeType::ClpString: {
                 std::string encoded_str;
-                ///Do I need to reparse these? Do I need to convert 4bytes to 8bytes .... how?
-                if(pair.second.value().is<clp::ir::EightByteEncodedTextAst>()){
-                    std::string decodedValue = pair.second.value().get_immutable_view<clp::ir::EightByteEncodedTextAst>().decode_and_unparse().value();
-                    encoded_str = clp::ffi::validate_and_escape_utf8_string(decodedValue.c_str()).value();
-                }else{
-                    std::string decodedValue = pair.second.value().get_immutable_view<clp::ir::FourByteEncodedTextAst>().decode_and_unparse().value();
-                    encoded_str = clp::ffi::validate_and_escape_utf8_string(decodedValue.c_str()).value();
+                if (pair.second.value().is<clp::ir::EightByteEncodedTextAst>()) {
+                    std::string decodedValue
+                            = pair.second.value()
+                                      .get_immutable_view<clp::ir::EightByteEncodedTextAst>()
+                                      .decode_and_unparse()
+                                      .value();
+                    encoded_str = clp::ffi::validate_and_escape_utf8_string(decodedValue.c_str())
+                                          .value();
+                } else {
+                    std::string decodedValue
+                            = pair.second.value()
+                                      .get_immutable_view<clp::ir::FourByteEncodedTextAst>()
+                                      .decode_and_unparse()
+                                      .value();
+                    encoded_str = clp::ffi::validate_and_escape_utf8_string(decodedValue.c_str())
+                                          .value();
                 }
                 m_current_parsed_message.add_value(node_id, encoded_str);
-            }break;
-            case NodeType::UnstructuredArray :{
-                //auto const encoded_type = tree_node.get_type();
+            } break;
+            case NodeType::UnstructuredArray: {
                 std::string array_str;
-                if(pair.second.value().is<clp::ir::EightByteEncodedTextAst>()){
-                    array_str = pair.second.value().get_immutable_view<clp::ir::EightByteEncodedTextAst>().decode_and_unparse().value();
-                }else{
-                    array_str = pair.second.value().get_immutable_view<clp::ir::FourByteEncodedTextAst>().decode_and_unparse().value();
+                if (pair.second.value().is<clp::ir::EightByteEncodedTextAst>()) {
+                    array_str = pair.second.value()
+                                        .get_immutable_view<clp::ir::EightByteEncodedTextAst>()
+                                        .decode_and_unparse()
+                                        .value();
+                } else {
+                    array_str = pair.second.value()
+                                        .get_immutable_view<clp::ir::FourByteEncodedTextAst>()
+                                        .decode_and_unparse()
+                                        .value();
                 }
                 m_current_parsed_message.add_value(node_id, array_str);
                 break;
             }
-            default : 
-                //Don't need to add value for obj or null
+            default:
+                // Don't need to add value for obj or null
                 break;
         }
         m_current_schema.insert_ordered(node_id);
-    } 
+    }
 
     int32_t current_schema_id = m_archive_writer->add_schema(m_current_schema);
     m_current_parsed_message.set_id(current_schema_id);
@@ -722,98 +756,67 @@ void JsonParser::parse_kv_log_event(KeyValuePairLogEvent const& kv, std::map<std
 }
 
 bool JsonParser::parse_from_IR() {
-    std::map<std::tuple<int, NodeType>,  int> id_conversion_cache;
+    std::map<std::tuple<int, NodeType>, int> id_conversion_cache;
     m_archive_writer->add_node(-1, NodeType::Unknown, "root");
-    //int fileNum = 0;
+
     for (auto& file_path : m_file_paths) {
         std::cout << file_path << std::endl;
         std::vector<char> ir_buf;
-        //Make function from reading in this file
-        char temp_ir_buf[10000];
-        //char* new_ir_buf = (char *) malloc(ir_buf.size());
+        char temp_ir_buf[10'000];
         FileReader infile;
         infile.open(file_path);
-        if(false == infile.is_open()){
+        if (false == infile.is_open()) {
             m_archive_writer->close();
             return false;
         }
         int fsize = std::filesystem::file_size(file_path);
-        if(0  == fsize){
+        if (0 == fsize) {
             m_archive_writer->close();
             return false;
         }
         ZstdDecompressor zd;
         zd.open(infile, fsize);
         size_t num_bytes_read = 0;
-        do{
+        do {
             num_bytes_read = 0;
-            zd.try_read(temp_ir_buf, 10000, num_bytes_read);
-            if (num_bytes_read != 0){
-                ir_buf.insert(ir_buf.end(), temp_ir_buf, temp_ir_buf+num_bytes_read);
+            zd.try_read(temp_ir_buf, 10'000, num_bytes_read);
+            if (num_bytes_read != 0) {
+                ir_buf.insert(ir_buf.end(), temp_ir_buf, temp_ir_buf + num_bytes_read);
             }
-        }while (num_bytes_read == 10000);
+        } while (num_bytes_read == 10'000);
         zd.close();
-        infile.close(); 
-        /* std::cout << "IR BUFFER\n";
-        for (size_t i = 0; i < ir_buf.size(); ++i) {
-            std::cout << ir_buf.data()[i];
-        }
-        std::cout << "\n\n\n"; */
+        infile.close();
+
         BufferReader reader{size_checked_pointer_cast<char>(ir_buf.data()), ir_buf.size()};
         char const* p;
         size_t p_size;
-        //reader.peek_buffer(p, p_size);
-        //std::cout << "Num Bytes in buffer left: " << p_size << std::endl;
-        //for(int z = 0; z < p_size; z++){
-        //    std::cout << p[z];
-        //}
-        //std::cout << std::endl;
 
         auto deserializer_result = Deserializer::create(reader);
-        if(deserializer_result.has_error()){
+        if (deserializer_result.has_error()) {
             m_archive_writer->close();
             return false;
         }
         auto& deserializer = deserializer_result.value();
 
-
         m_num_messages = 0;
-        //size_t bytes_consumed_up_to_prev_archive = 0;
-        //size_t bytes_consumed_up_to_prev_record = 0;
-        //int iterations = 2;
-        do{
-            //iterations--;
-            //std::cerr << "In do while loop\n";
+        do {
             auto const kv_log_event_result = deserializer.deserialize_to_next_log_event(reader);
-            //std::cerr << "After deserialize\n";
-
-            //reader.peek_buffer(p, p_size);
-            //std::cout << "Num Bytes in buffer left: " << p_size << std::endl;
-            //for(int z = 0; z < p_size; z++){
-            //    std::cout << p[z];
-            //}
-            //std::cout << std::endl;
-
-            if(kv_log_event_result.has_error()){
-                //std::cerr << "has error\n";
-                if(kv_log_event_result.error() == std::errc::no_message_available || kv_log_event_result.error() == std::errc::result_out_of_range){
-                    //std::cerr << "Breaking out of do while loop\n";
+
+            if (kv_log_event_result.has_error()) {
+                if (kv_log_event_result.error() == std::errc::no_message_available
+                    || kv_log_event_result.error() == std::errc::result_out_of_range)
+                {
                     break;
                 }
             }
-            //std::cerr << "After error check\n";
+
             m_current_schema.clear();
             auto const& kv_log_event = kv_log_event_result.value();
 
-            //print_kv_log_event(kv_log_event);
-            /*if (fileNum > 0){
-                std::cout << "before parse\n";
-                print_kv_log_event(kv_log_event);
-            }*/
+            // print_kv_log_event(kv_log_event);
             parse_kv_log_event(kv_log_event, id_conversion_cache);
-            //std::cerr << "After parse\n";
+
             m_num_messages++;
-            //Implement archive splitting and size tracking
             if (m_archive_writer->get_data_size() >= m_target_encoded_size) {
                 std::cerr << "Splitting Archive\n\n";
                 id_conversion_cache.clear();
@@ -823,10 +826,8 @@ bool JsonParser::parse_from_IR() {
 
             m_current_parsed_message.clear();
 
-        } while(1);//while(iterations > 0);
-        //std::cout << "Out of do while loop\n";
+        } while (1);
         id_conversion_cache.clear();
-        //fileNum++;
     }
     return true;
 }
diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp
index 6927a178d..1e1aa96fa 100644
--- a/components/core/src/clp_s/JsonParser.hpp
+++ b/components/core/src/clp_s/JsonParser.hpp
@@ -9,7 +9,16 @@
 #include <boost/uuid/random_generator.hpp>
 #include <simdjson.h>
 
+#include "../clp/BufferReader.hpp"
+#include "../clp/ffi/ir_stream/Deserializer.hpp"
+#include "../clp/ffi/KeyValuePairLogEvent.hpp"
+#include "../clp/ffi/SchemaTree.hpp"
+#include "../clp/ffi/SchemaTreeNode.hpp"
+#include "../clp/ffi/utils.hpp"
+#include "../clp/ffi/Value.hpp"
 #include "../clp/GlobalMySQLMetadataDB.hpp"
+#include "../clp/ir/types.hpp"
+#include "../clp/type_utils.hpp"
 #include "ArchiveWriter.hpp"
 #include "DictionaryWriter.hpp"
 #include "FileReader.hpp"
@@ -22,20 +31,11 @@
 #include "TimestampDictionaryWriter.hpp"
 #include "Utils.hpp"
 #include "ZstdCompressor.hpp"
-#include "../clp/ffi/ir_stream/Deserializer.hpp"
-#include "../clp/BufferReader.hpp"
-#include "../clp/type_utils.hpp"
-#include "../clp/ffi/Value.hpp"
-#include "../clp/ffi/KeyValuePairLogEvent.hpp"
-#include "../clp/ffi/SchemaTree.hpp"
-#include "../clp/ffi/SchemaTreeNode.hpp"
-#include "../clp/ffi/utils.hpp"
-#include "../clp/ir/types.hpp"
 
-using clp::size_checked_pointer_cast;
 using clp::BufferReader;
 using clp::ffi::ir_stream::Deserializer;
 using clp::ffi::KeyValuePairLogEvent;
+using clp::size_checked_pointer_cast;
 
 using namespace simdjson;
 
@@ -107,18 +107,28 @@ class JsonParser {
     /**
      * Parses a Key Value Log Event
      * @param kv the key value log event
-     * @param cache cache of node id conversions between deserializer schema tree nodes and archive schema tree nodes
+     * @param cache cache of node id conversions between deserializer schema tree nodes and archive
+     * schema tree nodes
      */
-    void parse_kv_log_event(KeyValuePairLogEvent const& kv, std::map<std::tuple<int, NodeType>, int>& cache);
+    void parse_kv_log_event(
+            KeyValuePairLogEvent const& kv,
+            std::map<std::tuple<int, NodeType>, int>& cache
+    );
 
     /**
      * Get archive node id for ir node
-     * @param cache cache of node id conversions between deserializer schema tree nodes and archive schema tree nodes
+     * @param cache cache of node id conversions between deserializer schema tree nodes and archive
+     * schema tree nodes
      * @param irNodeID
      * @param irType
      * @param irTree
      */
-    int get_archive_node_id(std::map < std::tuple<int, NodeType>,  int>& cache, int irNodeID, NodeType archiveNodeType, clp::ffi::SchemaTree const& irTree);
+    int get_archive_node_id(
+            std::map<std::tuple<int, NodeType>, int>& cache,
+            int irNodeID,
+            NodeType archiveNodeType,
+            clp::ffi::SchemaTree const& irTree
+    );
 
     /**
      * Parses an array within a JSON line
diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp
index f4c6a5b95..911e92c1d 100644
--- a/components/core/src/clp_s/clp-s.cpp
+++ b/components/core/src/clp_s/clp-s.cpp
@@ -1,22 +1,24 @@
 #include <exception>
 #include <filesystem>
+#include <fstream>
 #include <iostream>
 #include <memory>
 #include <sstream>
 #include <string>
 #include <utility>
-#include <fstream>
 
 #include <json/single_include/nlohmann/json.hpp>
 #include <mongocxx/instance.hpp>
 #include <spdlog/sinks/stdout_sinks.h>
 #include <spdlog/spdlog.h>
 
+#include "../clp/ffi/ir_stream/Serializer.hpp"
 #include "../clp/GlobalMySQLMetadataDB.hpp"
 #include "../clp/streaming_archive/ArchiveMetadata.hpp"
 #include "../reducer/network_utils.hpp"
 #include "CommandLineArguments.hpp"
 #include "Defs.hpp"
+#include "FileWriter.hpp"
 #include "JsonConstructor.hpp"
 #include "JsonParser.hpp"
 #include "ReaderUtils.hpp"
@@ -34,16 +36,14 @@
 #include "TimestampPattern.hpp"
 #include "TraceableException.hpp"
 #include "Utils.hpp"
-#include "FileWriter.hpp"
 #include "ZstdCompressor.hpp"
-#include "../clp/ffi/ir_stream/Serializer.hpp"
 
 using namespace clp_s::search;
+using clp::ffi::ir_stream::Serializer;
 using clp_s::cArchiveFormatDevelopmentVersionFlag;
 using clp_s::cEpochTimeMax;
 using clp_s::cEpochTimeMin;
 using clp_s::CommandLineArguments;
-using clp::ffi::ir_stream::Serializer;
 
 namespace {
 /**
@@ -155,10 +155,10 @@ auto unpack_and_serialize_msgpack_bytes(
 }
 
 template <typename T>
-auto run_serializer(clp_s::JsonToIRParserOption option, std::string path){
-    //std::cout << "Running Serializer\n";
+auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) {
+    // std::cout << "Running Serializer\n";
     auto result{Serializer<T>::create()};
-    if (result.has_error()){
+    if (result.has_error()) {
         SPDLOG_ERROR("Failed to create Serializer");
         return false;
     }
@@ -167,71 +167,67 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path){
     flush_and_clear_serializer_buffer(serializer, ir_buf);
 
     std::ifstream inFile;
-    inFile.open(path, std::ifstream::in); 
-    //std::cout << "Opened Input file\n";
+    inFile.open(path, std::ifstream::in);
+    // std::cout << "Opened Input file\n";
 
     std::string outPath = "";
     int index = path.find_last_of('/');
-    if(std::string::npos == index){
+    if (std::string::npos == index) {
         outPath = option.irs_dir + "/" + path + ".ir";
-    }else{
-        outPath = option.irs_dir + "/" + path.substr(index, path.length()-index) + ".ir";
+    } else {
+        outPath = option.irs_dir + "/" + path.substr(index, path.length() - index) + ".ir";
     }
     clp_s::FileWriter outFile;
-    //std::cout << outPath << "\n";
+    // std::cout << outPath << "\n";
     outFile.open(outPath, clp_s::FileWriter::OpenMode::CreateForWriting);
     clp_s::ZstdCompressor zc;
     zc.open(outFile, option.compression_level);
 
-    std::string line; 
+    std::string line;
     size_t totalSize = 0;
 
-    if (inFile.is_open()) { 
-        while (getline(inFile, line)) { 
-                auto j_obj = nlohmann::json::parse(line);
-                unpack_and_serialize_msgpack_bytes(nlohmann::json::to_msgpack(j_obj), serializer);
-                flush_and_clear_serializer_buffer(serializer, ir_buf);
-                if(ir_buf.size() >= 1000000000){
-                        totalSize = totalSize + ir_buf.size();
-                        zc.write(reinterpret_cast<char*>(ir_buf.data()), ir_buf.size());
-                        zc.flush();
-                        ir_buf.clear();
-                }
+    if (inFile.is_open()) {
+        while (getline(inFile, line)) {
+            auto j_obj = nlohmann::json::parse(line);
+            unpack_and_serialize_msgpack_bytes(nlohmann::json::to_msgpack(j_obj), serializer);
+            flush_and_clear_serializer_buffer(serializer, ir_buf);
+            if (ir_buf.size() >= 1'000'000'000) {
+                totalSize = totalSize + ir_buf.size();
+                zc.write(reinterpret_cast<char*>(ir_buf.data()), ir_buf.size());
+                zc.flush();
+                ir_buf.clear();
+            }
         }
-        totalSize = totalSize + ir_buf.size(); 
+        totalSize = totalSize + ir_buf.size();
         zc.write(reinterpret_cast<char*>(ir_buf.data()), ir_buf.size());
         zc.flush();
         ir_buf.clear();
-        inFile.close(); 
+        inFile.close();
         zc.close();
         outFile.close();
-    } 
+    }
 
     return true;
 }
 
-bool generate_IR(CommandLineArguments const& command_line_arguments){
+bool generate_IR(CommandLineArguments const& command_line_arguments) {
     auto irs_dir = std::filesystem::path(command_line_arguments.get_archives_dir());
 
     // Create output directory in case it doesn't exist
     try {
         std::filesystem::create_directory(irs_dir.string());
     } catch (std::exception& e) {
-        SPDLOG_ERROR(
-                "Failed to create archives directory {} - {}",
-                irs_dir.string(),
-                e.what()
-        );
+        SPDLOG_ERROR("Failed to create archives directory {} - {}", irs_dir.string(), e.what());
         return false;
     }
     clp_s::JsonToIRParserOption option{};
     option.file_paths = command_line_arguments.get_file_paths();
     option.irs_dir = irs_dir.string();
-    //std::cout << "IRs dir: " << option.irs_dir << std::endl;
+    // std::cout << "IRs dir: " << option.irs_dir << std::endl;
     option.max_document_size = command_line_arguments.get_max_document_size();
     option.compression_level = command_line_arguments.get_compression_level();
     option.encoding = command_line_arguments.get_encoding_type();
-    //std::cout << "encoding type: " << static_cast<int>(option.encoding) << std::endl;
+    // std::cout << "encoding type: " << static_cast<int>(option.encoding) << std::endl;
 
     if (false == clp_s::FileUtils::validate_path(option.file_paths)) {
         exit(1);
@@ -244,14 +240,14 @@ bool generate_IR(CommandLineArguments const& command_line_arguments){
 
     for (auto& path : all_file_paths) {
         bool success;
-        if (option.encoding == 4){
-            //std::cout << "four byte\n";
+        if (option.encoding == 4) {
+            // std::cout << "four byte\n";
             success = run_serializer<int32_t>(option, path);
-        }else{
-            //std::cout << "eight byte\n";
+        } else {
+            // std::cout << "eight byte\n";
             success = run_serializer<int64_t>(option, path);
         }
-        if (false == success){
+        if (false == success) {
             return false;
         }
     }
@@ -277,13 +273,14 @@ bool IR_compress(CommandLineArguments const& command_line_arguments) {
     option.file_paths = command_line_arguments.get_file_paths();
     option.archives_dir = archives_dir.string();
     option.target_encoded_size = command_line_arguments.get_target_encoded_size();
-    //Do I need max_document_size()
+    // Do I need max_document_size()
     option.max_document_size = command_line_arguments.get_max_document_size();
     option.compression_level = command_line_arguments.get_compression_level();
     option.timestamp_key = command_line_arguments.get_timestamp_key();
     option.print_archive_stats = command_line_arguments.print_archive_stats();
-    //Is this an option they can make after IR or is that made before and has to be what is in the IR stream already
-    //option.structurize_arrays = command_line_arguments.get_structurize_arrays();
+    // Is this an option they can make after IR or is that made before and has to be what is in the
+    // IR stream already option.structurize_arrays =
+    // command_line_arguments.get_structurize_arrays();
 
     auto const& db_config_container = command_line_arguments.get_metadata_db_config();
     if (db_config_container.has_value()) {
@@ -462,7 +459,7 @@ int main(int argc, char const* argv[]) {
         if (false == generate_IR(command_line_arguments)) {
             return 1;
         }
-    }else if (CommandLineArguments::Command::Extract == command_line_arguments.get_command()) {
+    } else if (CommandLineArguments::Command::Extract == command_line_arguments.get_command()) {
         auto const& archives_dir = command_line_arguments.get_archives_dir();
         if (false == std::filesystem::is_directory(archives_dir)) {
             SPDLOG_ERROR("'{}' is not a directory.", archives_dir);

From 4dce1607d8dd3a4098153167061f85314815e1b4 Mon Sep 17 00:00:00 2001
From: AVMatthews <abigail.v.matthews@gmail.com>
Date: Sat, 21 Sep 2024 07:16:01 -0400
Subject: [PATCH 04/15] small linting fix

---
 components/core/src/clp_s/CommandLineArguments.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp
index 0435cd3f6..530dad3fb 100644
--- a/components/core/src/clp_s/CommandLineArguments.cpp
+++ b/components/core/src/clp_s/CommandLineArguments.cpp
@@ -352,8 +352,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
 
                 std::cerr << "Examples:" << std::endl;
                 std::cerr << "  # Compress file1.ir and dir1 into archives-dir" << std::endl;
-                std::cerr << "  " << m_program_name << " i archives-dir file1.ir dir1"
-                          << std::endl;
+                std::cerr << "  " << m_program_name << " i archives-dir file1.ir dir1" << std::endl;
 
                 po::options_description visible_options;
                 visible_options.add(general_options);
@@ -471,8 +470,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
 
                 std::cerr << "Examples:" << std::endl;
                 std::cerr << "  # Parse file1.json and dir1 into irs-dir" << std::endl;
-                std::cerr << "  " << m_program_name << " r irs-dir file1.json dir1"
-                          << std::endl;
+                std::cerr << "  " << m_program_name << " r irs-dir file1.json dir1" << std::endl;
 
                 po::options_description visible_options;
                 visible_options.add(general_options);
@@ -1042,8 +1040,7 @@ void CommandLineArguments::print_search_usage() const {
 }
 
 void CommandLineArguments::print_json_to_IR_usage() const {
-    std::cerr << "Usage: " << m_program_name << " r [OPTIONS] IRS_DIR [FILE/DIR ...]"
-              << std::endl;
+    std::cerr << "Usage: " << m_program_name << " r [OPTIONS] IRS_DIR [FILE/DIR ...]" << std::endl;
 }
 
 void CommandLineArguments::print_IR_compression_usage() const {

From 424cb857255edf28db7a57a80c144c8b9fa11ad8 Mon Sep 17 00:00:00 2001
From: AVMatthews <abigail.v.matthews@gmail.com>
Date: Mon, 23 Sep 2024 13:40:38 -0400
Subject: [PATCH 05/15] modified IR file I/O to use Decompressor

---
 components/core/src/clp_s/CMakeLists.txt |  6 +++++
 components/core/src/clp_s/JsonParser.cpp | 33 +++++-------------------
 components/core/src/clp_s/JsonParser.hpp |  1 +
 3 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt
index 7b684632c..81ea74f70 100644
--- a/components/core/src/clp_s/CMakeLists.txt
+++ b/components/core/src/clp_s/CMakeLists.txt
@@ -63,6 +63,12 @@ set(
         ../clp/ffi/utils.cpp
         ../clp/utf8_utils.hpp
         ../clp/utf8_utils.cpp
+        ../clp/streaming_compression/zstd/Decompressor.hpp
+        ../clp/streaming_compression/zstd/Decompressor.cpp
+        ../clp/ReadOnlyMemoryMappedFile.hpp
+        ../clp/ReadOnlyMemoryMappedFile.cpp
+        ../clp/FileDescriptor.hpp
+        ../clp/FileDescriptor.cpp
 )
 
 set(
diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp
index 6647d3d7a..95e788e6d 100644
--- a/components/core/src/clp_s/JsonParser.cpp
+++ b/components/core/src/clp_s/JsonParser.cpp
@@ -760,38 +760,15 @@ bool JsonParser::parse_from_IR() {
     m_archive_writer->add_node(-1, NodeType::Unknown, "root");
 
     for (auto& file_path : m_file_paths) {
-        std::cout << file_path << std::endl;
-        std::vector<char> ir_buf;
-        char temp_ir_buf[10'000];
-        FileReader infile;
-        infile.open(file_path);
-        if (false == infile.is_open()) {
-            m_archive_writer->close();
-            return false;
-        }
         int fsize = std::filesystem::file_size(file_path);
         if (0 == fsize) {
             m_archive_writer->close();
             return false;
         }
-        ZstdDecompressor zd;
-        zd.open(infile, fsize);
-        size_t num_bytes_read = 0;
-        do {
-            num_bytes_read = 0;
-            zd.try_read(temp_ir_buf, 10'000, num_bytes_read);
-            if (num_bytes_read != 0) {
-                ir_buf.insert(ir_buf.end(), temp_ir_buf, temp_ir_buf + num_bytes_read);
-            }
-        } while (num_bytes_read == 10'000);
-        zd.close();
-        infile.close();
+        clp::streaming_compression::zstd::Decompressor zd;
+        zd.open(file_path);
 
-        BufferReader reader{size_checked_pointer_cast<char>(ir_buf.data()), ir_buf.size()};
-        char const* p;
-        size_t p_size;
-
-        auto deserializer_result = Deserializer::create(reader);
+        auto deserializer_result = Deserializer::create(zd);
         if (deserializer_result.has_error()) {
             m_archive_writer->close();
             return false;
@@ -800,7 +777,7 @@ bool JsonParser::parse_from_IR() {
 
         m_num_messages = 0;
         do {
-            auto const kv_log_event_result = deserializer.deserialize_to_next_log_event(reader);
+            auto const kv_log_event_result = deserializer.deserialize_to_next_log_event(zd);
 
             if (kv_log_event_result.has_error()) {
                 if (kv_log_event_result.error() == std::errc::no_message_available
@@ -828,6 +805,8 @@ bool JsonParser::parse_from_IR() {
 
         } while (1);
         id_conversion_cache.clear();
+        zd.close();
+        //infile.close();
     }
     return true;
 }
diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp
index 1e1aa96fa..62c54df8d 100644
--- a/components/core/src/clp_s/JsonParser.hpp
+++ b/components/core/src/clp_s/JsonParser.hpp
@@ -31,6 +31,7 @@
 #include "TimestampDictionaryWriter.hpp"
 #include "Utils.hpp"
 #include "ZstdCompressor.hpp"
+#include "../clp/streaming_compression/zstd/Decompressor.hpp"
 
 using clp::BufferReader;
 using clp::ffi::ir_stream::Deserializer;

From 1899c619bd1ceb8e1e9476c142bdd4d356fea2af Mon Sep 17 00:00:00 2001
From: Abigail Matthews <matthea@clarkson.edu>
Date: Tue, 24 Sep 2024 16:20:14 -0400
Subject: [PATCH 06/15] updates from first round of review, and linting

---
 components/core/CMakeLists.txt           |   6 -
 components/core/src/clp_s/CMakeLists.txt |  80 ++++-----
 components/core/src/clp_s/JsonParser.cpp | 199 ++++++++---------------
 components/core/src/clp_s/JsonParser.hpp |   2 +-
 components/core/src/clp_s/clp-s.cpp      |  41 ++---
 5 files changed, 123 insertions(+), 205 deletions(-)

diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt
index 9ff3527ae..a7c6d5a90 100644
--- a/components/core/CMakeLists.txt
+++ b/components/core/CMakeLists.txt
@@ -278,12 +278,6 @@ set(SOURCE_FILES_clp_s_unitTest
     src/clp_s/TimestampPattern.hpp
     src/clp_s/Utils.cpp
     src/clp_s/Utils.hpp
-    src/clp_s/ZstdCompressor.hpp
-    src/clp_s/ZstdCompressor.cpp
-    src/clp_s/ZstdDecompressor.hpp
-    src/clp_s/ZstdDecompressor.cpp
-    src/clp_s/FileWriter.cpp
-    src/clp_s/FileReader.cpp
 )
 
 set(SOURCE_FILES_unitTest
diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt
index 81ea74f70..85a93afbb 100644
--- a/components/core/src/clp_s/CMakeLists.txt
+++ b/components/core/src/clp_s/CMakeLists.txt
@@ -2,12 +2,13 @@ add_subdirectory(search/kql)
 
 set(
         CLP_SOURCES
-        ../clp/cli_utils.cpp
-        ../clp/cli_utils.hpp
-        ../clp/database_utils.cpp
-        ../clp/database_utils.hpp
+        ../clp/BufferReader.cpp
+        ../clp/BufferReader.hpp
         ../clp/Defs.h
         ../clp/ErrorCode.hpp
+        ../clp/ErrorCode.hpp
+        ../clp/FileDescriptor.cpp
+        ../clp/FileDescriptor.hpp
         ../clp/GlobalMetadataDB.hpp
         ../clp/GlobalMetadataDBConfig.cpp
         ../clp/GlobalMetadataDBConfig.hpp
@@ -19,56 +20,55 @@ set(
         ../clp/MySQLParamBindings.hpp
         ../clp/MySQLPreparedStatement.cpp
         ../clp/MySQLPreparedStatement.hpp
-        ../clp/networking/socket_utils.cpp
-        ../clp/networking/socket_utils.hpp
+        ../clp/ReadOnlyMemoryMappedFile.cpp
+        ../clp/ReadOnlyMemoryMappedFile.hpp
+        ../clp/ReaderInterface.cpp
         ../clp/ReaderInterface.cpp
         ../clp/ReaderInterface.hpp
-        ../clp/streaming_archive/ArchiveMetadata.cpp
-        ../clp/streaming_archive/ArchiveMetadata.hpp
+        ../clp/ReaderInterface.hpp
         ../clp/TraceableException.hpp
         ../clp/WriterInterface.cpp
         ../clp/WriterInterface.hpp
-        ../clp/ffi/ir_stream/Deserializer.hpp
-        ../clp/ffi/ir_stream/Deserializer.cpp
-        ../clp/ffi/ir_stream/Serializer.hpp
-        ../clp/ffi/ir_stream/Serializer.cpp
-        ../clp/ffi/ir_stream/utils.hpp
-        ../clp/ffi/ir_stream/utils.cpp
-        ../clp/BufferReader.hpp
-        ../clp/BufferReader.cpp
-        ../clp/type_utils.hpp
-        ../clp/ffi/Value.hpp
-        ../clp/ErrorCode.hpp
-        ../clp/ir/EncodedTextAst.hpp
-        ../clp/ir/EncodedTextAst.cpp
-        ../clp/ir/types.hpp
-        ../clp/ReaderInterface.hpp
-        ../clp/ReaderInterface.cpp
-        ../clp/time_types.hpp
-        ../clp/type_utils.hpp
-        ../clp/ffi/KeyValuePairLogEvent.hpp
+        ../clp/cli_utils.cpp
+        ../clp/cli_utils.hpp
+        ../clp/database_utils.cpp
+        ../clp/database_utils.hpp
         ../clp/ffi/KeyValuePairLogEvent.cpp
-        ../clp/ffi/SchemaTree.hpp
+        ../clp/ffi/KeyValuePairLogEvent.hpp
         ../clp/ffi/SchemaTree.cpp
+        ../clp/ffi/SchemaTree.hpp
         ../clp/ffi/SchemaTreeNode.hpp
         ../clp/ffi/Value.hpp
-        ../clp/ffi/ir_stream/decoding_methods.hpp
+        ../clp/ffi/Value.hpp
+        ../clp/ffi/ir_stream/Deserializer.cpp
+        ../clp/ffi/ir_stream/Deserializer.hpp
+        ../clp/ffi/ir_stream/Serializer.cpp
+        ../clp/ffi/ir_stream/Serializer.hpp
         ../clp/ffi/ir_stream/decoding_methods.cpp
-        ../clp/ffi/ir_stream/encoding_methods.hpp
+        ../clp/ffi/ir_stream/decoding_methods.hpp
         ../clp/ffi/ir_stream/encoding_methods.cpp
-        ../clp/ir/parsing.hpp
-        ../clp/ir/parsing.cpp
+        ../clp/ffi/ir_stream/encoding_methods.hpp
         ../clp/ffi/ir_stream/protocol_constants.hpp
-        ../clp/ffi/utils.hpp
+        ../clp/ffi/ir_stream/utils.cpp
+        ../clp/ffi/ir_stream/utils.hpp
         ../clp/ffi/utils.cpp
-        ../clp/utf8_utils.hpp
-        ../clp/utf8_utils.cpp
-        ../clp/streaming_compression/zstd/Decompressor.hpp
+        ../clp/ffi/utils.hpp
+        ../clp/ir/EncodedTextAst.cpp
+        ../clp/ir/EncodedTextAst.hpp
+        ../clp/ir/parsing.cpp
+        ../clp/ir/parsing.hpp
+        ../clp/ir/types.hpp
+        ../clp/networking/socket_utils.cpp
+        ../clp/networking/socket_utils.hpp
+        ../clp/streaming_archive/ArchiveMetadata.cpp
+        ../clp/streaming_archive/ArchiveMetadata.hpp
         ../clp/streaming_compression/zstd/Decompressor.cpp
-        ../clp/ReadOnlyMemoryMappedFile.hpp
-        ../clp/ReadOnlyMemoryMappedFile.cpp
-        ../clp/FileDescriptor.hpp
-        ../clp/FileDescriptor.cpp
+        ../clp/streaming_compression/zstd/Decompressor.hpp
+        ../clp/time_types.hpp
+        ../clp/type_utils.hpp
+        ../clp/type_utils.hpp
+        ../clp/utf8_utils.cpp
+        ../clp/utf8_utils.hpp
 )
 
 set(
diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp
index 95e788e6d..ffa0d840b 100644
--- a/components/core/src/clp_s/JsonParser.cpp
+++ b/components/core/src/clp_s/JsonParser.cpp
@@ -1,6 +1,5 @@
 #include "JsonParser.hpp"
 
-#include <fstream>
 #include <iostream>
 #include <stack>
 
@@ -522,170 +521,109 @@ bool JsonParser::parse() {
 }
 
 NodeType get_archive_node_type(
-        clp::ffi::SchemaTreeNode const& node,
-        std::pair<clp::ffi::SchemaTreeNode::id_t, std::optional<clp::ffi::Value>> p
+        clp::ffi::SchemaTreeNode::Type ir_node_type,
+        bool node_has_value,
+        std::optional<clp::ffi::Value> const& node_value
 ) {
-    auto const node_type = node.get_type();
     // figure out what type the node is in archive node type
-    NodeType archiveNodeType;
-    switch (node_type) {
+    NodeType archive_node_type;
+    switch (ir_node_type) {
         case clp::ffi::SchemaTreeNode::Type::Int:
-            archiveNodeType = NodeType::Integer;
+            archive_node_type = NodeType::Integer;
             break;
         case clp::ffi::SchemaTreeNode::Type::Float:
-            archiveNodeType = NodeType::Float;
+            archive_node_type = NodeType::Float;
             break;
         case clp::ffi::SchemaTreeNode::Type::Bool:
-            archiveNodeType = NodeType::Boolean;
+            archive_node_type = NodeType::Boolean;
             break;
         case clp::ffi::SchemaTreeNode::Type::UnstructuredArray:
-            archiveNodeType = NodeType::UnstructuredArray;
+            archive_node_type = NodeType::UnstructuredArray;
             break;
         case clp::ffi::SchemaTreeNode::Type::Str:
-            // std::cerr << "In str\n";
-            if (p.second.value().is<std::string>()) {
-                // maybe special case for date string
-                archiveNodeType = NodeType::VarString;
+            if (node_value->is<std::string>()) {
+                archive_node_type = NodeType::VarString;
             } else {
-                archiveNodeType = NodeType::ClpString;
+                archive_node_type = NodeType::ClpString;
             }
             break;
         case clp::ffi::SchemaTreeNode::Type::Obj:
-            // std::cerr << "In obj\n";
-            if (p.second.has_value()) {
-                if (p.second.value().is_null()) {
-                    // std::cout << "Found Null\n";
-                    archiveNodeType = NodeType::NullValue;
+            if (node_has_value) {
+                if (node_value->is_null()) {
+                    archive_node_type = NodeType::NullValue;
                 } else {
-                    archiveNodeType = NodeType::Object;
+                    archive_node_type = NodeType::Object;
                 }
             } else {
-                archiveNodeType = NodeType::Object;
+                archive_node_type = NodeType::Object;
             }
             break;
         default:
-            archiveNodeType = NodeType::Unknown;
+            archive_node_type = NodeType::Unknown;
             break;
     }
-    return archiveNodeType;
+    return archive_node_type;
 }
 
 //
 int JsonParser::get_archive_node_id(
-        std::map<std::tuple<int, NodeType>, int>& cache,
-        int irNodeID,
-        NodeType archiveNodeType,
-        clp::ffi::SchemaTree const& irTree
+        std::map<std::tuple<int32_t, NodeType>, int32_t>& ir_node_to_archive_node_map,
+        int ir_node_id,
+        NodeType archive_node_type,
+        clp::ffi::SchemaTree const& ir_tree
 ) {
-    std::tuple<int, NodeType> key(irNodeID, archiveNodeType);
-    if (cache.find(key) != cache.end()) {
-        return cache[key];
+    auto key = std::make_tuple(ir_node_id, archive_node_type);
+    auto map_location = ir_node_to_archive_node_map.find(key);
+    if (ir_node_to_archive_node_map.end() != map_location) {
+        return map_location->second;
     }
-    auto& currNode = irTree.get_node(irNodeID);
-    int parent_node_id;
-    // Found the root
-    if (currNode.get_parent_id() == 0) {
-        parent_node_id = 0;
-    } else {
-        parent_node_id
-                = get_archive_node_id(cache, currNode.get_parent_id(), NodeType::Object, irTree);
+    auto& curr_node = ir_tree.get_node(ir_node_id);
+    int32_t parent_node_id{0};
+    if (0 != curr_node.get_parent_id()) {
+        parent_node_id = get_archive_node_id(
+                ir_node_to_archive_node_map,
+                curr_node.get_parent_id(),
+                NodeType::Object,
+                ir_tree
+        );
     }
-    std::string nodeKey
-            = clp::ffi::validate_and_escape_utf8_string(currNode.get_key_name()).value();
-    int curr_node_archive_id = m_archive_writer->add_node(parent_node_id, archiveNodeType, nodeKey);
-    cache[key] = curr_node_archive_id;
-    return curr_node_archive_id;
-}
-
-void print_kv_log_event(KeyValuePairLogEvent const& kv) {
-    auto const num_kv_pairs = kv.get_node_id_value_pairs().size();
-    std::cout << "number of kv pairs: " << num_kv_pairs << std::endl;
-    auto const& tree = kv.get_schema_tree();
-    for (auto const& pair : kv.get_node_id_value_pairs()) {
-        auto const& tree_node = tree.get_node(pair.first);
-        auto const node_type = tree_node.get_type();
-        switch (node_type) {
-            case clp::ffi::SchemaTreeNode::Type::Int:
-                std::cout << "Int" << std::endl;
-                break;
-            case clp::ffi::SchemaTreeNode::Type::Float:
-                std::cout << "Float" << std::endl;
-                break;
-            case clp::ffi::SchemaTreeNode::Type::Bool:
-                std::cout << "Bool" << std::endl;
-                break;
-            case clp::ffi::SchemaTreeNode::Type::Str:
-                std::cout << "Str" << std::endl;
-                break;
-            case clp::ffi::SchemaTreeNode::Type::UnstructuredArray:
-                std::cout << "UArray" << std::endl;
-                break;
-            case clp::ffi::SchemaTreeNode::Type::Obj:
-                std::cout << "Obj" << std::endl;
-                break;
-            default:
-                std::cout << "???" << std::endl;
-                break;
-        }
-
-        if (!pair.second.has_value()) {
-            std::cout << "{??:\t" << pair.first << ": Node doesn't have Value ... EMPTY OBJ}\n";
-            continue;
-        }
-        if (pair.second.value().is<clp::ffi::value_int_t>()) {
-            std::cout << "{INT:\t" << pair.first << ": "
-                      << pair.second.value().get_immutable_view<clp::ffi::value_int_t>() << "}\n";
-        } else if (pair.second.value().is<clp::ffi::value_float_t>()) {
-            std::cout << "{FLOAT:\t" << pair.first << ": "
-                      << pair.second.value().get_immutable_view<clp::ffi::value_float_t>() << "}\n";
-        } else if (pair.second.value().is<clp::ffi::value_bool_t>()) {
-            std::cout << "{BOOL:\t" << pair.first << ": "
-                      << pair.second.value().get_immutable_view<clp::ffi::value_bool_t>() << "}\n";
-        } else if (pair.second.value().is<std::string>()) {
-            std::cout << "{STRING:\t" << pair.first << ": "
-                      << pair.second.value().get_immutable_view<std::string>() << "}\n";
-        } else if (pair.second.value().is<clp::ir::EightByteEncodedTextAst>()) {
-            std::cout << "{EIGHTByte:\t" << pair.first << ": \n";
-            auto decoded = pair.second.value()
-                                   .get_immutable_view<clp::ir::EightByteEncodedTextAst>()
-                                   .decode_and_unparse();
-            if (std::nullopt != decoded) {
-                std::cout << "\t Decoded & Unparsed: " << decoded.value() << std::endl;
-            } else {
-                std::cout << "\tNULL\n";
-            }
-            std::cout << "}\n";
-        } else if (pair.second.value().is<clp::ir::FourByteEncodedTextAst>()) {
-            std::cout << "{FOURByte:\t" << pair.first << ": \n";
-            auto decoded = pair.second.value()
-                                   .get_immutable_view<clp::ir::FourByteEncodedTextAst>()
-                                   .decode_and_unparse();
-            if (std::nullopt != decoded) {
-                std::cout << "\tDecoded & Unparsed: " << decoded.value() << std::endl;
-            } else {
-                std::cout << "\tNULL\n";
-            }
-            std::cout << "}\n";
-        } else {
-            std::cout << "Unknown Type:\t" << pair.first << "\n";
-        }
+    auto validated_escaped_key
+            = clp::ffi::validate_and_escape_utf8_string(curr_node.get_key_name());
+    std::string node_key = "";
+    if (validated_escaped_key.has_value()) {
+        node_key = validated_escaped_key.value();
     }
-    std::cout << "after for loop\n\n\n";
+    int curr_node_archive_id
+            = m_archive_writer->add_node(parent_node_id, archive_node_type, node_key);
+    ir_node_to_archive_node_map.emplace(std::move(key), curr_node_archive_id);
+    return curr_node_archive_id;
 }
 
 void JsonParser::parse_kv_log_event(
         KeyValuePairLogEvent const& kv,
-        std::map<std::tuple<int, NodeType>, int>& cache
+        std::map<std::tuple<int32_t, NodeType>, int32_t>& ir_node_to_archive_node_map
 ) {
-    auto const num_kv_pairs = kv.get_node_id_value_pairs().size();
     clp::ffi::SchemaTree const& tree = kv.get_schema_tree();
 
     for (auto const& pair : kv.get_node_id_value_pairs()) {
         clp::ffi::SchemaTreeNode const& tree_node = tree.get_node(pair.first);
-        NodeType archiveNodeType = get_archive_node_type(tree_node, pair);
-        int node_id = get_archive_node_id(cache, pair.first, archiveNodeType, tree);
+        clp::ffi::SchemaTreeNode::Type ir_node_type = tree_node.get_type();
+        bool node_has_value = pair.second.has_value();
+        NodeType archive_node_type = NodeType::Unknown;
+        if (node_has_value) {
+            archive_node_type
+                    = get_archive_node_type(ir_node_type, node_has_value, pair.second.value());
+        } else {
+            archive_node_type = get_archive_node_type(ir_node_type, node_has_value, {});
+        }
+        int node_id = get_archive_node_id(
+                ir_node_to_archive_node_map,
+                pair.first,
+                archive_node_type,
+                tree
+        );
 
-        switch (archiveNodeType) {
+        switch (archive_node_type) {
             case NodeType::Integer: {
                 int64_t i64_value = pair.second.value().get_immutable_view<clp::ffi::value_int_t>();
                 m_current_parsed_message.add_value(node_id, i64_value);
@@ -756,7 +694,7 @@ void JsonParser::parse_kv_log_event(
 }
 
 bool JsonParser::parse_from_IR() {
-    std::map<std::tuple<int, NodeType>, int> id_conversion_cache;
+    std::map<std::tuple<int32_t, NodeType>, int32_t> ir_node_to_archive_node_map;
     m_archive_writer->add_node(-1, NodeType::Unknown, "root");
 
     for (auto& file_path : m_file_paths) {
@@ -790,23 +728,20 @@ bool JsonParser::parse_from_IR() {
             m_current_schema.clear();
             auto const& kv_log_event = kv_log_event_result.value();
 
-            // print_kv_log_event(kv_log_event);
-            parse_kv_log_event(kv_log_event, id_conversion_cache);
+            parse_kv_log_event(kv_log_event, ir_node_to_archive_node_map);
 
             m_num_messages++;
             if (m_archive_writer->get_data_size() >= m_target_encoded_size) {
-                std::cerr << "Splitting Archive\n\n";
-                id_conversion_cache.clear();
+                ir_node_to_archive_node_map.clear();
                 m_archive_writer->add_node(-1, NodeType::Unknown, "root");
                 split_archive();
             }
 
             m_current_parsed_message.clear();
 
-        } while (1);
-        id_conversion_cache.clear();
+        } while (true);
+        ir_node_to_archive_node_map.clear();
         zd.close();
-        //infile.close();
     }
     return true;
 }
diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp
index 62c54df8d..46538c176 100644
--- a/components/core/src/clp_s/JsonParser.hpp
+++ b/components/core/src/clp_s/JsonParser.hpp
@@ -18,6 +18,7 @@
 #include "../clp/ffi/Value.hpp"
 #include "../clp/GlobalMySQLMetadataDB.hpp"
 #include "../clp/ir/types.hpp"
+#include "../clp/streaming_compression/zstd/Decompressor.hpp"
 #include "../clp/type_utils.hpp"
 #include "ArchiveWriter.hpp"
 #include "DictionaryWriter.hpp"
@@ -31,7 +32,6 @@
 #include "TimestampDictionaryWriter.hpp"
 #include "Utils.hpp"
 #include "ZstdCompressor.hpp"
-#include "../clp/streaming_compression/zstd/Decompressor.hpp"
 
 using clp::BufferReader;
 using clp::ffi::ir_stream::Deserializer;
diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp
index 911e92c1d..813f314df 100644
--- a/components/core/src/clp_s/clp-s.cpp
+++ b/components/core/src/clp_s/clp-s.cpp
@@ -156,7 +156,6 @@ auto unpack_and_serialize_msgpack_bytes(
 
 template <typename T>
 auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) {
-    // std::cout << "Running Serializer\n";
     auto result{Serializer<T>::create()};
     if (result.has_error()) {
         SPDLOG_ERROR("Failed to create Serializer");
@@ -166,45 +165,43 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) {
     std::vector<int8_t> ir_buf;
     flush_and_clear_serializer_buffer(serializer, ir_buf);
 
-    std::ifstream inFile;
-    inFile.open(path, std::ifstream::in);
-    // std::cout << "Opened Input file\n";
+    std::ifstream in_file;
+    in_file.open(path, std::ifstream::in);
 
-    std::string outPath = "";
+    std::string out_path = "";
     int index = path.find_last_of('/');
     if (std::string::npos == index) {
-        outPath = option.irs_dir + "/" + path + ".ir";
+        out_path = option.irs_dir + "/" + path + ".ir";
     } else {
-        outPath = option.irs_dir + "/" + path.substr(index, path.length() - index) + ".ir";
+        out_path = option.irs_dir + "/" + path.substr(index, path.length() - index) + ".ir";
     }
-    clp_s::FileWriter outFile;
-    // std::cout << outPath << "\n";
-    outFile.open(outPath, clp_s::FileWriter::OpenMode::CreateForWriting);
+    clp_s::FileWriter out_file;
+    out_file.open(out_path, clp_s::FileWriter::OpenMode::CreateForWriting);
     clp_s::ZstdCompressor zc;
-    zc.open(outFile, option.compression_level);
+    zc.open(out_file, option.compression_level);
 
     std::string line;
-    size_t totalSize = 0;
+    size_t total_size = 0;
 
-    if (inFile.is_open()) {
-        while (getline(inFile, line)) {
+    if (in_file.is_open()) {
+        while (getline(in_file, line)) {
             auto j_obj = nlohmann::json::parse(line);
             unpack_and_serialize_msgpack_bytes(nlohmann::json::to_msgpack(j_obj), serializer);
             flush_and_clear_serializer_buffer(serializer, ir_buf);
             if (ir_buf.size() >= 1'000'000'000) {
-                totalSize = totalSize + ir_buf.size();
+                total_size = total_size + ir_buf.size();
                 zc.write(reinterpret_cast<char*>(ir_buf.data()), ir_buf.size());
                 zc.flush();
                 ir_buf.clear();
             }
         }
-        totalSize = totalSize + ir_buf.size();
+        total_size = total_size + ir_buf.size();
         zc.write(reinterpret_cast<char*>(ir_buf.data()), ir_buf.size());
         zc.flush();
         ir_buf.clear();
-        inFile.close();
+        in_file.close();
         zc.close();
-        outFile.close();
+        out_file.close();
     }
 
     return true;
@@ -223,11 +220,9 @@ bool generate_IR(CommandLineArguments const& command_line_arguments) {
     clp_s::JsonToIRParserOption option{};
     option.file_paths = command_line_arguments.get_file_paths();
     option.irs_dir = irs_dir.string();
-    // std::cout << "IRs dir: " << option.irs_dir << std::endl;
     option.max_document_size = command_line_arguments.get_max_document_size();
     option.compression_level = command_line_arguments.get_compression_level();
     option.encoding = command_line_arguments.get_encoding_type();
-    // std::cout << "encoding type: " << static_cast<int>(option.encoding) << std::endl;
 
     if (false == clp_s::FileUtils::validate_path(option.file_paths)) {
         exit(1);
@@ -241,10 +236,8 @@ bool generate_IR(CommandLineArguments const& command_line_arguments) {
     for (auto& path : all_file_paths) {
         bool success;
         if (option.encoding == 4) {
-            // std::cout << "four byte\n";
             success = run_serializer<int32_t>(option, path);
         } else {
-            // std::cout << "eight byte\n";
             success = run_serializer<int64_t>(option, path);
         }
         if (false == success) {
@@ -273,14 +266,10 @@ bool IR_compress(CommandLineArguments const& command_line_arguments) {
     option.file_paths = command_line_arguments.get_file_paths();
     option.archives_dir = archives_dir.string();
     option.target_encoded_size = command_line_arguments.get_target_encoded_size();
-    // Do I need max_document_size()
     option.max_document_size = command_line_arguments.get_max_document_size();
     option.compression_level = command_line_arguments.get_compression_level();
     option.timestamp_key = command_line_arguments.get_timestamp_key();
     option.print_archive_stats = command_line_arguments.print_archive_stats();
-    // Is this an option they can make after IR or is that made before and has to be what is in the
-    // IR stream already option.structurize_arrays =
-    // command_line_arguments.get_structurize_arrays();
 
     auto const& db_config_container = command_line_arguments.get_metadata_db_config();
     if (db_config_container.has_value()) {

From 2a1b9291f1683b6b0251e56c8b495c1cfc63c0d5 Mon Sep 17 00:00:00 2001
From: Abigail Matthews <matthea@clarkson.edu>
Date: Wed, 25 Sep 2024 17:27:22 -0400
Subject: [PATCH 07/15] remove implicit root add

---
 components/core/src/clp_s/JsonParser.cpp | 8 ++++----
 components/core/src/clp_s/JsonParser.hpp | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp
index ffa0d840b..2bc652ccc 100644
--- a/components/core/src/clp_s/JsonParser.cpp
+++ b/components/core/src/clp_s/JsonParser.cpp
@@ -578,8 +578,8 @@ int JsonParser::get_archive_node_id(
         return map_location->second;
     }
     auto& curr_node = ir_tree.get_node(ir_node_id);
-    int32_t parent_node_id{0};
-    if (0 != curr_node.get_parent_id()) {
+    int32_t parent_node_id{-1};
+    if (ir_node_id != curr_node.get_parent_id()) {
         parent_node_id = get_archive_node_id(
                 ir_node_to_archive_node_map,
                 curr_node.get_parent_id(),
@@ -695,7 +695,7 @@ void JsonParser::parse_kv_log_event(
 
 bool JsonParser::parse_from_IR() {
     std::map<std::tuple<int32_t, NodeType>, int32_t> ir_node_to_archive_node_map;
-    m_archive_writer->add_node(-1, NodeType::Unknown, "root");
+    //m_archive_writer->add_node(-1, NodeType::Unknown, "root");
 
     for (auto& file_path : m_file_paths) {
         int fsize = std::filesystem::file_size(file_path);
@@ -733,7 +733,7 @@ bool JsonParser::parse_from_IR() {
             m_num_messages++;
             if (m_archive_writer->get_data_size() >= m_target_encoded_size) {
                 ir_node_to_archive_node_map.clear();
-                m_archive_writer->add_node(-1, NodeType::Unknown, "root");
+                //m_archive_writer->add_node(-1, NodeType::Unknown, "root");
                 split_archive();
             }
 
diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp
index 46538c176..b48d1bbcd 100644
--- a/components/core/src/clp_s/JsonParser.hpp
+++ b/components/core/src/clp_s/JsonParser.hpp
@@ -86,7 +86,7 @@ class JsonParser {
 
     /**
      * Parses the Key Value IR Stream and stores the data in the archive.
-     * @return whether the IR Stream was parsed succesfully
+     * @return whether the IR Stream was parsed successfully
      */
     [[nodiscard]] bool parse_from_IR();
 

From f5005ecb53f0f758a121e805757ccd986ed48618 Mon Sep 17 00:00:00 2001
From: Abigail Matthews <matthea@clarkson.edu>
Date: Fri, 27 Sep 2024 19:24:26 -0400
Subject: [PATCH 08/15] Added more error handling and handled most of the
 remaining comments

---
 components/core/src/clp_s/CMakeLists.txt |  4 --
 components/core/src/clp_s/JsonParser.cpp | 83 +++++++++++++++---------
 components/core/src/clp_s/clp-s.cpp      | 58 ++++++++++++-----
 3 files changed, 93 insertions(+), 52 deletions(-)

diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt
index 85a93afbb..477736362 100644
--- a/components/core/src/clp_s/CMakeLists.txt
+++ b/components/core/src/clp_s/CMakeLists.txt
@@ -23,8 +23,6 @@ set(
         ../clp/ReadOnlyMemoryMappedFile.cpp
         ../clp/ReadOnlyMemoryMappedFile.hpp
         ../clp/ReaderInterface.cpp
-        ../clp/ReaderInterface.cpp
-        ../clp/ReaderInterface.hpp
         ../clp/ReaderInterface.hpp
         ../clp/TraceableException.hpp
         ../clp/WriterInterface.cpp
@@ -39,7 +37,6 @@ set(
         ../clp/ffi/SchemaTree.hpp
         ../clp/ffi/SchemaTreeNode.hpp
         ../clp/ffi/Value.hpp
-        ../clp/ffi/Value.hpp
         ../clp/ffi/ir_stream/Deserializer.cpp
         ../clp/ffi/ir_stream/Deserializer.hpp
         ../clp/ffi/ir_stream/Serializer.cpp
@@ -66,7 +63,6 @@ set(
         ../clp/streaming_compression/zstd/Decompressor.hpp
         ../clp/time_types.hpp
         ../clp/type_utils.hpp
-        ../clp/type_utils.hpp
         ../clp/utf8_utils.cpp
         ../clp/utf8_utils.hpp
 )
diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp
index 2bc652ccc..32e078f6f 100644
--- a/components/core/src/clp_s/JsonParser.cpp
+++ b/components/core/src/clp_s/JsonParser.cpp
@@ -541,7 +541,7 @@ NodeType get_archive_node_type(
             archive_node_type = NodeType::UnstructuredArray;
             break;
         case clp::ffi::SchemaTreeNode::Type::Str:
-            if (node_value->is<std::string>()) {
+            if (node_value && node_value->is<std::string>()) {
                 archive_node_type = NodeType::VarString;
             } else {
                 archive_node_type = NodeType::ClpString;
@@ -592,6 +592,8 @@ int JsonParser::get_archive_node_id(
     std::string node_key = "";
     if (validated_escaped_key.has_value()) {
         node_key = validated_escaped_key.value();
+    } else {
+        throw "Key is not utf8 compliant";
     }
     int curr_node_archive_id
             = m_archive_writer->add_node(parent_node_id, archive_node_type, node_key);
@@ -616,12 +618,17 @@ void JsonParser::parse_kv_log_event(
         } else {
             archive_node_type = get_archive_node_type(ir_node_type, node_has_value, {});
         }
-        int node_id = get_archive_node_id(
-                ir_node_to_archive_node_map,
-                pair.first,
-                archive_node_type,
-                tree
-        );
+        int node_id;
+        try {
+            node_id = get_archive_node_id(
+                    ir_node_to_archive_node_map,
+                    pair.first,
+                    archive_node_type,
+                    tree
+            );
+        } catch (...) {
+            throw;
+        }
 
         switch (archive_node_type) {
             case NodeType::Integer: {
@@ -637,30 +644,38 @@ void JsonParser::parse_kv_log_event(
                 m_current_parsed_message.add_value(node_id, b_value);
             } break;
             case NodeType::VarString: {
-                std::string str = clp::ffi::validate_and_escape_utf8_string(
-                                          pair.second.value().get_immutable_view<std::string>()
-                )
-                                          .value();
+                auto validated_escaped_string = clp::ffi::validate_and_escape_utf8_string(
+                        pair.second.value().get_immutable_view<std::string>()
+                );
+                std::string str = "";
+                if (validated_escaped_string.has_value()) {
+                    str = validated_escaped_string.value();
+                } else {
+                    throw "String is not utf8 compliant";
+                }
                 m_current_parsed_message.add_value(node_id, str);
             } break;
             case NodeType::ClpString: {
-                std::string encoded_str;
+                std::string encoded_str = "";
+                std::string decodedValue = "";
                 if (pair.second.value().is<clp::ir::EightByteEncodedTextAst>()) {
-                    std::string decodedValue
-                            = pair.second.value()
-                                      .get_immutable_view<clp::ir::EightByteEncodedTextAst>()
-                                      .decode_and_unparse()
-                                      .value();
-                    encoded_str = clp::ffi::validate_and_escape_utf8_string(decodedValue.c_str())
-                                          .value();
+                    decodedValue = pair.second.value()
+                                           .get_immutable_view<clp::ir::EightByteEncodedTextAst>()
+                                           .decode_and_unparse()
+                                           .value();
+
                 } else {
-                    std::string decodedValue
-                            = pair.second.value()
-                                      .get_immutable_view<clp::ir::FourByteEncodedTextAst>()
-                                      .decode_and_unparse()
-                                      .value();
-                    encoded_str = clp::ffi::validate_and_escape_utf8_string(decodedValue.c_str())
-                                          .value();
+                    decodedValue = pair.second.value()
+                                           .get_immutable_view<clp::ir::FourByteEncodedTextAst>()
+                                           .decode_and_unparse()
+                                           .value();
+                }
+                auto validated_escaped_encoded_string
+                        = clp::ffi::validate_and_escape_utf8_string(decodedValue.c_str());
+                if (validated_escaped_encoded_string.has_value()) {
+                    encoded_str = validated_escaped_encoded_string.value();
+                } else {
+                    throw "Encoded string is not utf8 compliant";
                 }
                 m_current_parsed_message.add_value(node_id, encoded_str);
             } break;
@@ -695,7 +710,6 @@ void JsonParser::parse_kv_log_event(
 
 bool JsonParser::parse_from_IR() {
     std::map<std::tuple<int32_t, NodeType>, int32_t> ir_node_to_archive_node_map;
-    //m_archive_writer->add_node(-1, NodeType::Unknown, "root");
 
     for (auto& file_path : m_file_paths) {
         int fsize = std::filesystem::file_size(file_path);
@@ -727,13 +741,20 @@ bool JsonParser::parse_from_IR() {
 
             m_current_schema.clear();
             auto const& kv_log_event = kv_log_event_result.value();
-
-            parse_kv_log_event(kv_log_event, ir_node_to_archive_node_map);
-
+            try {
+                parse_kv_log_event(kv_log_event, ir_node_to_archive_node_map);
+            } catch (std::string msg) {
+                SPDLOG_ERROR("ERROR: {}" + msg);
+                zd.close();
+                return false;
+            } catch (...) {
+                SPDLOG_ERROR("ERROR: Encountered error while parsing a kv log event");
+                zd.close();
+                return false;
+            }
             m_num_messages++;
             if (m_archive_writer->get_data_size() >= m_target_encoded_size) {
                 ir_node_to_archive_node_map.clear();
-                //m_archive_writer->add_node(-1, NodeType::Unknown, "root");
                 split_archive();
             }
 
diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp
index 813f314df..a4bf62825 100644
--- a/components/core/src/clp_s/clp-s.cpp
+++ b/components/core/src/clp_s/clp-s.cpp
@@ -143,15 +143,20 @@ auto unpack_and_serialize_msgpack_bytes(
         std::vector<uint8_t> const& msgpack_bytes,
         Serializer<encoded_variable_t>& serializer
 ) -> bool {
-    auto const msgpack_obj_handle{msgpack::unpack(
-            clp::size_checked_pointer_cast<char const>(msgpack_bytes.data()),
-            msgpack_bytes.size()
-    )};
-    auto const msgpack_obj{msgpack_obj_handle.get()};
-    if (msgpack::type::MAP != msgpack_obj.type) {
+    try {
+        auto const msgpack_obj_handle{msgpack::unpack(
+                clp::size_checked_pointer_cast<char const>(msgpack_bytes.data()),
+                msgpack_bytes.size()
+        )};
+        auto const msgpack_obj{msgpack_obj_handle.get()};
+        if (msgpack::type::MAP != msgpack_obj.type) {
+            return false;
+        }
+        return serializer.serialize_msgpack_map(msgpack_obj.via.map);
+    } catch (std::exception const& e) {
+        SPDLOG_ERROR("Failed to unpack msgpack bytes: {}", e.what());
         return false;
     }
-    return serializer.serialize_msgpack_map(msgpack_obj.via.map);
 }
 
 template <typename T>
@@ -168,13 +173,17 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) {
     std::ifstream in_file;
     in_file.open(path, std::ifstream::in);
 
-    std::string out_path = "";
+    /* std::string out_path = "";
     int index = path.find_last_of('/');
     if (std::string::npos == index) {
         out_path = option.irs_dir + "/" + path + ".ir";
     } else {
         out_path = option.irs_dir + "/" + path.substr(index, path.length() - index) + ".ir";
-    }
+    } */
+    std::filesystem::path input_path{path};
+    std::string filename = input_path.filename().string();
+    std::string out_path = option.irs_dir + "/" + filename + ".ir";
+
     clp_s::FileWriter out_file;
     out_file.open(out_path, clp_s::FileWriter::OpenMode::CreateForWriting);
     clp_s::ZstdCompressor zc;
@@ -185,14 +194,29 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) {
 
     if (in_file.is_open()) {
         while (getline(in_file, line)) {
-            auto j_obj = nlohmann::json::parse(line);
-            unpack_and_serialize_msgpack_bytes(nlohmann::json::to_msgpack(j_obj), serializer);
-            flush_and_clear_serializer_buffer(serializer, ir_buf);
-            if (ir_buf.size() >= 1'000'000'000) {
-                total_size = total_size + ir_buf.size();
-                zc.write(reinterpret_cast<char*>(ir_buf.data()), ir_buf.size());
-                zc.flush();
-                ir_buf.clear();
+            try {
+                auto j_obj = nlohmann::json::parse(line);
+                if (!unpack_and_serialize_msgpack_bytes(
+                            nlohmann::json::to_msgpack(j_obj),
+                            serializer
+                    ))
+                {
+                    SPDLOG_ERROR("Failed to serialize msgpack bytes for line: {}", line);
+                    return false;
+                }
+                flush_and_clear_serializer_buffer(serializer, ir_buf);
+                if (ir_buf.size() >= 1'000'000'000) {
+                    total_size = total_size + ir_buf.size();
+                    zc.write(reinterpret_cast<char*>(ir_buf.data()), ir_buf.size());
+                    zc.flush();
+                    ir_buf.clear();
+                }
+            } catch (nlohmann::json::parse_error const& e) {
+                SPDLOG_ERROR("JSON parsing error: {}", e.what());
+                return false;
+            } catch (std::exception const& e) {
+                SPDLOG_ERROR("Error during serialization: {}", e.what());
+                return false;
             }
         }
         total_size = total_size + ir_buf.size();

From 5c2866881ae72c6e3911f26955ac71c654d223ed Mon Sep 17 00:00:00 2001
From: Abigail Matthews <matthea@clarkson.edu>
Date: Thu, 3 Oct 2024 10:31:23 -0400
Subject: [PATCH 09/15] modifications requested by coderabbitai

---
 components/core/src/clp_s/CMakeLists.txt |  1 -
 components/core/src/clp_s/clp-s.cpp      | 13 +++++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt
index 477736362..f697db731 100644
--- a/components/core/src/clp_s/CMakeLists.txt
+++ b/components/core/src/clp_s/CMakeLists.txt
@@ -6,7 +6,6 @@ set(
         ../clp/BufferReader.hpp
         ../clp/Defs.h
         ../clp/ErrorCode.hpp
-        ../clp/ErrorCode.hpp
         ../clp/FileDescriptor.cpp
         ../clp/FileDescriptor.hpp
         ../clp/GlobalMetadataDB.hpp
diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp
index a4bf62825..7bd1519a9 100644
--- a/components/core/src/clp_s/clp-s.cpp
+++ b/components/core/src/clp_s/clp-s.cpp
@@ -58,7 +58,7 @@ bool compress(CommandLineArguments const& command_line_arguments);
  * @param command_line_arguments
  * @return Whether compression was successful
  */
-bool IR_compress(CommandLineArguments const& command_line_arguments);
+bool ir_compress(CommandLineArguments const& command_line_arguments);
 
 /**
  * Decompresses the archive specified by the given JsonConstructorOption.
@@ -187,7 +187,12 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) {
     clp_s::FileWriter out_file;
     out_file.open(out_path, clp_s::FileWriter::OpenMode::CreateForWriting);
     clp_s::ZstdCompressor zc;
-    zc.open(out_file, option.compression_level);
+    try {
+        zc.open(out_file, option.compression_level);
+    } catch (clp_s::ZstdCompressor::OperationFailed& error) {
+        SPDLOG_ERROR("Failed to open ZSTDcompressor - {}", error.what());
+        return false;
+    }
 
     std::string line;
     size_t total_size = 0;
@@ -271,7 +276,7 @@ bool generate_IR(CommandLineArguments const& command_line_arguments) {
     return true;
 }
 
-bool IR_compress(CommandLineArguments const& command_line_arguments) {
+bool ir_compress(CommandLineArguments const& command_line_arguments) {
     auto archives_dir = std::filesystem::path(command_line_arguments.get_archives_dir());
 
     // Create output directory in case it doesn't exist
@@ -465,7 +470,7 @@ int main(int argc, char const* argv[]) {
             return 1;
         }
     } else if (CommandLineArguments::Command::IR_Compress == command_line_arguments.get_command()) {
-        if (false == IR_compress(command_line_arguments)) {
+        if (false == ir_compress(command_line_arguments)) {
             return 1;
         }
     } else if (CommandLineArguments::Command::Json_To_IR == command_line_arguments.get_command()) {

From 2facdd0f99c45193a6fa2f22bf91702d8b746bb8 Mon Sep 17 00:00:00 2001
From: Abigail Matthews <matthea@clarkson.edu>
Date: Thu, 3 Oct 2024 15:00:42 -0400
Subject: [PATCH 10/15] main merged into branch

---
 components/core/src/clp_s/CMakeLists.txt | 2 ++
 components/core/src/clp_s/clp-s.cpp      | 7 -------
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt
index f697db731..cca256489 100644
--- a/components/core/src/clp_s/CMakeLists.txt
+++ b/components/core/src/clp_s/CMakeLists.txt
@@ -44,6 +44,8 @@ set(
         ../clp/ffi/ir_stream/decoding_methods.hpp
         ../clp/ffi/ir_stream/encoding_methods.cpp
         ../clp/ffi/ir_stream/encoding_methods.hpp
+        ../clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp
+        ../clp/ffi/ir_stream/ir_unit_deserialization_methods.hpp
         ../clp/ffi/ir_stream/protocol_constants.hpp
         ../clp/ffi/ir_stream/utils.cpp
         ../clp/ffi/ir_stream/utils.hpp
diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp
index 7bd1519a9..49bd148bb 100644
--- a/components/core/src/clp_s/clp-s.cpp
+++ b/components/core/src/clp_s/clp-s.cpp
@@ -173,13 +173,6 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) {
     std::ifstream in_file;
     in_file.open(path, std::ifstream::in);
 
-    /* std::string out_path = "";
-    int index = path.find_last_of('/');
-    if (std::string::npos == index) {
-        out_path = option.irs_dir + "/" + path + ".ir";
-    } else {
-        out_path = option.irs_dir + "/" + path.substr(index, path.length() - index) + ".ir";
-    } */
     std::filesystem::path input_path{path};
     std::string filename = input_path.filename().string();
     std::string out_path = option.irs_dir + "/" + filename + ".ir";

From 303d4cbff313f1c52e8325fecc483318cb96da4c Mon Sep 17 00:00:00 2001
From: Abigail Matthews <matthea@clarkson.edu>
Date: Mon, 7 Oct 2024 15:28:57 -0400
Subject: [PATCH 11/15] map converted to unordered_map and various
 documentation, linting, and clang-tidying

---
 components/core/src/clp_s/CMakeLists.txt      | 44 +++++------
 .../core/src/clp_s/CommandLineArguments.cpp   | 28 +++----
 .../core/src/clp_s/CommandLineArguments.hpp   |  8 +-
 components/core/src/clp_s/JsonParser.cpp      | 78 ++++++++++++-------
 components/core/src/clp_s/JsonParser.hpp      | 66 ++++++++--------
 components/core/src/clp_s/clp-s.cpp           | 59 +++++++++++---
 6 files changed, 172 insertions(+), 111 deletions(-)

diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt
index cca256489..948a9d701 100644
--- a/components/core/src/clp_s/CMakeLists.txt
+++ b/components/core/src/clp_s/CMakeLists.txt
@@ -4,32 +4,12 @@ set(
         CLP_SOURCES
         ../clp/BufferReader.cpp
         ../clp/BufferReader.hpp
-        ../clp/Defs.h
-        ../clp/ErrorCode.hpp
-        ../clp/FileDescriptor.cpp
-        ../clp/FileDescriptor.hpp
-        ../clp/GlobalMetadataDB.hpp
-        ../clp/GlobalMetadataDBConfig.cpp
-        ../clp/GlobalMetadataDBConfig.hpp
-        ../clp/GlobalMySQLMetadataDB.cpp
-        ../clp/GlobalMySQLMetadataDB.hpp
-        ../clp/MySQLDB.cpp
-        ../clp/MySQLDB.hpp
-        ../clp/MySQLParamBindings.cpp
-        ../clp/MySQLParamBindings.hpp
-        ../clp/MySQLPreparedStatement.cpp
-        ../clp/MySQLPreparedStatement.hpp
-        ../clp/ReadOnlyMemoryMappedFile.cpp
-        ../clp/ReadOnlyMemoryMappedFile.hpp
-        ../clp/ReaderInterface.cpp
-        ../clp/ReaderInterface.hpp
-        ../clp/TraceableException.hpp
-        ../clp/WriterInterface.cpp
-        ../clp/WriterInterface.hpp
         ../clp/cli_utils.cpp
         ../clp/cli_utils.hpp
         ../clp/database_utils.cpp
         ../clp/database_utils.hpp
+        ../clp/Defs.h
+        ../clp/ErrorCode.hpp
         ../clp/ffi/KeyValuePairLogEvent.cpp
         ../clp/ffi/KeyValuePairLogEvent.hpp
         ../clp/ffi/SchemaTree.cpp
@@ -51,21 +31,41 @@ set(
         ../clp/ffi/ir_stream/utils.hpp
         ../clp/ffi/utils.cpp
         ../clp/ffi/utils.hpp
+        ../clp/FileDescriptor.cpp
+        ../clp/FileDescriptor.hpp
+        ../clp/GlobalMetadataDB.hpp
+        ../clp/GlobalMetadataDBConfig.cpp
+        ../clp/GlobalMetadataDBConfig.hpp
+        ../clp/GlobalMySQLMetadataDB.cpp
+        ../clp/GlobalMySQLMetadataDB.hpp
         ../clp/ir/EncodedTextAst.cpp
         ../clp/ir/EncodedTextAst.hpp
         ../clp/ir/parsing.cpp
         ../clp/ir/parsing.hpp
         ../clp/ir/types.hpp
+        ../clp/MySQLDB.cpp
+        ../clp/MySQLDB.hpp
+        ../clp/MySQLParamBindings.cpp
+        ../clp/MySQLParamBindings.hpp
+        ../clp/MySQLPreparedStatement.cpp
+        ../clp/MySQLPreparedStatement.hpp
         ../clp/networking/socket_utils.cpp
         ../clp/networking/socket_utils.hpp
+        ../clp/ReadOnlyMemoryMappedFile.cpp
+        ../clp/ReadOnlyMemoryMappedFile.hpp
+        ../clp/ReaderInterface.cpp
+        ../clp/ReaderInterface.hpp
         ../clp/streaming_archive/ArchiveMetadata.cpp
         ../clp/streaming_archive/ArchiveMetadata.hpp
         ../clp/streaming_compression/zstd/Decompressor.cpp
         ../clp/streaming_compression/zstd/Decompressor.hpp
         ../clp/time_types.hpp
+        ../clp/TraceableException.hpp
         ../clp/type_utils.hpp
         ../clp/utf8_utils.cpp
         ../clp/utf8_utils.hpp
+        ../clp/WriterInterface.cpp
+        ../clp/WriterInterface.hpp
 )
 
 set(
diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp
index 530dad3fb..e4e8a837f 100644
--- a/components/core/src/clp_s/CommandLineArguments.cpp
+++ b/components/core/src/clp_s/CommandLineArguments.cpp
@@ -129,8 +129,8 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
             case (char)Command::Compress:
             case (char)Command::Extract:
             case (char)Command::Search:
-            case (char)Command::Json_To_IR:
-            case (char)Command::IR_Compress:
+            case (char)Command::JsonToIr:
+            case (char)Command::IrCompress:
                 m_command = (Command)command_input;
                 break;
             default:
@@ -270,7 +270,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
 
                 m_metadata_db_config = std::move(metadata_db_config);
             }
-        } else if (Command::IR_Compress == m_command) {
+        } else if (Command::IrCompress == m_command) {
             po::options_description compression_positional_options;
             // clang-format off
              compression_positional_options.add_options()(
@@ -348,11 +348,11 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
             po::notify(parsed_command_line_options);
 
             if (parsed_command_line_options.count("help")) {
-                print_IR_compression_usage();
+                print_ir_compression_usage();
 
-                std::cerr << "Examples:" << std::endl;
-                std::cerr << "  # Compress file1.ir and dir1 into archives-dir" << std::endl;
-                std::cerr << "  " << m_program_name << " i archives-dir file1.ir dir1" << std::endl;
+                std::cerr << "Examples:\n";
+                std::cerr << "  # Compress file1.ir and dir1 into archives-dir\n";
+                std::cerr << "  " << m_program_name << " i archives-dir file1.ir dir1\n";
 
                 po::options_description visible_options;
                 visible_options.add(general_options);
@@ -398,7 +398,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
 
                 m_metadata_db_config = std::move(metadata_db_config);
             }
-        } else if ((char)Command::Json_To_IR == command_input) {
+        } else if ((char)Command::JsonToIr == command_input) {
             po::options_description compression_positional_options;
             // clang-format off
              compression_positional_options.add_options()(
@@ -466,11 +466,11 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
             po::notify(parsed_command_line_options);
 
             if (parsed_command_line_options.count("help")) {
-                print_json_to_IR_usage();
+                print_json_to_ir_usage();
 
-                std::cerr << "Examples:" << std::endl;
-                std::cerr << "  # Parse file1.json and dir1 into irs-dir" << std::endl;
-                std::cerr << "  " << m_program_name << " r irs-dir file1.json dir1" << std::endl;
+                std::cerr << "Examples:\n";
+                std::cerr << "  # Parse file1.json and dir1 into irs-dir\n";
+                std::cerr << "  " << m_program_name << " r irs-dir file1.json dir1\n";
 
                 po::options_description visible_options;
                 visible_options.add(general_options);
@@ -1039,11 +1039,11 @@ void CommandLineArguments::print_search_usage() const {
               << std::endl;
 }
 
-void CommandLineArguments::print_json_to_IR_usage() const {
+void CommandLineArguments::print_json_to_ir_usage() const {
     std::cerr << "Usage: " << m_program_name << " r [OPTIONS] IRS_DIR [FILE/DIR ...]" << std::endl;
 }
 
-void CommandLineArguments::print_IR_compression_usage() const {
+void CommandLineArguments::print_ir_compression_usage() const {
     std::cerr << "Usage: " << m_program_name << " i [OPTIONS] ARCHIVES_DIR [FILE/DIR ...]"
               << std::endl;
 }
diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp
index dedd3bd59..48cdb47d1 100644
--- a/components/core/src/clp_s/CommandLineArguments.hpp
+++ b/components/core/src/clp_s/CommandLineArguments.hpp
@@ -27,8 +27,8 @@ class CommandLineArguments {
         Compress = 'c',
         Extract = 'x',
         Search = 's',
-        Json_To_IR = 'r',
-        IR_Compress = 'i'
+        JsonToIr = 'r',
+        IrCompress = 'i'
     };
 
     enum class OutputHandlerType : uint8_t {
@@ -161,9 +161,9 @@ class CommandLineArguments {
 
     void print_search_usage() const;
 
-    void print_json_to_IR_usage() const;
+    void print_json_to_ir_usage() const;
 
-    void print_IR_compression_usage() const;
+    void print_ir_compression_usage() const;
 
     // Variables
     std::string m_program_name;
diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp
index 32e078f6f..0185f1305 100644
--- a/components/core/src/clp_s/JsonParser.cpp
+++ b/components/core/src/clp_s/JsonParser.cpp
@@ -1,15 +1,28 @@
 #include "JsonParser.hpp"
 
-#include <iostream>
+#include <cstdint>
+#include <cstdlib>
+#include <optional>
 #include <stack>
+#include <unordered_map>
 
 #include <simdjson.h>
 #include <spdlog/spdlog.h>
 
-#include "archive_constants.hpp"
+#include "../clp/ffi/SchemaTree.hpp"
+#include "../clp/ffi/SchemaTreeNode.hpp"
+#include "../clp/ffi/utils.hpp"
+#include "../clp/ffi/Value.hpp"
+#include "../clp/ir/types.hpp"
+#include "../clp/streaming_compression/zstd/Decompressor.hpp"
+#include "DictionaryWriter.hpp"
 #include "JsonFileIterator.hpp"
+#include "ParsedMessage.hpp"
+
+using namespace simdjson;
 
 namespace clp_s {
+
 JsonParser::JsonParser(JsonParserOption const& option)
         : m_num_messages(0),
           m_target_encoded_size(option.target_encoded_size),
@@ -520,13 +533,13 @@ bool JsonParser::parse() {
     return true;
 }
 
-NodeType get_archive_node_type(
+auto JsonParser::get_archive_node_type(
         clp::ffi::SchemaTreeNode::Type ir_node_type,
         bool node_has_value,
         std::optional<clp::ffi::Value> const& node_value
-) {
+) -> NodeType {
     // figure out what type the node is in archive node type
-    NodeType archive_node_type;
+    NodeType archive_node_type = NodeType::Unknown;
     switch (ir_node_type) {
         case clp::ffi::SchemaTreeNode::Type::Int:
             archive_node_type = NodeType::Integer;
@@ -559,29 +572,33 @@ NodeType get_archive_node_type(
             }
             break;
         default:
-            archive_node_type = NodeType::Unknown;
             break;
     }
     return archive_node_type;
 }
 
-//
-int JsonParser::get_archive_node_id(
-        std::map<std::tuple<int32_t, NodeType>, int32_t>& ir_node_to_archive_node_map,
-        int ir_node_id,
+auto JsonParser::get_archive_node_id(
+        std::unordered_map<int32_t, std::vector<std::pair<NodeType, int32_t>>>&
+                ir_node_to_archive_node_unordered_map,
+        int32_t ir_node_id,
         NodeType archive_node_type,
         clp::ffi::SchemaTree const& ir_tree
-) {
-    auto key = std::make_tuple(ir_node_id, archive_node_type);
-    auto map_location = ir_node_to_archive_node_map.find(key);
-    if (ir_node_to_archive_node_map.end() != map_location) {
-        return map_location->second;
+) -> int {
+    auto unordered_map_location = ir_node_to_archive_node_unordered_map.find(ir_node_id);
+    if (ir_node_to_archive_node_unordered_map.end() != unordered_map_location) {
+        auto translation_vector = unordered_map_location->second;
+        for (int i = 0; i < translation_vector.size(); i++) {
+            if (translation_vector[i].first == archive_node_type) {
+                return translation_vector[i].second;
+            }
+        }
     }
-    auto& curr_node = ir_tree.get_node(ir_node_id);
+
+    auto const& curr_node = ir_tree.get_node(ir_node_id);
     int32_t parent_node_id{-1};
     if (ir_node_id != curr_node.get_parent_id()) {
         parent_node_id = get_archive_node_id(
-                ir_node_to_archive_node_map,
+                ir_node_to_archive_node_unordered_map,
                 curr_node.get_parent_id(),
                 NodeType::Object,
                 ir_tree
@@ -597,16 +614,23 @@ int JsonParser::get_archive_node_id(
     }
     int curr_node_archive_id
             = m_archive_writer->add_node(parent_node_id, archive_node_type, node_key);
-    ir_node_to_archive_node_map.emplace(std::move(key), curr_node_archive_id);
+    auto p = std::make_pair(archive_node_type, curr_node_archive_id);
+    if (ir_node_to_archive_node_unordered_map.end() != unordered_map_location) {
+        unordered_map_location->second.push_back(p);
+    } else {
+        std::vector<std::pair<NodeType, int32_t>> v;
+        v.push_back(p);
+        ir_node_to_archive_node_unordered_map.emplace(ir_node_id, v);
+    }
     return curr_node_archive_id;
 }
 
 void JsonParser::parse_kv_log_event(
         KeyValuePairLogEvent const& kv,
-        std::map<std::tuple<int32_t, NodeType>, int32_t>& ir_node_to_archive_node_map
+        std::unordered_map<int32_t, std::vector<std::pair<NodeType, int32_t>>>&
+                ir_node_to_archive_node_unordered_map
 ) {
     clp::ffi::SchemaTree const& tree = kv.get_schema_tree();
-
     for (auto const& pair : kv.get_node_id_value_pairs()) {
         clp::ffi::SchemaTreeNode const& tree_node = tree.get_node(pair.first);
         clp::ffi::SchemaTreeNode::Type ir_node_type = tree_node.get_type();
@@ -621,7 +645,7 @@ void JsonParser::parse_kv_log_event(
         int node_id;
         try {
             node_id = get_archive_node_id(
-                    ir_node_to_archive_node_map,
+                    ir_node_to_archive_node_unordered_map,
                     pair.first,
                     archive_node_type,
                     tree
@@ -705,11 +729,11 @@ void JsonParser::parse_kv_log_event(
     int32_t current_schema_id = m_archive_writer->add_schema(m_current_schema);
     m_current_parsed_message.set_id(current_schema_id);
     m_archive_writer->append_message(current_schema_id, m_current_schema, m_current_parsed_message);
-    return;
 }
 
-bool JsonParser::parse_from_IR() {
-    std::map<std::tuple<int32_t, NodeType>, int32_t> ir_node_to_archive_node_map;
+auto JsonParser::parse_from_ir() -> bool {
+    std::unordered_map<int32_t, std::vector<std::pair<NodeType, int32_t>>>
+            ir_node_to_archive_node_unordered_map;
 
     for (auto& file_path : m_file_paths) {
         int fsize = std::filesystem::file_size(file_path);
@@ -742,7 +766,7 @@ bool JsonParser::parse_from_IR() {
             m_current_schema.clear();
             auto const& kv_log_event = kv_log_event_result.value();
             try {
-                parse_kv_log_event(kv_log_event, ir_node_to_archive_node_map);
+                parse_kv_log_event(kv_log_event, ir_node_to_archive_node_unordered_map);
             } catch (std::string msg) {
                 SPDLOG_ERROR("ERROR: {}" + msg);
                 zd.close();
@@ -754,14 +778,14 @@ bool JsonParser::parse_from_IR() {
             }
             m_num_messages++;
             if (m_archive_writer->get_data_size() >= m_target_encoded_size) {
-                ir_node_to_archive_node_map.clear();
+                ir_node_to_archive_node_unordered_map.clear();
                 split_archive();
             }
 
             m_current_parsed_message.clear();
 
         } while (true);
-        ir_node_to_archive_node_map.clear();
+        ir_node_to_archive_node_unordered_map.clear();
         zd.close();
     }
     return true;
diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp
index b48d1bbcd..bd58869ed 100644
--- a/components/core/src/clp_s/JsonParser.hpp
+++ b/components/core/src/clp_s/JsonParser.hpp
@@ -1,45 +1,32 @@
 #ifndef CLP_S_JSONPARSER_HPP
 #define CLP_S_JSONPARSER_HPP
 
-#include <map>
+#include <cstdlib>
+#include <optional>
 #include <string>
+#include <unordered_map>
 #include <variant>
 #include <vector>
 
 #include <boost/uuid/random_generator.hpp>
-#include <simdjson.h>
 
 #include "../clp/BufferReader.hpp"
 #include "../clp/ffi/ir_stream/Deserializer.hpp"
 #include "../clp/ffi/KeyValuePairLogEvent.hpp"
 #include "../clp/ffi/SchemaTree.hpp"
 #include "../clp/ffi/SchemaTreeNode.hpp"
-#include "../clp/ffi/utils.hpp"
 #include "../clp/ffi/Value.hpp"
 #include "../clp/GlobalMySQLMetadataDB.hpp"
-#include "../clp/ir/types.hpp"
-#include "../clp/streaming_compression/zstd/Decompressor.hpp"
 #include "../clp/type_utils.hpp"
 #include "ArchiveWriter.hpp"
-#include "DictionaryWriter.hpp"
-#include "FileReader.hpp"
-#include "FileWriter.hpp"
 #include "ParsedMessage.hpp"
 #include "Schema.hpp"
-#include "SchemaMap.hpp"
-#include "SchemaTree.hpp"
-#include "SchemaWriter.hpp"
-#include "TimestampDictionaryWriter.hpp"
-#include "Utils.hpp"
-#include "ZstdCompressor.hpp"
 
 using clp::BufferReader;
 using clp::ffi::ir_stream::Deserializer;
 using clp::ffi::KeyValuePairLogEvent;
 using clp::size_checked_pointer_cast;
 
-using namespace simdjson;
-
 namespace clp_s {
 struct JsonParserOption {
     std::vector<std::string> file_paths;
@@ -88,7 +75,7 @@ class JsonParser {
      * Parses the Key Value IR Stream and stores the data in the archive.
      * @return whether the IR Stream was parsed successfully
      */
-    [[nodiscard]] bool parse_from_IR();
+    [[nodiscard]] auto parse_from_ir() -> bool;
 
     /**
      * Writes the metadata and archive data to disk.
@@ -106,29 +93,44 @@ class JsonParser {
     void parse_line(ondemand::value line, int32_t parent_node_id, std::string const& key);
 
     /**
-     * Parses a Key Value Log Event
-     * @param kv the key value log event
-     * @param cache cache of node id conversions between deserializer schema tree nodes and archive
-     * schema tree nodes
+     * Compresses the input files specified by the command line arguments into an archive.
+     * @param ir_node_type schema node type from the IR stream
+     * @param node_has_value Boolean that say whether or not the node has value.
+     * @param node_value The ir schema node value if the node has value
+     * @return The clp-s archive Node Type that shoudl be used for the archive node
      */
-    void parse_kv_log_event(
-            KeyValuePairLogEvent const& kv,
-            std::map<std::tuple<int, NodeType>, int>& cache
-    );
+    static auto get_archive_node_type(
+            clp::ffi::SchemaTreeNode::Type ir_node_type,
+            bool node_has_value,
+            std::optional<clp::ffi::Value> const& node_value
+    ) -> NodeType;
 
     /**
      * Get archive node id for ir node
-     * @param cache cache of node id conversions between deserializer schema tree nodes and archive
-     * schema tree nodes
+     * @param ir_node_to_archive_node_unordered_map cache of node id conversions between
+     * deserializer schema tree nodes and archive schema tree nodes
      * @param irNodeID
      * @param irType
      * @param irTree
      */
-    int get_archive_node_id(
-            std::map<std::tuple<int, NodeType>, int>& cache,
-            int irNodeID,
-            NodeType archiveNodeType,
-            clp::ffi::SchemaTree const& irTree
+    auto get_archive_node_id(
+            std::unordered_map<int32_t, std::vector<std::pair<NodeType, int32_t>>>&
+                    ir_node_to_archive_node_unordered_map,
+            int32_t ir_node_id,
+            NodeType archive_node_type,
+            clp::ffi::SchemaTree const& ir_tree
+    ) -> int;
+
+    /**
+     * Parses a Key Value Log Event
+     * @param kv the key value log event
+     * @param ir_node_to_archive_node_unordered_map cache of node id conversions between
+     * deserializer schema tree nodes and archive schema tree nodes
+     */
+    void parse_kv_log_event(
+            KeyValuePairLogEvent const& kv,
+            std::unordered_map<int32_t, std::vector<std::pair<NodeType, int32_t>>>&
+                    ir_node_to_archive_node_unordered_map
     );
 
     /**
diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp
index 49bd148bb..7bdada0a3 100644
--- a/components/core/src/clp_s/clp-s.cpp
+++ b/components/core/src/clp_s/clp-s.cpp
@@ -1,7 +1,7 @@
+#include <cstdlib>
 #include <exception>
 #include <filesystem>
 #include <fstream>
-#include <iostream>
 #include <memory>
 #include <sstream>
 #include <string>
@@ -9,6 +9,7 @@
 
 #include <json/single_include/nlohmann/json.hpp>
 #include <mongocxx/instance.hpp>
+#include <msgpack.hpp>
 #include <spdlog/sinks/stdout_sinks.h>
 #include <spdlog/spdlog.h>
 
@@ -46,6 +47,9 @@ using clp_s::cEpochTimeMin;
 using clp_s::CommandLineArguments;
 
 namespace {
+
+size_t max_ir_buffer_size = 1'000'000'000;
+
 /**
  * Compresses the input files specified by the command line arguments into an archive.
  * @param command_line_arguments
@@ -53,12 +57,42 @@ namespace {
  */
 bool compress(CommandLineArguments const& command_line_arguments);
 
+template <typename encoded_variable_t>
+auto flush_and_clear_serializer_buffer(
+        Serializer<encoded_variable_t>& serializer,
+        std::vector<int8_t>& byte_buf
+) -> void;
+
+template <typename encoded_variable_t>
+auto unpack_and_serialize_msgpack_bytes(
+        std::vector<uint8_t> const& msgpack_bytes,
+        Serializer<encoded_variable_t>& serializer
+) -> bool;
+
+/**
+ * Given user-specified options and a file path to a JSON file, calls the serializer on each JSON
+ * entry to serialize it into IR
+ * @param option
+ * @param path
+ * @return Whether serialization was successful
+ */
+template <typename T>
+auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path);
+
+/**
+ * Iterates over the input JSON files specified by the command line arguments to generate and IR
+ * file for each one.
+ * @param command_line_arguments
+ * @return Whether generation was successful
+ */
+auto generate_ir(CommandLineArguments const& command_line_arguments) -> bool;
+
 /**
  * Compresses the input IR files specified by the command line arguments into an archive.
  * @param command_line_arguments
  * @return Whether compression was successful
  */
-bool ir_compress(CommandLineArguments const& command_line_arguments);
+auto ir_compress(CommandLineArguments const& command_line_arguments) -> bool;
 
 /**
  * Decompresses the archive specified by the given JsonConstructorOption.
@@ -160,7 +194,7 @@ auto unpack_and_serialize_msgpack_bytes(
 }
 
 template <typename T>
-auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) {
+auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path) {
     auto result{Serializer<T>::create()};
     if (result.has_error()) {
         SPDLOG_ERROR("Failed to create Serializer");
@@ -187,14 +221,15 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) {
         return false;
     }
 
-    std::string line;
+    std::string line = "";
     size_t total_size = 0;
 
     if (in_file.is_open()) {
         while (getline(in_file, line)) {
             try {
                 auto j_obj = nlohmann::json::parse(line);
-                if (!unpack_and_serialize_msgpack_bytes(
+                if (false
+                    == unpack_and_serialize_msgpack_bytes(
                             nlohmann::json::to_msgpack(j_obj),
                             serializer
                     ))
@@ -203,7 +238,7 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) {
                     return false;
                 }
                 flush_and_clear_serializer_buffer(serializer, ir_buf);
-                if (ir_buf.size() >= 1'000'000'000) {
+                if (ir_buf.size() >= max_ir_buffer_size) {
                     total_size = total_size + ir_buf.size();
                     zc.write(reinterpret_cast<char*>(ir_buf.data()), ir_buf.size());
                     zc.flush();
@@ -229,7 +264,7 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) {
     return true;
 }
 
-bool generate_IR(CommandLineArguments const& command_line_arguments) {
+auto generate_ir(CommandLineArguments const& command_line_arguments) -> bool {
     auto irs_dir = std::filesystem::path(command_line_arguments.get_archives_dir());
 
     // Create output directory in case it doesn't exist
@@ -269,7 +304,7 @@ bool generate_IR(CommandLineArguments const& command_line_arguments) {
     return true;
 }
 
-bool ir_compress(CommandLineArguments const& command_line_arguments) {
+auto ir_compress(CommandLineArguments const& command_line_arguments) -> bool {
     auto archives_dir = std::filesystem::path(command_line_arguments.get_archives_dir());
 
     // Create output directory in case it doesn't exist
@@ -307,7 +342,7 @@ bool ir_compress(CommandLineArguments const& command_line_arguments) {
     }
 
     clp_s::JsonParser parser(option);
-    if (false == parser.parse_from_IR()) {
+    if (false == parser.parse_from_ir()) {
         SPDLOG_ERROR("Encountered error while parsing input");
         return false;
     }
@@ -462,12 +497,12 @@ int main(int argc, char const* argv[]) {
         if (false == compress(command_line_arguments)) {
             return 1;
         }
-    } else if (CommandLineArguments::Command::IR_Compress == command_line_arguments.get_command()) {
+    } else if (CommandLineArguments::Command::IrCompress == command_line_arguments.get_command()) {
         if (false == ir_compress(command_line_arguments)) {
             return 1;
         }
-    } else if (CommandLineArguments::Command::Json_To_IR == command_line_arguments.get_command()) {
-        if (false == generate_IR(command_line_arguments)) {
+    } else if (CommandLineArguments::Command::JsonToIr == command_line_arguments.get_command()) {
+        if (false == generate_ir(command_line_arguments)) {
             return 1;
         }
     } else if (CommandLineArguments::Command::Extract == command_line_arguments.get_command()) {

From 96c3ef9eb2baac483ffadaca9ad2dc161aa364ef Mon Sep 17 00:00:00 2001
From: Abigail Matthews <matthea@clarkson.edu>
Date: Mon, 7 Oct 2024 16:56:50 -0400
Subject: [PATCH 12/15] A bit of refactoring and corrections recommended by
 coderabbitai

---
 .../core/src/clp_s/CommandLineArguments.cpp   | 10 ++++--
 .../core/src/clp_s/CommandLineArguments.hpp   |  5 ++-
 components/core/src/clp_s/JsonParser.cpp      |  1 +
 components/core/src/clp_s/JsonParser.hpp      | 16 +++++----
 components/core/src/clp_s/clp-s.cpp           | 35 ++++++++++++++-----
 5 files changed, 48 insertions(+), 19 deletions(-)

diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp
index e4e8a837f..6b6547fe6 100644
--- a/components/core/src/clp_s/CommandLineArguments.cpp
+++ b/components/core/src/clp_s/CommandLineArguments.cpp
@@ -426,6 +426,11 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
                     po::value<size_t>(&m_max_document_size)->value_name("DOC_SIZE")->
                         default_value(m_max_document_size),
                     "Maximum allowed size (B) for a single document before ir generation fails."
+            )(
+                    "max-ir-buffer-size",
+                    po::value<size_t>(&m_max_ir_buffer_size)->value_name("BUFFER_SIZE")->
+                        default_value(m_max_ir_buffer_size),
+                    "Maximum allowed size (B) for a in memory IR buffer befroe being written to file."
             )(
                     "encoding-type",
                     po::value<int>(&m_encoding_type)->value_name("ENCODING_TYPE")->
@@ -1040,11 +1045,10 @@ void CommandLineArguments::print_search_usage() const {
 }
 
 void CommandLineArguments::print_json_to_ir_usage() const {
-    std::cerr << "Usage: " << m_program_name << " r [OPTIONS] IRS_DIR [FILE/DIR ...]" << std::endl;
+    std::cerr << "Usage: " << m_program_name << " r [OPTIONS] IRS_DIR [FILE/DIR ...]\n";
 }
 
 void CommandLineArguments::print_ir_compression_usage() const {
-    std::cerr << "Usage: " << m_program_name << " i [OPTIONS] ARCHIVES_DIR [FILE/DIR ...]"
-              << std::endl;
+    std::cerr << "Usage: " << m_program_name << " i [OPTIONS] ARCHIVES_DIR [FILE/DIR ...]\n";
 }
 }  // namespace clp_s
diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp
index 48cdb47d1..91e0eecb1 100644
--- a/components/core/src/clp_s/CommandLineArguments.hpp
+++ b/components/core/src/clp_s/CommandLineArguments.hpp
@@ -62,7 +62,9 @@ class CommandLineArguments {
 
     size_t get_max_document_size() const { return m_max_document_size; }
 
-    int get_encoding_type() const { return m_encoding_type; }
+    [[nodiscard]] auto get_max_ir_buffer_size() const -> size_t { return m_max_ir_buffer_size; }
+
+    [[nodiscard]] auto get_encoding_type() const -> int { return m_encoding_type; }
 
     [[nodiscard]] bool print_archive_stats() const { return m_print_archive_stats; }
 
@@ -182,6 +184,7 @@ class CommandLineArguments {
     bool m_ordered_decompression{false};
     size_t m_ordered_chunk_size{0};
     int m_encoding_type{8};
+    size_t m_max_ir_buffer_size{512ULL * 1024 * 1024};
     // Metadata db variables
     std::optional<clp::GlobalMetadataDBConfig> m_metadata_db_config;
 
diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp
index 0185f1305..3caadcef8 100644
--- a/components/core/src/clp_s/JsonParser.cpp
+++ b/components/core/src/clp_s/JsonParser.cpp
@@ -18,6 +18,7 @@
 #include "DictionaryWriter.hpp"
 #include "JsonFileIterator.hpp"
 #include "ParsedMessage.hpp"
+#include "SchemaTree.hpp"
 
 using namespace simdjson;
 
diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp
index bd58869ed..2629f3d0b 100644
--- a/components/core/src/clp_s/JsonParser.hpp
+++ b/components/core/src/clp_s/JsonParser.hpp
@@ -21,6 +21,7 @@
 #include "ArchiveWriter.hpp"
 #include "ParsedMessage.hpp"
 #include "Schema.hpp"
+#include "SchemaTree.hpp"
 
 using clp::BufferReader;
 using clp::ffi::ir_stream::Deserializer;
@@ -44,6 +45,7 @@ struct JsonToIRParserOption {
     std::vector<std::string> file_paths;
     std::string irs_dir;
     size_t max_document_size;
+    size_t max_ir_buffer_size;
     int compression_level;
     int encoding;
 };
@@ -93,11 +95,11 @@ class JsonParser {
     void parse_line(ondemand::value line, int32_t parent_node_id, std::string const& key);
 
     /**
-     * Compresses the input files specified by the command line arguments into an archive.
+     * Determines the archive node type based on the IR node type and value.
      * @param ir_node_type schema node type from the IR stream
-     * @param node_has_value Boolean that say whether or not the node has value.
-     * @param node_value The ir schema node value if the node has value
-     * @return The clp-s archive Node Type that shoudl be used for the archive node
+     * @param node_has_value Boolean that says whether or not the node has value.
+     * @param node_value The IR schema node value if the node has value
+     * @return The clp-s archive Node Type that should be used for the archive node
      */
     static auto get_archive_node_type(
             clp::ffi::SchemaTreeNode::Type ir_node_type,
@@ -109,9 +111,9 @@ class JsonParser {
      * Get archive node id for ir node
      * @param ir_node_to_archive_node_unordered_map cache of node id conversions between
      * deserializer schema tree nodes and archive schema tree nodes
-     * @param irNodeID
-     * @param irType
-     * @param irTree
+     * @param ir_node_id ID of the IR node
+     * @param archive_node_type Type of the archive node
+     * @param ir_treeThe IR schema tree
      */
     auto get_archive_node_id(
             std::unordered_map<int32_t, std::vector<std::pair<NodeType, int32_t>>>&
diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp
index 7bdada0a3..13fbf7885 100644
--- a/components/core/src/clp_s/clp-s.cpp
+++ b/components/core/src/clp_s/clp-s.cpp
@@ -48,8 +48,6 @@ using clp_s::CommandLineArguments;
 
 namespace {
 
-size_t max_ir_buffer_size = 1'000'000'000;
-
 /**
  * Compresses the input files specified by the command line arguments into an archive.
  * @param command_line_arguments
@@ -87,6 +85,17 @@ auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path)
  */
 auto generate_ir(CommandLineArguments const& command_line_arguments) -> bool;
 
+/**
+ * Fill in JsonParserOption instance based on command line user input
+ * @param command_line_arguments
+ * @param option
+ * @return Whether setup was successful
+ */
+auto setup_compression_options(
+        CommandLineArguments const& command_line_arguments,
+        clp_s::JsonParserOption& option
+) -> bool;
+
 /**
  * Compresses the input IR files specified by the command line arguments into an archive.
  * @param command_line_arguments
@@ -238,7 +247,7 @@ auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path)
                     return false;
                 }
                 flush_and_clear_serializer_buffer(serializer, ir_buf);
-                if (ir_buf.size() >= max_ir_buffer_size) {
+                if (ir_buf.size() >= option.max_ir_buffer_size) {
                     total_size = total_size + ir_buf.size();
                     zc.write(reinterpret_cast<char*>(ir_buf.data()), ir_buf.size());
                     zc.flush();
@@ -278,11 +287,13 @@ auto generate_ir(CommandLineArguments const& command_line_arguments) -> bool {
     option.file_paths = command_line_arguments.get_file_paths();
     option.irs_dir = irs_dir.string();
     option.max_document_size = command_line_arguments.get_max_document_size();
+    option.max_ir_buffer_size = command_line_arguments.get_max_ir_buffer_size();
     option.compression_level = command_line_arguments.get_compression_level();
     option.encoding = command_line_arguments.get_encoding_type();
 
     if (false == clp_s::FileUtils::validate_path(option.file_paths)) {
-        exit(1);
+        SPDLOG_ERROR("Invalid file path(s) provided");
+        return false;
     }
 
     std::vector<std::string> all_file_paths;
@@ -304,9 +315,11 @@ auto generate_ir(CommandLineArguments const& command_line_arguments) -> bool {
     return true;
 }
 
-auto ir_compress(CommandLineArguments const& command_line_arguments) -> bool {
+auto setup_compression_options(
+        CommandLineArguments const& command_line_arguments,
+        clp_s::JsonParserOption& option
+) -> bool {
     auto archives_dir = std::filesystem::path(command_line_arguments.get_archives_dir());
-
     // Create output directory in case it doesn't exist
     try {
         std::filesystem::create_directory(archives_dir.string());
@@ -318,8 +331,6 @@ auto ir_compress(CommandLineArguments const& command_line_arguments) -> bool {
         );
         return false;
     }
-
-    clp_s::JsonParserOption option{};
     option.file_paths = command_line_arguments.get_file_paths();
     option.archives_dir = archives_dir.string();
     option.target_encoded_size = command_line_arguments.get_target_encoded_size();
@@ -340,6 +351,14 @@ auto ir_compress(CommandLineArguments const& command_line_arguments) -> bool {
                 db_config.get_metadata_table_prefix()
         );
     }
+    return true;
+}
+
+auto ir_compress(CommandLineArguments const& command_line_arguments) -> bool {
+    clp_s::JsonParserOption option{};
+    if (false == setup_compression_options(command_line_arguments, option)) {
+        return false;
+    }
 
     clp_s::JsonParser parser(option);
     if (false == parser.parse_from_ir()) {

From 923b6427760af5ca84dd2b9cb356adbbd656035d Mon Sep 17 00:00:00 2001
From: Abigail Matthews <matthea@clarkson.edu>
Date: Mon, 7 Oct 2024 17:25:24 -0400
Subject: [PATCH 13/15] More graceful error handling and some typo fixes

---
 components/core/src/clp_s/CommandLineArguments.cpp |  2 +-
 components/core/src/clp_s/JsonParser.cpp           |  3 ++-
 components/core/src/clp_s/JsonParser.hpp           |  2 +-
 components/core/src/clp_s/clp-s.cpp                | 11 +++++++++++
 4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp
index 6b6547fe6..0b31af3ab 100644
--- a/components/core/src/clp_s/CommandLineArguments.cpp
+++ b/components/core/src/clp_s/CommandLineArguments.cpp
@@ -430,7 +430,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
                     "max-ir-buffer-size",
                     po::value<size_t>(&m_max_ir_buffer_size)->value_name("BUFFER_SIZE")->
                         default_value(m_max_ir_buffer_size),
-                    "Maximum allowed size (B) for a in memory IR buffer befroe being written to file."
+                    "Maximum allowed size (B) for an in memory IR buffer befroe being written to file."
             )(
                     "encoding-type",
                     po::value<int>(&m_encoding_type)->value_name("ENCODING_TYPE")->
diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp
index 3caadcef8..eef54019d 100644
--- a/components/core/src/clp_s/JsonParser.cpp
+++ b/components/core/src/clp_s/JsonParser.cpp
@@ -611,7 +611,7 @@ auto JsonParser::get_archive_node_id(
     if (validated_escaped_key.has_value()) {
         node_key = validated_escaped_key.value();
     } else {
-        throw "Key is not utf8 compliant";
+        throw "Key is not UTF-8 compliant";
     }
     int curr_node_archive_id
             = m_archive_writer->add_node(parent_node_id, archive_node_type, node_key);
@@ -747,6 +747,7 @@ auto JsonParser::parse_from_ir() -> bool {
 
         auto deserializer_result = Deserializer::create(zd);
         if (deserializer_result.has_error()) {
+            zd.close();
             m_archive_writer->close();
             return false;
         }
diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp
index 2629f3d0b..1ce4ac83e 100644
--- a/components/core/src/clp_s/JsonParser.hpp
+++ b/components/core/src/clp_s/JsonParser.hpp
@@ -113,7 +113,7 @@ class JsonParser {
      * deserializer schema tree nodes and archive schema tree nodes
      * @param ir_node_id ID of the IR node
      * @param archive_node_type Type of the archive node
-     * @param ir_treeThe IR schema tree
+     * @param ir_tree The IR schema tree
      */
     auto get_archive_node_id(
             std::unordered_map<int32_t, std::vector<std::pair<NodeType, int32_t>>>&
diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp
index 13fbf7885..8043890a5 100644
--- a/components/core/src/clp_s/clp-s.cpp
+++ b/components/core/src/clp_s/clp-s.cpp
@@ -227,6 +227,8 @@ auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path)
         zc.open(out_file, option.compression_level);
     } catch (clp_s::ZstdCompressor::OperationFailed& error) {
         SPDLOG_ERROR("Failed to open ZSTDcompressor - {}", error.what());
+        in_file.close();
+        out_file.close();
         return false;
     }
 
@@ -244,6 +246,9 @@ auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path)
                     ))
                 {
                     SPDLOG_ERROR("Failed to serialize msgpack bytes for line: {}", line);
+                    in_file.close();
+                    out_file.close();
+                    zc.close();
                     return false;
                 }
                 flush_and_clear_serializer_buffer(serializer, ir_buf);
@@ -255,9 +260,15 @@ auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path)
                 }
             } catch (nlohmann::json::parse_error const& e) {
                 SPDLOG_ERROR("JSON parsing error: {}", e.what());
+                in_file.close();
+                out_file.close();
+                zc.close();
                 return false;
             } catch (std::exception const& e) {
                 SPDLOG_ERROR("Error during serialization: {}", e.what());
+                in_file.close();
+                out_file.close();
+                zc.close();
                 return false;
             }
         }

From 36024d8c7c2a38ca48261e9d5e6f617d8eeea631 Mon Sep 17 00:00:00 2001
From: Abigail Matthews <matthea@clarkson.edu>
Date: Mon, 7 Oct 2024 17:43:22 -0400
Subject: [PATCH 14/15] Small acronym capitalization fix

---
 components/core/src/clp_s/JsonParser.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp
index 1ce4ac83e..0e08a39cb 100644
--- a/components/core/src/clp_s/JsonParser.hpp
+++ b/components/core/src/clp_s/JsonParser.hpp
@@ -41,7 +41,7 @@ struct JsonParserOption {
     std::shared_ptr<clp::GlobalMySQLMetadataDB> metadata_db;
 };
 
-struct JsonToIRParserOption {
+struct JsonToIrParserOption {
     std::vector<std::string> file_paths;
     std::string irs_dir;
     size_t max_document_size;

From f76863e12fc889b5d3100b1b9984ad2a81b0727e Mon Sep 17 00:00:00 2001
From: Abigail Matthews <matthea@clarkson.edu>
Date: Mon, 7 Oct 2024 18:22:57 -0400
Subject: [PATCH 15/15] Fix broken struct name change propagation

---
 components/core/src/clp_s/JsonParser.hpp | 2 +-
 components/core/src/clp_s/clp-s.cpp      | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp
index 0e08a39cb..956373ae2 100644
--- a/components/core/src/clp_s/JsonParser.hpp
+++ b/components/core/src/clp_s/JsonParser.hpp
@@ -62,7 +62,7 @@ class JsonParser {
     // Constructor
     explicit JsonParser(JsonParserOption const& option);
 
-    JsonParser(JsonToIRParserOption const& option);
+    JsonParser(JsonToIrParserOption const& option);
 
     // Destructor
     ~JsonParser() = default;
diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp
index 8043890a5..554422adf 100644
--- a/components/core/src/clp_s/clp-s.cpp
+++ b/components/core/src/clp_s/clp-s.cpp
@@ -75,7 +75,7 @@ auto unpack_and_serialize_msgpack_bytes(
  * @return Whether serialization was successful
  */
 template <typename T>
-auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path);
+auto run_serializer(clp_s::JsonToIrParserOption const& option, std::string path);
 
 /**
  * Iterates over the input JSON files specified by the command line arguments to generate and IR
@@ -203,7 +203,7 @@ auto unpack_and_serialize_msgpack_bytes(
 }
 
 template <typename T>
-auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path) {
+auto run_serializer(clp_s::JsonToIrParserOption const& option, std::string path) {
     auto result{Serializer<T>::create()};
     if (result.has_error()) {
         SPDLOG_ERROR("Failed to create Serializer");
@@ -294,7 +294,7 @@ auto generate_ir(CommandLineArguments const& command_line_arguments) -> bool {
         SPDLOG_ERROR("Failed to create archives directory {} - {}", irs_dir.string(), e.what());
         return false;
     }
-    clp_s::JsonToIRParserOption option{};
+    clp_s::JsonToIrParserOption option{};
     option.file_paths = command_line_arguments.get_file_paths();
     option.irs_dir = irs_dir.string();
     option.max_document_size = command_line_arguments.get_max_document_size();