From 9506088c0bd8db92ab69a31dea499c2bdc786290 Mon Sep 17 00:00:00 2001 From: Abigail Matthews Date: Mon, 16 Sep 2024 11:52:14 -0400 Subject: [PATCH 01/15] ir->archive->json working --- components/core/CMakeLists.txt | 6 + components/core/src/clp_s/CMakeLists.txt | 28 ++ .../core/src/clp_s/CommandLineArguments.cpp | 272 +++++++++++++- .../core/src/clp_s/CommandLineArguments.hpp | 8 +- components/core/src/clp_s/JsonParser.cpp | 332 ++++++++++++++++++ components/core/src/clp_s/JsonParser.hpp | 51 +++ components/core/src/clp_s/clp-s.cpp | 68 +++- 7 files changed, 761 insertions(+), 4 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index c4f84570c..2be40ac75 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -266,6 +266,12 @@ set(SOURCE_FILES_clp_s_unitTest src/clp_s/TimestampPattern.hpp src/clp_s/Utils.cpp src/clp_s/Utils.hpp + src/clp_s/ZstdCompressor.hpp + src/clp_s/ZstdCompressor.cpp + src/clp_s/ZstdDecompressor.hpp + src/clp_s/ZstdDecompressor.cpp + src/clp_s/FileWriter.cpp + src/clp_s/FileReader.cpp ) set(SOURCE_FILES_unitTest diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt index c8cf08b22..456f53c20 100644 --- a/components/core/src/clp_s/CMakeLists.txt +++ b/components/core/src/clp_s/CMakeLists.txt @@ -28,6 +28,34 @@ set( ../clp/TraceableException.hpp ../clp/WriterInterface.cpp ../clp/WriterInterface.hpp + ../clp/ffi/ir_stream/Deserializer.hpp + ../clp/ffi/ir_stream/Deserializer.cpp + ../clp/BufferReader.hpp + ../clp/BufferReader.cpp + ../clp/type_utils.hpp + ../clp/ffi/Value.hpp + ../clp/ErrorCode.hpp + ../clp/ir/EncodedTextAst.hpp + ../clp/ir/EncodedTextAst.cpp + ../clp/ir/types.hpp + ../clp/ReaderInterface.hpp + ../clp/ReaderInterface.cpp + ../clp/time_types.hpp + ../clp/type_utils.hpp + ../clp/ffi/KeyValuePairLogEvent.hpp + ../clp/ffi/KeyValuePairLogEvent.cpp + ../clp/ffi/SchemaTree.hpp + ../clp/ffi/SchemaTree.cpp + 
../clp/ffi/SchemaTreeNode.hpp + ../clp/ffi/Value.hpp + ../clp/ffi/ir_stream/decoding_methods.hpp + ../clp/ffi/ir_stream/protocol_constants.hpp + ../clp/ffi/ir_stream/utils.hpp + ../clp/ffi/ir_stream/decoding_methods.cpp + ../clp/ffi/utils.hpp + ../clp/ffi/utils.cpp + ../clp/utf8_utils.hpp + ../clp/utf8_utils.cpp ) set( diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index 4cfe017ac..06c319057 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -106,11 +106,15 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { std::cerr << " c - compress" << std::endl; std::cerr << " x - decompress" << std::endl; std::cerr << " s - search" << std::endl; + std::cerr << " r - JSON to IR Format" << std::endl; + std::cerr << " i - compress IR format" << std::endl; std::cerr << std::endl; std::cerr << "Try " << " c --help OR" << " x --help OR" - << " s --help for command-specific details." << std::endl; + << " s --help OR" + << " r --help OR" + << " i --help for command-specific details." 
<< std::endl; po::options_description visible_options; visible_options.add(general_options); @@ -125,6 +129,8 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { case (char)Command::Compress: case (char)Command::Extract: case (char)Command::Search: + case (char)Command::Json_To_IR: + case (char)Command::IR_Compress: m_command = (Command)command_input; break; default: @@ -264,7 +270,259 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { m_metadata_db_config = std::move(metadata_db_config); } - } else if ((char)Command::Extract == command_input) { + } else if (Command::IR_Compress == m_command) { + po::options_description compression_positional_options; + // clang-format off + compression_positional_options.add_options()( + "archives-dir", + po::value(&m_archives_dir)->value_name("DIR"), + "output directory" + )( + "input-paths", + po::value>(&m_file_paths)->value_name("PATHS"), + "input paths" + ); + // clang-format on + + po::options_description compression_options("Compression options"); + std::string metadata_db_config_file_path; + std::string input_path_list_file_path; + // clang-format off + compression_options.add_options()( + "compression-level", + po::value(&m_compression_level)->value_name("LEVEL")-> + default_value(m_compression_level), + "1 (fast/low compression) to 9 (slow/high compression)." + )( + "target-encoded-size", + po::value(&m_target_encoded_size)->value_name("TARGET_ENCODED_SIZE")-> + default_value(m_target_encoded_size), + "Target size (B) for the dictionaries and encoded messages before a new " + "archive is created." + )( + "max-document-size", + po::value(&m_max_document_size)->value_name("DOC_SIZE")-> + default_value(m_max_document_size), + "Maximum allowed size (B) for a single document before compression fails." + )( + "timestamp-key", + po::value(&m_timestamp_key)->value_name("TIMESTAMP_COLUMN_KEY")-> + default_value(m_timestamp_key), + "Path (e.g. 
x.y) for the field containing the log event's timestamp." + )( + "db-config-file", + po::value(&metadata_db_config_file_path)->value_name("FILE")-> + default_value(metadata_db_config_file_path), + "Global metadata DB YAML config" + )( + "files-from,f", + po::value(&input_path_list_file_path) + ->value_name("FILE") + ->default_value(input_path_list_file_path), + "Compress files specified in FILE" + )( + "print-archive-stats", + po::bool_switch(&m_print_archive_stats), + "Print statistics (json) about the archive after it's compressed." + )/*( + "structurize-arrays", + po::bool_switch(&m_structurize_arrays), + "Structurize arrays instead of compressing them as clp strings." + ) */; + // clang-format on + + po::positional_options_description positional_options; + positional_options.add("archives-dir", 1); + positional_options.add("input-paths", -1); + + po::options_description all_compression_options; + all_compression_options.add(compression_options); + all_compression_options.add(compression_positional_options); + + std::vector unrecognized_options + = po::collect_unrecognized(parsed.options, po::include_positional); + unrecognized_options.erase(unrecognized_options.begin()); + po::store( + po::command_line_parser(unrecognized_options) + .options(all_compression_options) + .positional(positional_options) + .run(), + parsed_command_line_options + ); + po::notify(parsed_command_line_options); + + if (parsed_command_line_options.count("help")) { + print_IR_compression_usage(); + + /* std::cerr << "Examples:" << std::endl; + std::cerr << " # Compress file1.json and dir1 into archives-dir" << std::endl; + std::cerr << " " << m_program_name << " c archives-dir file1.json dir1" + << std::endl; */ + + po::options_description visible_options; + visible_options.add(general_options); + visible_options.add(compression_options); + std::cerr << visible_options << '\n'; + return ParsingResult::InfoCommand; + } + + if (m_archives_dir.empty()) { + throw std::invalid_argument("No 
archives directory specified."); + } + + if (false == input_path_list_file_path.empty()) { + if (false == read_paths_from_file(input_path_list_file_path, m_file_paths)) { + SPDLOG_ERROR("Failed to read paths from {}", input_path_list_file_path); + return ParsingResult::Failure; + } + } + + if (m_file_paths.empty()) { + throw std::invalid_argument("No input paths specified."); + } + + // Parse and validate global metadata DB config + if (false == metadata_db_config_file_path.empty()) { + clp::GlobalMetadataDBConfig metadata_db_config; + try { + metadata_db_config.parse_config_file(metadata_db_config_file_path); + } catch (std::exception& e) { + SPDLOG_ERROR("Failed to validate metadata database config - {}.", e.what()); + return ParsingResult::Failure; + } + + if (clp::GlobalMetadataDBConfig::MetadataDBType::MySQL + != metadata_db_config.get_metadata_db_type()) + { + SPDLOG_ERROR( + "Invalid metadata database type for {}; only supported type is MySQL.", + m_program_name + ); + return ParsingResult::Failure; + } + + m_metadata_db_config = std::move(metadata_db_config); + } + }else if ((char)Command::Json_To_IR == command_input) { + po::options_description compression_positional_options; + // clang-format off + compression_positional_options.add_options()( + "ir-dir", + po::value(&m_archives_dir)->value_name("DIR"), + "output directory" + )( + "input-paths", + po::value>(&m_file_paths)->value_name("PATHS"), + "input paths" + ); + // clang-format on + + po::options_description compression_options("Compression options"); + std::string metadata_db_config_file_path; + std::string input_path_list_file_path; + // clang-format off + compression_options.add_options()( + "compression-level", + po::value(&m_compression_level)->value_name("LEVEL")-> + default_value(m_compression_level), + "1 (fast/low compression) to 9 (slow/high compression)." 
+ )( + "max-document-size", + po::value(&m_max_document_size)->value_name("DOC_SIZE")-> + default_value(m_max_document_size), + "Maximum allowed size (B) for a single document before ir generation fails." + )( + "timestamp-key", + po::value(&m_timestamp_key)->value_name("TIMESTAMP_COLUMN_KEY")-> + default_value(m_timestamp_key), + "Path (e.g. x.y) for the field containing the log event's timestamp." + )( + "db-config-file", + po::value(&metadata_db_config_file_path)->value_name("FILE")-> + default_value(metadata_db_config_file_path), + "Global metadata DB YAML config" + )( + "files-from,f", + po::value(&input_path_list_file_path) + ->value_name("FILE") + ->default_value(input_path_list_file_path), + "Compress files specified in FILE" + ); + // clang-format on + + po::positional_options_description positional_options; + positional_options.add("ir-dir", 1); + positional_options.add("input-paths", -1); + + po::options_description all_compression_options; + all_compression_options.add(compression_options); + all_compression_options.add(compression_positional_options); + + std::vector unrecognized_options + = po::collect_unrecognized(parsed.options, po::include_positional); + unrecognized_options.erase(unrecognized_options.begin()); + po::store( + po::command_line_parser(unrecognized_options) + .options(all_compression_options) + .positional(positional_options) + .run(), + parsed_command_line_options + ); + po::notify(parsed_command_line_options); + + if (parsed_command_line_options.count("help")) { + print_json_to_IR_usage(); + + /* std::cerr << "Examples:" << std::endl; + std::cerr << " # Compress file1.json and dir1 into archives-dir" << std::endl; + std::cerr << " " << m_program_name << " c archives-dir file1.json dir1" + << std::endl; */ + + po::options_description visible_options; + visible_options.add(general_options); + visible_options.add(compression_options); + std::cerr << visible_options << '\n'; + return ParsingResult::InfoCommand; + } + + if 
(m_archives_dir.empty()) { + throw std::invalid_argument("No IRs directory specified."); + } + + if (false == input_path_list_file_path.empty()) { + if (false == read_paths_from_file(input_path_list_file_path, m_file_paths)) { + SPDLOG_ERROR("Failed to read paths from {}", input_path_list_file_path); + return ParsingResult::Failure; + } + } + + if (m_file_paths.empty()) { + throw std::invalid_argument("No input paths specified."); + } + + // Parse and validate global metadata DB config + if (false == metadata_db_config_file_path.empty()) { + clp::GlobalMetadataDBConfig metadata_db_config; + try { + metadata_db_config.parse_config_file(metadata_db_config_file_path); + } catch (std::exception& e) { + SPDLOG_ERROR("Failed to validate metadata database config - {}.", e.what()); + return ParsingResult::Failure; + } + + if (clp::GlobalMetadataDBConfig::MetadataDBType::MySQL + != metadata_db_config.get_metadata_db_type()) + { + SPDLOG_ERROR( + "Invalid metadata database type for {}; only supported type is MySQL.", + m_program_name + ); + return ParsingResult::Failure; + } + + m_metadata_db_config = std::move(metadata_db_config); + } + }else if ((char)Command::Extract == command_input) { po::options_description extraction_options; // clang-format off extraction_options.add_options()( @@ -786,4 +1044,14 @@ void CommandLineArguments::print_search_usage() const { " [OUTPUT_HANDLER [OUTPUT_HANDLER_OPTIONS]]" << std::endl; } + +void CommandLineArguments::print_json_to_IR_usage() const { + std::cerr << "Usage: " << m_program_name << " r [OPTIONS] ARCHIVES_DIR [FILE/DIR ...]" + << std::endl; +} + +void CommandLineArguments::print_IR_compression_usage() const { + std::cerr << "Usage: " << m_program_name << " i [OPTIONS] ARCHIVES_DIR [FILE/DIR ...]" + << std::endl; +} } // namespace clp_s diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp index 0f3d8c556..60c480987 100644 --- 
a/components/core/src/clp_s/CommandLineArguments.hpp +++ b/components/core/src/clp_s/CommandLineArguments.hpp @@ -26,7 +26,9 @@ class CommandLineArguments { enum class Command : char { Compress = 'c', Extract = 'x', - Search = 's' + Search = 's', + Json_To_IR = 'r', + IR_Compress = 'i' }; enum class OutputHandlerType : uint8_t { @@ -157,6 +159,10 @@ class CommandLineArguments { void print_search_usage() const; + void print_json_to_IR_usage() const; + + void print_IR_compression_usage() const; + // Variables std::string m_program_name; Command m_command; diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index a68062958..a56387bfd 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -1,6 +1,7 @@ #include "JsonParser.hpp" #include +#include #include #include @@ -37,6 +38,26 @@ JsonParser::JsonParser(JsonParserOption const& option) m_archive_writer->open(m_archive_options); } +JsonParser::JsonParser(JsonToIRParserOption const& option) + : m_num_messages(0), + m_max_document_size(option.max_document_size) { + if (false == FileUtils::validate_path(option.file_paths)) { + exit(1); + } + + for (auto& file_path : option.file_paths) { + FileUtils::find_all_files(file_path, m_file_paths); + } + + m_archive_options.archives_dir = option.irs_dir; + m_archive_options.compression_level = option.compression_level; + //m_archive_options.print_archive_stats = option.print_archive_stats; + m_archive_options.id = m_generator(); + + m_archive_writer = std::make_unique(option.metadata_db); + m_archive_writer->open(m_archive_options); +} + void JsonParser::parse_obj_in_array(ondemand::object line, int32_t parent_node_id) { ondemand::object_iterator it = line.begin(); if (it == line.end()) { @@ -520,6 +541,317 @@ bool JsonParser::parse() { return true; } +NodeType get_archive_node_type(clp::ffi::SchemaTreeNode const& node, std::pair> p){ + //std::cerr << "In get_archive_node_type\n"; + auto 
const node_type = node.get_type(); + //std::cerr << "got ir type\n"; + //figure out what type the node is in archive node type + NodeType archiveNodeType; + switch(node_type){ + case clp::ffi::SchemaTreeNode::Type::Int : + archiveNodeType = NodeType::Integer; + break; + case clp::ffi::SchemaTreeNode::Type::Float : + archiveNodeType = NodeType::Float; + break; + case clp::ffi::SchemaTreeNode::Type::Bool : + archiveNodeType = NodeType::Boolean; + break; + case clp::ffi::SchemaTreeNode::Type::UnstructuredArray : + archiveNodeType = NodeType::UnstructuredArray; + break; + case clp::ffi::SchemaTreeNode::Type::Str : + //std::cerr << "In str\n"; + if(p.second.value().is()){ + //maybe special case for date string + archiveNodeType = NodeType::VarString; + }else{ + archiveNodeType = NodeType::ClpString; + } + break; + case clp::ffi::SchemaTreeNode::Type::Obj : + //std::cerr << "In obj\n"; + if(p.second.has_value()){ + if(p.second.value().is_null()){ + //std::cout << "Found Null\n"; + archiveNodeType = NodeType::NullValue; + }else{ + archiveNodeType = NodeType::Object; + } + }else{ + archiveNodeType = NodeType::Object; + } + break; + default : + archiveNodeType = NodeType::Unknown; + break; + //Do I need to do anything for structured arrays + } + //std::cerr << "After Switch\n"; + return archiveNodeType; +} + +// +int JsonParser::get_archive_node_id(std::map< std::tuple, int>& cache, int irNodeID, NodeType archiveNodeType, clp::ffi::SchemaTree const& irTree){ + //std::cerr << "In get archive node id\n"; + std::tuple key (irNodeID, archiveNodeType); + if(cache.find(key) != cache.end()){ + //std::cerr << "Found value\n"; + return cache[key]; + } + auto& currNode = irTree.get_node(irNodeID); + //std::cerr << "Got node\n"; + int parent_node_id; + if(currNode.get_parent_id() == 0){ + //std::cout << "Hit the root\n"; + parent_node_id = 0; + }else{ + //std::cerr << "Look for parent id\n"; + parent_node_id = get_archive_node_id(cache, currNode.get_parent_id(), NodeType::Object, 
irTree); + //std::cerr << "Got parent id\n"; + } + std::string nodeKey = clp::ffi::validate_and_escape_utf8_string(currNode.get_key_name()).value(); + //std::string nodeKey = static_cast(validated_key); + int curr_node_archive_id = m_archive_writer->add_node(parent_node_id, archiveNodeType, nodeKey); + //std::cerr << "Added node to archive\n"; + cache[key] = curr_node_archive_id; + //std::cerr << "Added to cache\n"; + return curr_node_archive_id; +} + +void print_kv_log_event(KeyValuePairLogEvent const& kv){ + auto const num_kv_pairs = kv.get_node_id_value_pairs().size(); + std::cout << "number of kv pairs: " << num_kv_pairs << std::endl; + auto const& tree = kv.get_schema_tree(); + for (auto const &pair: kv.get_node_id_value_pairs()){ + auto const& tree_node = tree.get_node(pair.first); + auto const node_type = tree_node.get_type(); + switch(node_type){ + case clp::ffi::SchemaTreeNode::Type::Int : std::cout << "Int" << std::endl; break; + case clp::ffi::SchemaTreeNode::Type::Float : std::cout << "Float" << std::endl; break; + case clp::ffi::SchemaTreeNode::Type::Bool : std::cout << "Bool" << std::endl; break; + case clp::ffi::SchemaTreeNode::Type::Str : std::cout << "Str" << std::endl; break; + case clp::ffi::SchemaTreeNode::Type::UnstructuredArray : std::cout << "UArray" << std::endl; break; + case clp::ffi::SchemaTreeNode::Type::Obj : std::cout << "Obj" << std::endl; break; + default : std::cout << "???" << std::endl; break; + } + + if(!pair.second.has_value()){ + std::cout << "{??:\t" << pair.first << ": Node doesn't have Value ... 
EMPTY OBJ}\n"; + continue; + } + if(pair.second.value().is()){ + std::cout << "{INT:\t" << pair.first << ": " << pair.second.value().get_immutable_view() << "}\n"; + }else if(pair.second.value().is()){ + std::cout << "{FLOAT:\t" << pair.first << ": " << pair.second.value().get_immutable_view() << "}\n"; + }else if(pair.second.value().is()){ + std::cout << "{BOOL:\t" << pair.first << ": " << pair.second.value().get_immutable_view() << "}\n"; + }else if(pair.second.value().is()){ + std::cout << "{STRING:\t" << pair.first << ": " << pair.second.value().get_immutable_view() << "}\n"; + }else if(pair.second.value().is()){ + std::cout << "{EIGHTByte:\t" << pair.first << ": \n"; + auto decoded = pair.second.value().get_immutable_view().decode_and_unparse(); + if(std::nullopt != decoded){ + std:: cout << "\t Decoded & Unparsed: "<< decoded.value()<< std::endl; + }else{ + std::cout << "\tNULL\n"; + } + std::cout << "}\n"; + }else if(pair.second.value().is()){ + std::cout << "{FOURByte:\t" << pair.first << ": \n"; + auto decoded = pair.second.value().get_immutable_view().decode_and_unparse(); + if(std::nullopt != decoded){ + std:: cout << "\tDecoded & Unparsed: "<< decoded.value() << std::endl; + }else{ + std::cout << "\tNULL\n"; + } + std::cout << "}\n"; + }else{ + std::cout << "Unknown Type:\t" << pair.first << "\n"; + } + + } + std::cout << "after for loop\n\n\n"; +} + +void JsonParser::parse_kv_log_event(KeyValuePairLogEvent const& kv, std::map, int>& cache){ + auto const num_kv_pairs = kv.get_node_id_value_pairs().size(); + clp::ffi::SchemaTree const& tree = kv.get_schema_tree(); + //std::cerr << "In parse\n"; + for (auto const& pair: kv.get_node_id_value_pairs()){ + //std::cerr << "In for loop\n"; + clp::ffi::SchemaTreeNode const& tree_node = tree.get_node(pair.first); + //std::cerr << "After get node\n"; + NodeType archiveNodeType = get_archive_node_type(tree_node, pair); + //std::cerr << "After get archive node type\n"; + int node_id = get_archive_node_id(cache, 
pair.first, archiveNodeType, tree); + //std::cerr << "After get_archive_node_id\n"; + //std::cerr << node_id << std::endl; + switch(archiveNodeType){ + case NodeType::Integer :{ + int64_t i64_value = pair.second.value().get_immutable_view(); + m_current_parsed_message.add_value(node_id, i64_value); + }break; + case NodeType::Float :{ + double d_value = pair.second.value().get_immutable_view(); + m_current_parsed_message.add_value(node_id, d_value); + }break; + case NodeType::Boolean :{ + bool b_value = pair.second.value().get_immutable_view(); + m_current_parsed_message.add_value(node_id, b_value); + }break; + case NodeType::VarString :{ + std::string str = clp::ffi::validate_and_escape_utf8_string(pair.second.value().get_immutable_view()).value(); + m_current_parsed_message.add_value(node_id, str); + }break; + case NodeType::ClpString :{ + //auto const node_type = tree_node.get_type(); + std::string encoded_str; + ///Do I need to reparse these? Do I need to convert 4bytes to 8bytes .... how? 
+ if(pair.second.value().is()){ + std::string decodedValue = pair.second.value().get_immutable_view().decode_and_unparse().value(); + encoded_str = clp::ffi::validate_and_escape_utf8_string(decodedValue.c_str()).value(); + }else{ + std::string decodedValue = pair.second.value().get_immutable_view().decode_and_unparse().value(); + encoded_str = clp::ffi::validate_and_escape_utf8_string(decodedValue.c_str()).value(); + } + m_current_parsed_message.add_value(node_id, encoded_str); + }break; + case NodeType::UnstructuredArray :{ + //auto const encoded_type = tree_node.get_type(); + std::string array_str; + if(pair.second.value().is()){ + array_str = pair.second.value().get_immutable_view().decode_and_unparse().value(); + }else{ + array_str = pair.second.value().get_immutable_view().decode_and_unparse().value(); + } + m_current_parsed_message.add_value(node_id, array_str); + break; + } + default : + //Don't need to add value for obj or null + break; + } + m_current_schema.insert_ordered(node_id); + } + + int32_t current_schema_id = m_archive_writer->add_schema(m_current_schema); + m_current_parsed_message.set_id(current_schema_id); + m_archive_writer->append_message(current_schema_id, m_current_schema, m_current_parsed_message); + return; +} + +bool JsonParser::parse_from_IR() { + std::map, int> id_conversion_cache; + m_archive_writer->add_node(-1, NodeType::Unknown, "root"); + + for (auto& file_path : m_file_paths) { + std::vector ir_buf; + //Make function from reading in this file + char temp_ir_buf[1000]; + //char* new_ir_buf = (char *) malloc(ir_buf.size()); + FileReader infile; + infile.open(file_path); + if(false == infile.is_open()){ + m_archive_writer->close(); + return false; + } + int fsize = std::filesystem::file_size(file_path); + if(0 == fsize){ + m_archive_writer->close(); + return false; + } + ZstdDecompressor zd; + zd.open(infile, fsize); + size_t num_bytes_read = 0; + do{ + num_bytes_read = 0; + zd.try_read(temp_ir_buf, 1000, num_bytes_read); + if 
(num_bytes_read != 0){ + ir_buf.insert(ir_buf.end(), temp_ir_buf, temp_ir_buf+num_bytes_read); + } + }while (num_bytes_read == 1000); + zd.close(); + infile.close(); + /* std::cout << "IR BUFFER\n"; + for (size_t i = 0; i < ir_buf.size(); ++i) { + std::cout << ir_buf.data()[i]; + } + std::cout << "\n\n\n"; */ + BufferReader reader{size_checked_pointer_cast(ir_buf.data()), ir_buf.size()}; + char const* p; + size_t p_size; + //reader.peek_buffer(p, p_size); + //std::cout << "Num Bytes in buffer left: " << p_size << std::endl; + //for(int z = 0; z < p_size; z++){ + // std::cout << p[z]; + //} + //std::cout << std::endl; + + auto deserializer_result = Deserializer::create(reader); + if(deserializer_result.has_error()){ + m_archive_writer->close(); + return false; + } + auto& deserializer = deserializer_result.value(); + + + m_num_messages = 0; + //size_t bytes_consumed_up_to_prev_archive = 0; + //size_t bytes_consumed_up_to_prev_record = 0; + int iterations = 2; + do{ + iterations--; + //std::cerr << "In do while loop\n"; + auto const kv_log_event_result = deserializer.deserialize_to_next_log_event(reader); + //std::cerr << "After deserialize\n"; + + //reader.peek_buffer(p, p_size); + //std::cout << "Num Bytes in buffer left: " << p_size << std::endl; + //for(int z = 0; z < p_size; z++){ + // std::cout << p[z]; + //} + //std::cout << std::endl; + + if(kv_log_event_result.has_error()){ + //std::cerr << "has error\n"; + if(kv_log_event_result.error() == std::errc::no_message_available || kv_log_event_result.error() == std::errc::result_out_of_range){ + //std::cerr << "Breaking out of do while loop\n"; + break; + } + } + //std::cerr << "After error check\n"; + m_current_schema.clear(); + auto const& kv_log_event = kv_log_event_result.value(); + + //print_kv_log_event(kv_log_event); + //std::cerr << "before parse\n"; + parse_kv_log_event(kv_log_event, id_conversion_cache); + //std::cerr << "After parse\n"; + m_num_messages++; + //Implement archive splitting and size 
tracking + /* bytes_consumed_up_to_prev_record = json_file_iterator.get_num_bytes_consumed(); + if (m_archive_writer->get_data_size() >= m_target_encoded_size) { + m_archive_writer->increment_uncompressed_size( + bytes_consumed_up_to_prev_record - bytes_consumed_up_to_prev_archive + ); + bytes_consumed_up_to_prev_archive = bytes_consumed_up_to_prev_record; + split_archive(); + } */ + + m_current_parsed_message.clear(); + + } while(1);//while(iterations > 0); + //std::cout << "Out of do while loop\n"; + + } + return true; +} + +bool JsonParser::parse_to_IR(){ + return true; +} + void JsonParser::store() { m_archive_writer->close(); } diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index 84aa27fef..90d3bef39 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -22,6 +22,19 @@ #include "TimestampDictionaryWriter.hpp" #include "Utils.hpp" #include "ZstdCompressor.hpp" +#include "../clp/ffi/ir_stream/Deserializer.hpp" +#include "../clp/BufferReader.hpp" +#include "../clp/type_utils.hpp" +#include "../clp/ffi/Value.hpp" +#include "../clp/ffi/KeyValuePairLogEvent.hpp" +#include "../clp/ffi/SchemaTree.hpp" +#include "../clp/ffi/SchemaTreeNode.hpp" +#include "../clp/ffi/utils.hpp" + +using clp::size_checked_pointer_cast; +using clp::BufferReader; +using clp::ffi::ir_stream::Deserializer; +using clp::ffi::KeyValuePairLogEvent; using namespace simdjson; @@ -38,6 +51,14 @@ struct JsonParserOption { std::shared_ptr metadata_db; }; +struct JsonToIRParserOption { + std::vector file_paths; + std::string irs_dir; + size_t max_document_size; + int compression_level; + std::shared_ptr metadata_db; +}; + class JsonParser { public: class OperationFailed : public TraceableException { @@ -50,6 +71,8 @@ class JsonParser { // Constructor explicit JsonParser(JsonParserOption const& option); + JsonParser(JsonToIRParserOption const& option); + // Destructor ~JsonParser() = default; @@ -59,6 
+82,18 @@ class JsonParser { */ [[nodiscard]] bool parse(); + /** + * Parses the Key Value IR Stream and stores the data in the archive. + * @return whether the IR Stream was parsed succesfully + */ + [[nodiscard]] bool parse_from_IR(); + + /** + * Parses the JSON log messages to the Key Value IR Stream format. + * @return whether the JSON was parsed succesfully + */ + [[nodiscard]] bool parse_to_IR(); + /** * Writes the metadata and archive data to disk. */ @@ -74,6 +109,22 @@ class JsonParser { */ void parse_line(ondemand::value line, int32_t parent_node_id, std::string const& key); + /** + * Parses a Key Value Log Event + * @param kv the key value log event + * @param cache cache of node id conversions between deserializer schema tree nodes and archive schema tree nodes + */ + void parse_kv_log_event(KeyValuePairLogEvent const& kv, std::map, int>& cache); + + /** + * Get archive node id for ir node + * @param cache cache of node id conversions between deserializer schema tree nodes and archive schema tree nodes + * @param irNodeID + * @param irType + * @param irTree + */ + int get_archive_node_id(std::map < std::tuple, int>& cache, int irNodeID, NodeType archiveNodeType, clp::ffi::SchemaTree const& irTree); + /** * Parses an array within a JSON line * @param line the JSON array diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index 0e0401ad1..5a7da79db 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -48,6 +48,14 @@ namespace { */ bool compress(CommandLineArguments const& command_line_arguments); +/** + * Compresses the input IR files specified by the command line arguments into an archive. + * @param command_line_arguments + * @return Whether compression was successful + */ +bool IR_compress(CommandLineArguments const& command_line_arguments); + + /** * Decompresses the archive specified by the given JsonConstructorOption. 
* @param json_constructor_option @@ -116,6 +124,58 @@ bool compress(CommandLineArguments const& command_line_arguments) { return true; } +bool IR_compress(CommandLineArguments const& command_line_arguments) { + auto archives_dir = std::filesystem::path(command_line_arguments.get_archives_dir()); + + // Create output directory in case it doesn't exist + try { + std::filesystem::create_directory(archives_dir.string()); + } catch (std::exception& e) { + SPDLOG_ERROR( + "Failed to create archives directory {} - {}", + archives_dir.string(), + e.what() + ); + return false; + } + + clp_s::JsonParserOption option{}; + option.file_paths = command_line_arguments.get_file_paths(); + option.archives_dir = archives_dir.string(); + option.target_encoded_size = command_line_arguments.get_target_encoded_size(); + //Do I need max_document_size() + option.max_document_size = command_line_arguments.get_max_document_size(); + option.compression_level = command_line_arguments.get_compression_level(); + option.timestamp_key = command_line_arguments.get_timestamp_key(); + option.print_archive_stats = command_line_arguments.print_archive_stats(); + //Is this an option they can make after IR or is that made before and has to be what is in the IR stream already + //option.structurize_arrays = command_line_arguments.get_structurize_arrays(); + + auto const& db_config_container = command_line_arguments.get_metadata_db_config(); + if (db_config_container.has_value()) { + auto const& db_config = db_config_container.value(); + option.metadata_db = std::make_shared( + db_config.get_metadata_db_host(), + db_config.get_metadata_db_port(), + db_config.get_metadata_db_username(), + db_config.get_metadata_db_password(), + db_config.get_metadata_db_name(), + db_config.get_metadata_table_prefix() + ); + } + + clp_s::JsonParser parser(option); + if (false == parser.parse_from_IR()) { + SPDLOG_ERROR("Encountered error while parsing input"); + return false; + }else{ + std::cout << "Got True Back\n"; + } + 
parser.store(); + std::cout << "stored the archive\n"; + return true; +} + void decompress_archive(clp_s::JsonConstructorOption const& json_constructor_option) { clp_s::JsonConstructor constructor(json_constructor_option); constructor.store(); @@ -263,7 +323,13 @@ int main(int argc, char const* argv[]) { if (false == compress(command_line_arguments)) { return 1; } - } else if (CommandLineArguments::Command::Extract == command_line_arguments.get_command()) { + } else if (CommandLineArguments::Command::IR_Compress == command_line_arguments.get_command()) { + if (false == IR_compress(command_line_arguments)) { + return 1; + } + } else if (CommandLineArguments::Command::Json_To_IR == command_line_arguments.get_command()) { + return 1; + }else if (CommandLineArguments::Command::Extract == command_line_arguments.get_command()) { auto const& archives_dir = command_line_arguments.get_archives_dir(); if (false == std::filesystem::is_directory(archives_dir)) { SPDLOG_ERROR("'{}' is not a directory.", archives_dir); From b631e98c9e4e9cdbae441e132a4149d48d25fe9a Mon Sep 17 00:00:00 2001 From: AVMatthews Date: Fri, 20 Sep 2024 21:32:22 -0400 Subject: [PATCH 02/15] JSON -> IRV2 functionality exposed --- components/core/src/clp_s/CMakeLists.txt | 11 +- .../core/src/clp_s/CommandLineArguments.cpp | 8 +- .../core/src/clp_s/CommandLineArguments.hpp | 4 +- components/core/src/clp_s/JsonParser.cpp | 55 +++---- components/core/src/clp_s/JsonParser.hpp | 9 +- components/core/src/clp_s/clp-s.cpp | 143 +++++++++++++++++- 6 files changed, 173 insertions(+), 57 deletions(-) diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt index 456f53c20..7b684632c 100644 --- a/components/core/src/clp_s/CMakeLists.txt +++ b/components/core/src/clp_s/CMakeLists.txt @@ -30,6 +30,10 @@ set( ../clp/WriterInterface.hpp ../clp/ffi/ir_stream/Deserializer.hpp ../clp/ffi/ir_stream/Deserializer.cpp + ../clp/ffi/ir_stream/Serializer.hpp + 
../clp/ffi/ir_stream/Serializer.cpp + ../clp/ffi/ir_stream/utils.hpp + ../clp/ffi/ir_stream/utils.cpp ../clp/BufferReader.hpp ../clp/BufferReader.cpp ../clp/type_utils.hpp @@ -49,9 +53,12 @@ set( ../clp/ffi/SchemaTreeNode.hpp ../clp/ffi/Value.hpp ../clp/ffi/ir_stream/decoding_methods.hpp - ../clp/ffi/ir_stream/protocol_constants.hpp - ../clp/ffi/ir_stream/utils.hpp ../clp/ffi/ir_stream/decoding_methods.cpp + ../clp/ffi/ir_stream/encoding_methods.hpp + ../clp/ffi/ir_stream/encoding_methods.cpp + ../clp/ir/parsing.hpp + ../clp/ir/parsing.cpp + ../clp/ffi/ir_stream/protocol_constants.hpp ../clp/ffi/utils.hpp ../clp/ffi/utils.cpp ../clp/utf8_utils.hpp diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index 06c319057..25e858d62 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -432,10 +432,10 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { default_value(m_max_document_size), "Maximum allowed size (B) for a single document before ir generation fails." )( - "timestamp-key", - po::value(&m_timestamp_key)->value_name("TIMESTAMP_COLUMN_KEY")-> - default_value(m_timestamp_key), - "Path (e.g. x.y) for the field containing the log event's timestamp." 
+ "encoding-type", + po::value(&m_encoding_type)->value_name("ENCODING_TYPE")-> + default_value(m_encoding_type), + "4 (four byte encoding) or 8 (eight byte encoding)" )( "db-config-file", po::value(&metadata_db_config_file_path)->value_name("FILE")-> diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp index 60c480987..dedd3bd59 100644 --- a/components/core/src/clp_s/CommandLineArguments.hpp +++ b/components/core/src/clp_s/CommandLineArguments.hpp @@ -62,6 +62,8 @@ class CommandLineArguments { size_t get_max_document_size() const { return m_max_document_size; } + int get_encoding_type() const { return m_encoding_type; } + [[nodiscard]] bool print_archive_stats() const { return m_print_archive_stats; } std::string const& get_mongodb_uri() const { return m_mongodb_uri; } @@ -179,7 +181,7 @@ class CommandLineArguments { bool m_structurize_arrays{false}; bool m_ordered_decompression{false}; size_t m_ordered_chunk_size{0}; - + int m_encoding_type{8}; // Metadata db variables std::optional m_metadata_db_config; diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index a56387bfd..7c9ead88a 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -38,26 +38,6 @@ JsonParser::JsonParser(JsonParserOption const& option) m_archive_writer->open(m_archive_options); } -JsonParser::JsonParser(JsonToIRParserOption const& option) - : m_num_messages(0), - m_max_document_size(option.max_document_size) { - if (false == FileUtils::validate_path(option.file_paths)) { - exit(1); - } - - for (auto& file_path : option.file_paths) { - FileUtils::find_all_files(file_path, m_file_paths); - } - - m_archive_options.archives_dir = option.irs_dir; - m_archive_options.compression_level = option.compression_level; - //m_archive_options.print_archive_stats = option.print_archive_stats; - m_archive_options.id = m_generator(); - - m_archive_writer = 
std::make_unique(option.metadata_db); - m_archive_writer->open(m_archive_options); -} - void JsonParser::parse_obj_in_array(ondemand::object line, int32_t parent_node_id) { ondemand::object_iterator it = line.begin(); if (it == line.end()) { @@ -744,11 +724,12 @@ void JsonParser::parse_kv_log_event(KeyValuePairLogEvent const& kv, std::map, int> id_conversion_cache; m_archive_writer->add_node(-1, NodeType::Unknown, "root"); - + //int fileNum = 0; for (auto& file_path : m_file_paths) { + std::cout << file_path << std::endl; std::vector ir_buf; //Make function from reading in this file - char temp_ir_buf[1000]; + char temp_ir_buf[10000]; //char* new_ir_buf = (char *) malloc(ir_buf.size()); FileReader infile; infile.open(file_path); @@ -766,11 +747,11 @@ bool JsonParser::parse_from_IR() { size_t num_bytes_read = 0; do{ num_bytes_read = 0; - zd.try_read(temp_ir_buf, 1000, num_bytes_read); + zd.try_read(temp_ir_buf, 10000, num_bytes_read); if (num_bytes_read != 0){ ir_buf.insert(ir_buf.end(), temp_ir_buf, temp_ir_buf+num_bytes_read); } - }while (num_bytes_read == 1000); + }while (num_bytes_read == 10000); zd.close(); infile.close(); /* std::cout << "IR BUFFER\n"; @@ -799,9 +780,9 @@ bool JsonParser::parse_from_IR() { m_num_messages = 0; //size_t bytes_consumed_up_to_prev_archive = 0; //size_t bytes_consumed_up_to_prev_record = 0; - int iterations = 2; + //int iterations = 2; do{ - iterations--; + //iterations--; //std::cerr << "In do while loop\n"; auto const kv_log_event_result = deserializer.deserialize_to_next_log_event(reader); //std::cerr << "After deserialize\n"; @@ -825,33 +806,31 @@ bool JsonParser::parse_from_IR() { auto const& kv_log_event = kv_log_event_result.value(); //print_kv_log_event(kv_log_event); - //std::cerr << "before parse\n"; + /*if (fileNum > 0){ + std::cout << "before parse\n"; + print_kv_log_event(kv_log_event); + }*/ parse_kv_log_event(kv_log_event, id_conversion_cache); //std::cerr << "After parse\n"; m_num_messages++; //Implement archive 
splitting and size tracking - /* bytes_consumed_up_to_prev_record = json_file_iterator.get_num_bytes_consumed(); if (m_archive_writer->get_data_size() >= m_target_encoded_size) { - m_archive_writer->increment_uncompressed_size( - bytes_consumed_up_to_prev_record - bytes_consumed_up_to_prev_archive - ); - bytes_consumed_up_to_prev_archive = bytes_consumed_up_to_prev_record; + std::cerr << "Splitting Archive\n\n"; + id_conversion_cache.clear(); + m_archive_writer->add_node(-1, NodeType::Unknown, "root"); split_archive(); - } */ + } m_current_parsed_message.clear(); } while(1);//while(iterations > 0); //std::cout << "Out of do while loop\n"; - + id_conversion_cache.clear(); + //fileNum++; } return true; } -bool JsonParser::parse_to_IR(){ - return true; -} - void JsonParser::store() { m_archive_writer->close(); } diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index 90d3bef39..6927a178d 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -30,6 +30,7 @@ #include "../clp/ffi/SchemaTree.hpp" #include "../clp/ffi/SchemaTreeNode.hpp" #include "../clp/ffi/utils.hpp" +#include "../clp/ir/types.hpp" using clp::size_checked_pointer_cast; using clp::BufferReader; @@ -56,7 +57,7 @@ struct JsonToIRParserOption { std::string irs_dir; size_t max_document_size; int compression_level; - std::shared_ptr metadata_db; + int encoding; }; class JsonParser { @@ -88,12 +89,6 @@ class JsonParser { */ [[nodiscard]] bool parse_from_IR(); - /** - * Parses the JSON log messages to the Key Value IR Stream format. - * @return whether the JSON was parsed succesfully - */ - [[nodiscard]] bool parse_to_IR(); - /** * Writes the metadata and archive data to disk. 
*/ diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index 5a7da79db..f4c6a5b95 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -33,12 +34,16 @@ #include "TimestampPattern.hpp" #include "TraceableException.hpp" #include "Utils.hpp" +#include "FileWriter.hpp" +#include "ZstdCompressor.hpp" +#include "../clp/ffi/ir_stream/Serializer.hpp" using namespace clp_s::search; using clp_s::cArchiveFormatDevelopmentVersionFlag; using clp_s::cEpochTimeMax; using clp_s::cEpochTimeMin; using clp_s::CommandLineArguments; +using clp::ffi::ir_stream::Serializer; namespace { /** @@ -55,7 +60,6 @@ bool compress(CommandLineArguments const& command_line_arguments); */ bool IR_compress(CommandLineArguments const& command_line_arguments); - /** * Decompresses the archive specified by the given JsonConstructorOption. * @param json_constructor_option @@ -124,6 +128,136 @@ bool compress(CommandLineArguments const& command_line_arguments) { return true; } +template +auto flush_and_clear_serializer_buffer( + Serializer& serializer, + std::vector& byte_buf +) -> void { + auto const view{serializer.get_ir_buf_view()}; + byte_buf.insert(byte_buf.cend(), view.begin(), view.end()); + serializer.clear_ir_buf(); +} + +template +auto unpack_and_serialize_msgpack_bytes( + std::vector const& msgpack_bytes, + Serializer& serializer +) -> bool { + auto const msgpack_obj_handle{msgpack::unpack( + clp::size_checked_pointer_cast(msgpack_bytes.data()), + msgpack_bytes.size() + )}; + auto const msgpack_obj{msgpack_obj_handle.get()}; + if (msgpack::type::MAP != msgpack_obj.type) { + return false; + } + return serializer.serialize_msgpack_map(msgpack_obj.via.map); +} + +template +auto run_serializer(clp_s::JsonToIRParserOption option, std::string path){ + //std::cout << "Running Serializer\n"; + auto result{Serializer::create()}; + if (result.has_error()){ + 
SPDLOG_ERROR("Failed to create Serializer"); + return false; + } + auto& serializer{result.value()}; + std::vector ir_buf; + flush_and_clear_serializer_buffer(serializer, ir_buf); + + std::ifstream inFile; + inFile.open(path, std::ifstream::in); + //std::cout << "Opened Input file\n"; + + std::string outPath = ""; + int index = path.find_last_of('/'); + if(std::string::npos == index){ + outPath = option.irs_dir + "/" + path + ".ir"; + }else{ + outPath = option.irs_dir + "/" + path.substr(index, path.length()-index) + ".ir"; + } + clp_s::FileWriter outFile; + //std::cout << outPath << "\n"; + outFile.open(outPath, clp_s::FileWriter::OpenMode::CreateForWriting); + clp_s::ZstdCompressor zc; + zc.open(outFile, option.compression_level); + + std::string line; + size_t totalSize = 0; + + if (inFile.is_open()) { + while (getline(inFile, line)) { + auto j_obj = nlohmann::json::parse(line); + unpack_and_serialize_msgpack_bytes(nlohmann::json::to_msgpack(j_obj), serializer); + flush_and_clear_serializer_buffer(serializer, ir_buf); + if(ir_buf.size() >= 1000000000){ + totalSize = totalSize + ir_buf.size(); + zc.write(reinterpret_cast(ir_buf.data()), ir_buf.size()); + zc.flush(); + ir_buf.clear(); + } + } + totalSize = totalSize + ir_buf.size(); + zc.write(reinterpret_cast(ir_buf.data()), ir_buf.size()); + zc.flush(); + ir_buf.clear(); + inFile.close(); + zc.close(); + outFile.close(); + } + + return true; +} + +bool generate_IR(CommandLineArguments const& command_line_arguments){ + auto irs_dir = std::filesystem::path(command_line_arguments.get_archives_dir()); + + // Create output directory in case it doesn't exist + try { + std::filesystem::create_directory(irs_dir.string()); + } catch (std::exception& e) { + SPDLOG_ERROR( + "Failed to create archives directory {} - {}", + irs_dir.string(), + e.what() + ); + return false; + } + clp_s::JsonToIRParserOption option{}; + option.file_paths = command_line_arguments.get_file_paths(); + option.irs_dir = irs_dir.string(); + 
//std::cout << "IRs dir: " << option.irs_dir << std::endl; + option.max_document_size = command_line_arguments.get_max_document_size(); + option.compression_level = command_line_arguments.get_compression_level(); + option.encoding = command_line_arguments.get_encoding_type(); + //std::cout << "encoding type: " << static_cast(option.encoding) << std::endl; + + if (false == clp_s::FileUtils::validate_path(option.file_paths)) { + exit(1); + } + + std::vector all_file_paths; + for (auto& file_path : option.file_paths) { + clp_s::FileUtils::find_all_files(file_path, all_file_paths); + } + + for (auto& path : all_file_paths) { + bool success; + if (option.encoding == 4){ + //std::cout << "four byte\n"; + success = run_serializer(option, path); + }else{ + //std::cout << "eight byte\n"; + success = run_serializer(option, path); + } + if (false == success){ + return false; + } + } + return true; +} + bool IR_compress(CommandLineArguments const& command_line_arguments) { auto archives_dir = std::filesystem::path(command_line_arguments.get_archives_dir()); @@ -168,11 +302,8 @@ bool IR_compress(CommandLineArguments const& command_line_arguments) { if (false == parser.parse_from_IR()) { SPDLOG_ERROR("Encountered error while parsing input"); return false; - }else{ - std::cout << "Got True Back\n"; } parser.store(); - std::cout << "stored the archive\n"; return true; } @@ -328,7 +459,9 @@ int main(int argc, char const* argv[]) { return 1; } } else if (CommandLineArguments::Command::Json_To_IR == command_line_arguments.get_command()) { - return 1; + if (false == generate_IR(command_line_arguments)) { + return 1; + } }else if (CommandLineArguments::Command::Extract == command_line_arguments.get_command()) { auto const& archives_dir = command_line_arguments.get_archives_dir(); if (false == std::filesystem::is_directory(archives_dir)) { From 83084c004cdde44ae13da2174347681652c9e315 Mon Sep 17 00:00:00 2001 From: AVMatthews Date: Sat, 21 Sep 2024 06:27:45 -0400 Subject: [PATCH 03/15] 
Linting and some code clean up --- .../core/src/clp_s/CommandLineArguments.cpp | 28 +- components/core/src/clp_s/JsonParser.cpp | 393 +++++++++--------- components/core/src/clp_s/JsonParser.hpp | 38 +- components/core/src/clp_s/clp-s.cpp | 87 ++-- 4 files changed, 275 insertions(+), 271 deletions(-) diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index 25e858d62..0435cd3f6 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -324,11 +324,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { "print-archive-stats", po::bool_switch(&m_print_archive_stats), "Print statistics (json) about the archive after it's compressed." - )/*( - "structurize-arrays", - po::bool_switch(&m_structurize_arrays), - "Structurize arrays instead of compressing them as clp strings." - ) */; + ); // clang-format on po::positional_options_description positional_options; @@ -354,10 +350,10 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { if (parsed_command_line_options.count("help")) { print_IR_compression_usage(); - /* std::cerr << "Examples:" << std::endl; - std::cerr << " # Compress file1.json and dir1 into archives-dir" << std::endl; - std::cerr << " " << m_program_name << " c archives-dir file1.json dir1" - << std::endl; */ + std::cerr << "Examples:" << std::endl; + std::cerr << " # Compress file1.ir and dir1 into archives-dir" << std::endl; + std::cerr << " " << m_program_name << " i archives-dir file1.ir dir1" + << std::endl; po::options_description visible_options; visible_options.add(general_options); @@ -403,7 +399,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { m_metadata_db_config = std::move(metadata_db_config); } - }else if ((char)Command::Json_To_IR == command_input) { + } else if ((char)Command::Json_To_IR == command_input) { po::options_description 
compression_positional_options; // clang-format off compression_positional_options.add_options()( @@ -473,10 +469,10 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { if (parsed_command_line_options.count("help")) { print_json_to_IR_usage(); - /* std::cerr << "Examples:" << std::endl; - std::cerr << " # Compress file1.json and dir1 into archives-dir" << std::endl; - std::cerr << " " << m_program_name << " c archives-dir file1.json dir1" - << std::endl; */ + std::cerr << "Examples:" << std::endl; + std::cerr << " # Parse file1.json and dir1 into irs-dir" << std::endl; + std::cerr << " " << m_program_name << " r irs-dir file1.json dir1" + << std::endl; po::options_description visible_options; visible_options.add(general_options); @@ -522,7 +518,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { m_metadata_db_config = std::move(metadata_db_config); } - }else if ((char)Command::Extract == command_input) { + } else if ((char)Command::Extract == command_input) { po::options_description extraction_options; // clang-format off extraction_options.add_options()( @@ -1046,7 +1042,7 @@ void CommandLineArguments::print_search_usage() const { } void CommandLineArguments::print_json_to_IR_usage() const { - std::cerr << "Usage: " << m_program_name << " r [OPTIONS] ARCHIVES_DIR [FILE/DIR ...]" + std::cerr << "Usage: " << m_program_name << " r [OPTIONS] IRS_DIR [FILE/DIR ...]" << std::endl; } diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index 7c9ead88a..6647d3d7a 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -1,7 +1,7 @@ #include "JsonParser.hpp" -#include #include +#include #include #include @@ -521,199 +521,233 @@ bool JsonParser::parse() { return true; } -NodeType get_archive_node_type(clp::ffi::SchemaTreeNode const& node, std::pair> p){ - //std::cerr << "In get_archive_node_type\n"; +NodeType get_archive_node_type( + 
clp::ffi::SchemaTreeNode const& node, + std::pair> p +) { auto const node_type = node.get_type(); - //std::cerr << "got ir type\n"; - //figure out what type the node is in archive node type + // figure out what type the node is in archive node type NodeType archiveNodeType; - switch(node_type){ - case clp::ffi::SchemaTreeNode::Type::Int : - archiveNodeType = NodeType::Integer; - break; - case clp::ffi::SchemaTreeNode::Type::Float : - archiveNodeType = NodeType::Float; - break; - case clp::ffi::SchemaTreeNode::Type::Bool : - archiveNodeType = NodeType::Boolean; - break; - case clp::ffi::SchemaTreeNode::Type::UnstructuredArray : - archiveNodeType = NodeType::UnstructuredArray; - break; - case clp::ffi::SchemaTreeNode::Type::Str : - //std::cerr << "In str\n"; - if(p.second.value().is()){ - //maybe special case for date string - archiveNodeType = NodeType::VarString; - }else{ - archiveNodeType = NodeType::ClpString; - } - break; - case clp::ffi::SchemaTreeNode::Type::Obj : - //std::cerr << "In obj\n"; - if(p.second.has_value()){ - if(p.second.value().is_null()){ - //std::cout << "Found Null\n"; - archiveNodeType = NodeType::NullValue; - }else{ - archiveNodeType = NodeType::Object; - } - }else{ + switch (node_type) { + case clp::ffi::SchemaTreeNode::Type::Int: + archiveNodeType = NodeType::Integer; + break; + case clp::ffi::SchemaTreeNode::Type::Float: + archiveNodeType = NodeType::Float; + break; + case clp::ffi::SchemaTreeNode::Type::Bool: + archiveNodeType = NodeType::Boolean; + break; + case clp::ffi::SchemaTreeNode::Type::UnstructuredArray: + archiveNodeType = NodeType::UnstructuredArray; + break; + case clp::ffi::SchemaTreeNode::Type::Str: + // std::cerr << "In str\n"; + if (p.second.value().is()) { + // maybe special case for date string + archiveNodeType = NodeType::VarString; + } else { + archiveNodeType = NodeType::ClpString; + } + break; + case clp::ffi::SchemaTreeNode::Type::Obj: + // std::cerr << "In obj\n"; + if (p.second.has_value()) { + if 
(p.second.value().is_null()) { + // std::cout << "Found Null\n"; + archiveNodeType = NodeType::NullValue; + } else { archiveNodeType = NodeType::Object; } - break; - default : - archiveNodeType = NodeType::Unknown; - break; - //Do I need to do anything for structured arrays + } else { + archiveNodeType = NodeType::Object; + } + break; + default: + archiveNodeType = NodeType::Unknown; + break; } - //std::cerr << "After Switch\n"; return archiveNodeType; } // -int JsonParser::get_archive_node_id(std::map< std::tuple, int>& cache, int irNodeID, NodeType archiveNodeType, clp::ffi::SchemaTree const& irTree){ - //std::cerr << "In get archive node id\n"; - std::tuple key (irNodeID, archiveNodeType); - if(cache.find(key) != cache.end()){ - //std::cerr << "Found value\n"; +int JsonParser::get_archive_node_id( + std::map, int>& cache, + int irNodeID, + NodeType archiveNodeType, + clp::ffi::SchemaTree const& irTree +) { + std::tuple key(irNodeID, archiveNodeType); + if (cache.find(key) != cache.end()) { return cache[key]; } - auto& currNode = irTree.get_node(irNodeID); - //std::cerr << "Got node\n"; + auto& currNode = irTree.get_node(irNodeID); int parent_node_id; - if(currNode.get_parent_id() == 0){ - //std::cout << "Hit the root\n"; + // Found the root + if (currNode.get_parent_id() == 0) { parent_node_id = 0; - }else{ - //std::cerr << "Look for parent id\n"; - parent_node_id = get_archive_node_id(cache, currNode.get_parent_id(), NodeType::Object, irTree); - //std::cerr << "Got parent id\n"; + } else { + parent_node_id + = get_archive_node_id(cache, currNode.get_parent_id(), NodeType::Object, irTree); } - std::string nodeKey = clp::ffi::validate_and_escape_utf8_string(currNode.get_key_name()).value(); - //std::string nodeKey = static_cast(validated_key); + std::string nodeKey + = clp::ffi::validate_and_escape_utf8_string(currNode.get_key_name()).value(); int curr_node_archive_id = m_archive_writer->add_node(parent_node_id, archiveNodeType, nodeKey); - //std::cerr << "Added 
node to archive\n"; cache[key] = curr_node_archive_id; - //std::cerr << "Added to cache\n"; return curr_node_archive_id; } -void print_kv_log_event(KeyValuePairLogEvent const& kv){ +void print_kv_log_event(KeyValuePairLogEvent const& kv) { auto const num_kv_pairs = kv.get_node_id_value_pairs().size(); std::cout << "number of kv pairs: " << num_kv_pairs << std::endl; auto const& tree = kv.get_schema_tree(); - for (auto const &pair: kv.get_node_id_value_pairs()){ + for (auto const& pair : kv.get_node_id_value_pairs()) { auto const& tree_node = tree.get_node(pair.first); auto const node_type = tree_node.get_type(); - switch(node_type){ - case clp::ffi::SchemaTreeNode::Type::Int : std::cout << "Int" << std::endl; break; - case clp::ffi::SchemaTreeNode::Type::Float : std::cout << "Float" << std::endl; break; - case clp::ffi::SchemaTreeNode::Type::Bool : std::cout << "Bool" << std::endl; break; - case clp::ffi::SchemaTreeNode::Type::Str : std::cout << "Str" << std::endl; break; - case clp::ffi::SchemaTreeNode::Type::UnstructuredArray : std::cout << "UArray" << std::endl; break; - case clp::ffi::SchemaTreeNode::Type::Obj : std::cout << "Obj" << std::endl; break; - default : std::cout << "???" << std::endl; break; + switch (node_type) { + case clp::ffi::SchemaTreeNode::Type::Int: + std::cout << "Int" << std::endl; + break; + case clp::ffi::SchemaTreeNode::Type::Float: + std::cout << "Float" << std::endl; + break; + case clp::ffi::SchemaTreeNode::Type::Bool: + std::cout << "Bool" << std::endl; + break; + case clp::ffi::SchemaTreeNode::Type::Str: + std::cout << "Str" << std::endl; + break; + case clp::ffi::SchemaTreeNode::Type::UnstructuredArray: + std::cout << "UArray" << std::endl; + break; + case clp::ffi::SchemaTreeNode::Type::Obj: + std::cout << "Obj" << std::endl; + break; + default: + std::cout << "???" << std::endl; + break; } - - if(!pair.second.has_value()){ - std::cout << "{??:\t" << pair.first << ": Node doesn't have Value ... 
EMPTY OBJ}\n"; - continue; + + if (!pair.second.has_value()) { + std::cout << "{??:\t" << pair.first << ": Node doesn't have Value ... EMPTY OBJ}\n"; + continue; } - if(pair.second.value().is()){ - std::cout << "{INT:\t" << pair.first << ": " << pair.second.value().get_immutable_view() << "}\n"; - }else if(pair.second.value().is()){ - std::cout << "{FLOAT:\t" << pair.first << ": " << pair.second.value().get_immutable_view() << "}\n"; - }else if(pair.second.value().is()){ - std::cout << "{BOOL:\t" << pair.first << ": " << pair.second.value().get_immutable_view() << "}\n"; - }else if(pair.second.value().is()){ - std::cout << "{STRING:\t" << pair.first << ": " << pair.second.value().get_immutable_view() << "}\n"; - }else if(pair.second.value().is()){ - std::cout << "{EIGHTByte:\t" << pair.first << ": \n"; - auto decoded = pair.second.value().get_immutable_view().decode_and_unparse(); - if(std::nullopt != decoded){ - std:: cout << "\t Decoded & Unparsed: "<< decoded.value()<< std::endl; - }else{ - std::cout << "\tNULL\n"; - } - std::cout << "}\n"; - }else if(pair.second.value().is()){ - std::cout << "{FOURByte:\t" << pair.first << ": \n"; - auto decoded = pair.second.value().get_immutable_view().decode_and_unparse(); - if(std::nullopt != decoded){ - std:: cout << "\tDecoded & Unparsed: "<< decoded.value() << std::endl; - }else{ - std::cout << "\tNULL\n"; - } - std::cout << "}\n"; - }else{ - std::cout << "Unknown Type:\t" << pair.first << "\n"; + if (pair.second.value().is()) { + std::cout << "{INT:\t" << pair.first << ": " + << pair.second.value().get_immutable_view() << "}\n"; + } else if (pair.second.value().is()) { + std::cout << "{FLOAT:\t" << pair.first << ": " + << pair.second.value().get_immutable_view() << "}\n"; + } else if (pair.second.value().is()) { + std::cout << "{BOOL:\t" << pair.first << ": " + << pair.second.value().get_immutable_view() << "}\n"; + } else if (pair.second.value().is()) { + std::cout << "{STRING:\t" << pair.first << ": " + << 
pair.second.value().get_immutable_view() << "}\n"; + } else if (pair.second.value().is()) { + std::cout << "{EIGHTByte:\t" << pair.first << ": \n"; + auto decoded = pair.second.value() + .get_immutable_view() + .decode_and_unparse(); + if (std::nullopt != decoded) { + std::cout << "\t Decoded & Unparsed: " << decoded.value() << std::endl; + } else { + std::cout << "\tNULL\n"; + } + std::cout << "}\n"; + } else if (pair.second.value().is()) { + std::cout << "{FOURByte:\t" << pair.first << ": \n"; + auto decoded = pair.second.value() + .get_immutable_view() + .decode_and_unparse(); + if (std::nullopt != decoded) { + std::cout << "\tDecoded & Unparsed: " << decoded.value() << std::endl; + } else { + std::cout << "\tNULL\n"; + } + std::cout << "}\n"; + } else { + std::cout << "Unknown Type:\t" << pair.first << "\n"; } - } std::cout << "after for loop\n\n\n"; } -void JsonParser::parse_kv_log_event(KeyValuePairLogEvent const& kv, std::map, int>& cache){ +void JsonParser::parse_kv_log_event( + KeyValuePairLogEvent const& kv, + std::map, int>& cache +) { auto const num_kv_pairs = kv.get_node_id_value_pairs().size(); clp::ffi::SchemaTree const& tree = kv.get_schema_tree(); - //std::cerr << "In parse\n"; - for (auto const& pair: kv.get_node_id_value_pairs()){ - //std::cerr << "In for loop\n"; + + for (auto const& pair : kv.get_node_id_value_pairs()) { clp::ffi::SchemaTreeNode const& tree_node = tree.get_node(pair.first); - //std::cerr << "After get node\n"; NodeType archiveNodeType = get_archive_node_type(tree_node, pair); - //std::cerr << "After get archive node type\n"; int node_id = get_archive_node_id(cache, pair.first, archiveNodeType, tree); - //std::cerr << "After get_archive_node_id\n"; - //std::cerr << node_id << std::endl; - switch(archiveNodeType){ - case NodeType::Integer :{ + + switch (archiveNodeType) { + case NodeType::Integer: { int64_t i64_value = pair.second.value().get_immutable_view(); m_current_parsed_message.add_value(node_id, i64_value); - }break; - 
case NodeType::Float :{ + } break; + case NodeType::Float: { double d_value = pair.second.value().get_immutable_view(); m_current_parsed_message.add_value(node_id, d_value); - }break; - case NodeType::Boolean :{ + } break; + case NodeType::Boolean: { bool b_value = pair.second.value().get_immutable_view(); m_current_parsed_message.add_value(node_id, b_value); - }break; - case NodeType::VarString :{ - std::string str = clp::ffi::validate_and_escape_utf8_string(pair.second.value().get_immutable_view()).value(); + } break; + case NodeType::VarString: { + std::string str = clp::ffi::validate_and_escape_utf8_string( + pair.second.value().get_immutable_view() + ) + .value(); m_current_parsed_message.add_value(node_id, str); - }break; - case NodeType::ClpString :{ - //auto const node_type = tree_node.get_type(); + } break; + case NodeType::ClpString: { std::string encoded_str; - ///Do I need to reparse these? Do I need to convert 4bytes to 8bytes .... how? - if(pair.second.value().is()){ - std::string decodedValue = pair.second.value().get_immutable_view().decode_and_unparse().value(); - encoded_str = clp::ffi::validate_and_escape_utf8_string(decodedValue.c_str()).value(); - }else{ - std::string decodedValue = pair.second.value().get_immutable_view().decode_and_unparse().value(); - encoded_str = clp::ffi::validate_and_escape_utf8_string(decodedValue.c_str()).value(); + if (pair.second.value().is()) { + std::string decodedValue + = pair.second.value() + .get_immutable_view() + .decode_and_unparse() + .value(); + encoded_str = clp::ffi::validate_and_escape_utf8_string(decodedValue.c_str()) + .value(); + } else { + std::string decodedValue + = pair.second.value() + .get_immutable_view() + .decode_and_unparse() + .value(); + encoded_str = clp::ffi::validate_and_escape_utf8_string(decodedValue.c_str()) + .value(); } m_current_parsed_message.add_value(node_id, encoded_str); - }break; - case NodeType::UnstructuredArray :{ - //auto const encoded_type = tree_node.get_type(); + } 
break; + case NodeType::UnstructuredArray: { std::string array_str; - if(pair.second.value().is()){ - array_str = pair.second.value().get_immutable_view().decode_and_unparse().value(); - }else{ - array_str = pair.second.value().get_immutable_view().decode_and_unparse().value(); + if (pair.second.value().is()) { + array_str = pair.second.value() + .get_immutable_view() + .decode_and_unparse() + .value(); + } else { + array_str = pair.second.value() + .get_immutable_view() + .decode_and_unparse() + .value(); } m_current_parsed_message.add_value(node_id, array_str); break; } - default : - //Don't need to add value for obj or null + default: + // Don't need to add value for obj or null break; } m_current_schema.insert_ordered(node_id); - } + } int32_t current_schema_id = m_archive_writer->add_schema(m_current_schema); m_current_parsed_message.set_id(current_schema_id); @@ -722,98 +756,67 @@ void JsonParser::parse_kv_log_event(KeyValuePairLogEvent const& kv, std::map, int> id_conversion_cache; + std::map, int> id_conversion_cache; m_archive_writer->add_node(-1, NodeType::Unknown, "root"); - //int fileNum = 0; + for (auto& file_path : m_file_paths) { std::cout << file_path << std::endl; std::vector ir_buf; - //Make function from reading in this file - char temp_ir_buf[10000]; - //char* new_ir_buf = (char *) malloc(ir_buf.size()); + char temp_ir_buf[10'000]; FileReader infile; infile.open(file_path); - if(false == infile.is_open()){ + if (false == infile.is_open()) { m_archive_writer->close(); return false; } int fsize = std::filesystem::file_size(file_path); - if(0 == fsize){ + if (0 == fsize) { m_archive_writer->close(); return false; } ZstdDecompressor zd; zd.open(infile, fsize); size_t num_bytes_read = 0; - do{ + do { num_bytes_read = 0; - zd.try_read(temp_ir_buf, 10000, num_bytes_read); - if (num_bytes_read != 0){ - ir_buf.insert(ir_buf.end(), temp_ir_buf, temp_ir_buf+num_bytes_read); + zd.try_read(temp_ir_buf, 10'000, num_bytes_read); + if (num_bytes_read != 0) { + 
ir_buf.insert(ir_buf.end(), temp_ir_buf, temp_ir_buf + num_bytes_read); } - }while (num_bytes_read == 10000); + } while (num_bytes_read == 10'000); zd.close(); - infile.close(); - /* std::cout << "IR BUFFER\n"; - for (size_t i = 0; i < ir_buf.size(); ++i) { - std::cout << ir_buf.data()[i]; - } - std::cout << "\n\n\n"; */ + infile.close(); + BufferReader reader{size_checked_pointer_cast(ir_buf.data()), ir_buf.size()}; char const* p; size_t p_size; - //reader.peek_buffer(p, p_size); - //std::cout << "Num Bytes in buffer left: " << p_size << std::endl; - //for(int z = 0; z < p_size; z++){ - // std::cout << p[z]; - //} - //std::cout << std::endl; auto deserializer_result = Deserializer::create(reader); - if(deserializer_result.has_error()){ + if (deserializer_result.has_error()) { m_archive_writer->close(); return false; } auto& deserializer = deserializer_result.value(); - m_num_messages = 0; - //size_t bytes_consumed_up_to_prev_archive = 0; - //size_t bytes_consumed_up_to_prev_record = 0; - //int iterations = 2; - do{ - //iterations--; - //std::cerr << "In do while loop\n"; + do { auto const kv_log_event_result = deserializer.deserialize_to_next_log_event(reader); - //std::cerr << "After deserialize\n"; - - //reader.peek_buffer(p, p_size); - //std::cout << "Num Bytes in buffer left: " << p_size << std::endl; - //for(int z = 0; z < p_size; z++){ - // std::cout << p[z]; - //} - //std::cout << std::endl; - - if(kv_log_event_result.has_error()){ - //std::cerr << "has error\n"; - if(kv_log_event_result.error() == std::errc::no_message_available || kv_log_event_result.error() == std::errc::result_out_of_range){ - //std::cerr << "Breaking out of do while loop\n"; + + if (kv_log_event_result.has_error()) { + if (kv_log_event_result.error() == std::errc::no_message_available + || kv_log_event_result.error() == std::errc::result_out_of_range) + { break; } } - //std::cerr << "After error check\n"; + m_current_schema.clear(); auto const& kv_log_event = 
kv_log_event_result.value(); - //print_kv_log_event(kv_log_event); - /*if (fileNum > 0){ - std::cout << "before parse\n"; - print_kv_log_event(kv_log_event); - }*/ + // print_kv_log_event(kv_log_event); parse_kv_log_event(kv_log_event, id_conversion_cache); - //std::cerr << "After parse\n"; + m_num_messages++; - //Implement archive splitting and size tracking if (m_archive_writer->get_data_size() >= m_target_encoded_size) { std::cerr << "Splitting Archive\n\n"; id_conversion_cache.clear(); @@ -823,10 +826,8 @@ bool JsonParser::parse_from_IR() { m_current_parsed_message.clear(); - } while(1);//while(iterations > 0); - //std::cout << "Out of do while loop\n"; + } while (1); id_conversion_cache.clear(); - //fileNum++; } return true; } diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index 6927a178d..1e1aa96fa 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -9,7 +9,16 @@ #include #include +#include "../clp/BufferReader.hpp" +#include "../clp/ffi/ir_stream/Deserializer.hpp" +#include "../clp/ffi/KeyValuePairLogEvent.hpp" +#include "../clp/ffi/SchemaTree.hpp" +#include "../clp/ffi/SchemaTreeNode.hpp" +#include "../clp/ffi/utils.hpp" +#include "../clp/ffi/Value.hpp" #include "../clp/GlobalMySQLMetadataDB.hpp" +#include "../clp/ir/types.hpp" +#include "../clp/type_utils.hpp" #include "ArchiveWriter.hpp" #include "DictionaryWriter.hpp" #include "FileReader.hpp" @@ -22,20 +31,11 @@ #include "TimestampDictionaryWriter.hpp" #include "Utils.hpp" #include "ZstdCompressor.hpp" -#include "../clp/ffi/ir_stream/Deserializer.hpp" -#include "../clp/BufferReader.hpp" -#include "../clp/type_utils.hpp" -#include "../clp/ffi/Value.hpp" -#include "../clp/ffi/KeyValuePairLogEvent.hpp" -#include "../clp/ffi/SchemaTree.hpp" -#include "../clp/ffi/SchemaTreeNode.hpp" -#include "../clp/ffi/utils.hpp" -#include "../clp/ir/types.hpp" -using clp::size_checked_pointer_cast; using 
clp::BufferReader; using clp::ffi::ir_stream::Deserializer; using clp::ffi::KeyValuePairLogEvent; +using clp::size_checked_pointer_cast; using namespace simdjson; @@ -107,18 +107,28 @@ class JsonParser { /** * Parses a Key Value Log Event * @param kv the key value log event - * @param cache cache of node id conversions between deserializer schema tree nodes and archive schema tree nodes + * @param cache cache of node id conversions between deserializer schema tree nodes and archive + * schema tree nodes */ - void parse_kv_log_event(KeyValuePairLogEvent const& kv, std::map, int>& cache); + void parse_kv_log_event( + KeyValuePairLogEvent const& kv, + std::map, int>& cache + ); /** * Get archive node id for ir node - * @param cache cache of node id conversions between deserializer schema tree nodes and archive schema tree nodes + * @param cache cache of node id conversions between deserializer schema tree nodes and archive + * schema tree nodes * @param irNodeID * @param irType * @param irTree */ - int get_archive_node_id(std::map < std::tuple, int>& cache, int irNodeID, NodeType archiveNodeType, clp::ffi::SchemaTree const& irTree); + int get_archive_node_id( + std::map, int>& cache, + int irNodeID, + NodeType archiveNodeType, + clp::ffi::SchemaTree const& irTree + ); /** * Parses an array within a JSON line diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index f4c6a5b95..911e92c1d 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -1,22 +1,24 @@ #include #include +#include #include #include #include #include #include -#include #include #include #include #include +#include "../clp/ffi/ir_stream/Serializer.hpp" #include "../clp/GlobalMySQLMetadataDB.hpp" #include "../clp/streaming_archive/ArchiveMetadata.hpp" #include "../reducer/network_utils.hpp" #include "CommandLineArguments.hpp" #include "Defs.hpp" +#include "FileWriter.hpp" #include "JsonConstructor.hpp" #include "JsonParser.hpp" 
#include "ReaderUtils.hpp" @@ -34,16 +36,14 @@ #include "TimestampPattern.hpp" #include "TraceableException.hpp" #include "Utils.hpp" -#include "FileWriter.hpp" #include "ZstdCompressor.hpp" -#include "../clp/ffi/ir_stream/Serializer.hpp" using namespace clp_s::search; +using clp::ffi::ir_stream::Serializer; using clp_s::cArchiveFormatDevelopmentVersionFlag; using clp_s::cEpochTimeMax; using clp_s::cEpochTimeMin; using clp_s::CommandLineArguments; -using clp::ffi::ir_stream::Serializer; namespace { /** @@ -155,10 +155,10 @@ auto unpack_and_serialize_msgpack_bytes( } template -auto run_serializer(clp_s::JsonToIRParserOption option, std::string path){ - //std::cout << "Running Serializer\n"; +auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) { + // std::cout << "Running Serializer\n"; auto result{Serializer::create()}; - if (result.has_error()){ + if (result.has_error()) { SPDLOG_ERROR("Failed to create Serializer"); return false; } @@ -167,71 +167,67 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path){ flush_and_clear_serializer_buffer(serializer, ir_buf); std::ifstream inFile; - inFile.open(path, std::ifstream::in); - //std::cout << "Opened Input file\n"; + inFile.open(path, std::ifstream::in); + // std::cout << "Opened Input file\n"; std::string outPath = ""; int index = path.find_last_of('/'); - if(std::string::npos == index){ + if (std::string::npos == index) { outPath = option.irs_dir + "/" + path + ".ir"; - }else{ - outPath = option.irs_dir + "/" + path.substr(index, path.length()-index) + ".ir"; + } else { + outPath = option.irs_dir + "/" + path.substr(index, path.length() - index) + ".ir"; } clp_s::FileWriter outFile; - //std::cout << outPath << "\n"; + // std::cout << outPath << "\n"; outFile.open(outPath, clp_s::FileWriter::OpenMode::CreateForWriting); clp_s::ZstdCompressor zc; zc.open(outFile, option.compression_level); - std::string line; + std::string line; size_t totalSize = 0; - if (inFile.is_open()) { 
- while (getline(inFile, line)) { - auto j_obj = nlohmann::json::parse(line); - unpack_and_serialize_msgpack_bytes(nlohmann::json::to_msgpack(j_obj), serializer); - flush_and_clear_serializer_buffer(serializer, ir_buf); - if(ir_buf.size() >= 1000000000){ - totalSize = totalSize + ir_buf.size(); - zc.write(reinterpret_cast(ir_buf.data()), ir_buf.size()); - zc.flush(); - ir_buf.clear(); - } + if (inFile.is_open()) { + while (getline(inFile, line)) { + auto j_obj = nlohmann::json::parse(line); + unpack_and_serialize_msgpack_bytes(nlohmann::json::to_msgpack(j_obj), serializer); + flush_and_clear_serializer_buffer(serializer, ir_buf); + if (ir_buf.size() >= 1'000'000'000) { + totalSize = totalSize + ir_buf.size(); + zc.write(reinterpret_cast(ir_buf.data()), ir_buf.size()); + zc.flush(); + ir_buf.clear(); + } } - totalSize = totalSize + ir_buf.size(); + totalSize = totalSize + ir_buf.size(); zc.write(reinterpret_cast(ir_buf.data()), ir_buf.size()); zc.flush(); ir_buf.clear(); - inFile.close(); + inFile.close(); zc.close(); outFile.close(); - } + } return true; } -bool generate_IR(CommandLineArguments const& command_line_arguments){ +bool generate_IR(CommandLineArguments const& command_line_arguments) { auto irs_dir = std::filesystem::path(command_line_arguments.get_archives_dir()); // Create output directory in case it doesn't exist try { std::filesystem::create_directory(irs_dir.string()); } catch (std::exception& e) { - SPDLOG_ERROR( - "Failed to create archives directory {} - {}", - irs_dir.string(), - e.what() - ); + SPDLOG_ERROR("Failed to create archives directory {} - {}", irs_dir.string(), e.what()); return false; } clp_s::JsonToIRParserOption option{}; option.file_paths = command_line_arguments.get_file_paths(); option.irs_dir = irs_dir.string(); - //std::cout << "IRs dir: " << option.irs_dir << std::endl; + // std::cout << "IRs dir: " << option.irs_dir << std::endl; option.max_document_size = command_line_arguments.get_max_document_size(); 
option.compression_level = command_line_arguments.get_compression_level(); option.encoding = command_line_arguments.get_encoding_type(); - //std::cout << "encoding type: " << static_cast(option.encoding) << std::endl; + // std::cout << "encoding type: " << static_cast(option.encoding) << std::endl; if (false == clp_s::FileUtils::validate_path(option.file_paths)) { exit(1); @@ -244,14 +240,14 @@ bool generate_IR(CommandLineArguments const& command_line_arguments){ for (auto& path : all_file_paths) { bool success; - if (option.encoding == 4){ - //std::cout << "four byte\n"; + if (option.encoding == 4) { + // std::cout << "four byte\n"; success = run_serializer(option, path); - }else{ - //std::cout << "eight byte\n"; + } else { + // std::cout << "eight byte\n"; success = run_serializer(option, path); } - if (false == success){ + if (false == success) { return false; } } @@ -277,13 +273,14 @@ bool IR_compress(CommandLineArguments const& command_line_arguments) { option.file_paths = command_line_arguments.get_file_paths(); option.archives_dir = archives_dir.string(); option.target_encoded_size = command_line_arguments.get_target_encoded_size(); - //Do I need max_document_size() + // Do I need max_document_size() option.max_document_size = command_line_arguments.get_max_document_size(); option.compression_level = command_line_arguments.get_compression_level(); option.timestamp_key = command_line_arguments.get_timestamp_key(); option.print_archive_stats = command_line_arguments.print_archive_stats(); - //Is this an option they can make after IR or is that made before and has to be what is in the IR stream already - //option.structurize_arrays = command_line_arguments.get_structurize_arrays(); + // Is this an option they can make after IR or is that made before and has to be what is in the + // IR stream already option.structurize_arrays = + // command_line_arguments.get_structurize_arrays(); auto const& db_config_container = 
command_line_arguments.get_metadata_db_config(); if (db_config_container.has_value()) { @@ -462,7 +459,7 @@ int main(int argc, char const* argv[]) { if (false == generate_IR(command_line_arguments)) { return 1; } - }else if (CommandLineArguments::Command::Extract == command_line_arguments.get_command()) { + } else if (CommandLineArguments::Command::Extract == command_line_arguments.get_command()) { auto const& archives_dir = command_line_arguments.get_archives_dir(); if (false == std::filesystem::is_directory(archives_dir)) { SPDLOG_ERROR("'{}' is not a directory.", archives_dir); From 4dce1607d8dd3a4098153167061f85314815e1b4 Mon Sep 17 00:00:00 2001 From: AVMatthews Date: Sat, 21 Sep 2024 07:16:01 -0400 Subject: [PATCH 04/15] small linting fix --- components/core/src/clp_s/CommandLineArguments.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index 0435cd3f6..530dad3fb 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -352,8 +352,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { std::cerr << "Examples:" << std::endl; std::cerr << " # Compress file1.ir and dir1 into archives-dir" << std::endl; - std::cerr << " " << m_program_name << " i archives-dir file1.ir dir1" - << std::endl; + std::cerr << " " << m_program_name << " i archives-dir file1.ir dir1" << std::endl; po::options_description visible_options; visible_options.add(general_options); @@ -471,8 +470,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { std::cerr << "Examples:" << std::endl; std::cerr << " # Parse file1.json and dir1 into irs-dir" << std::endl; - std::cerr << " " << m_program_name << " r irs-dir file1.json dir1" - << std::endl; + std::cerr << " " << m_program_name << " r irs-dir file1.json dir1" << std::endl; po::options_description 
visible_options; visible_options.add(general_options); @@ -1042,8 +1040,7 @@ void CommandLineArguments::print_search_usage() const { } void CommandLineArguments::print_json_to_IR_usage() const { - std::cerr << "Usage: " << m_program_name << " r [OPTIONS] IRS_DIR [FILE/DIR ...]" - << std::endl; + std::cerr << "Usage: " << m_program_name << " r [OPTIONS] IRS_DIR [FILE/DIR ...]" << std::endl; } void CommandLineArguments::print_IR_compression_usage() const { From 424cb857255edf28db7a57a80c144c8b9fa11ad8 Mon Sep 17 00:00:00 2001 From: AVMatthews Date: Mon, 23 Sep 2024 13:40:38 -0400 Subject: [PATCH 05/15] modified IR file I/O to use Decompressor --- components/core/src/clp_s/CMakeLists.txt | 6 +++++ components/core/src/clp_s/JsonParser.cpp | 33 +++++------------------- components/core/src/clp_s/JsonParser.hpp | 1 + 3 files changed, 13 insertions(+), 27 deletions(-) diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt index 7b684632c..81ea74f70 100644 --- a/components/core/src/clp_s/CMakeLists.txt +++ b/components/core/src/clp_s/CMakeLists.txt @@ -63,6 +63,12 @@ set( ../clp/ffi/utils.cpp ../clp/utf8_utils.hpp ../clp/utf8_utils.cpp + ../clp/streaming_compression/zstd/Decompressor.hpp + ../clp/streaming_compression/zstd/Decompressor.cpp + ../clp/ReadOnlyMemoryMappedFile.hpp + ../clp/ReadOnlyMemoryMappedFile.cpp + ../clp/FileDescriptor.hpp + ../clp/FileDescriptor.cpp ) set( diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index 6647d3d7a..95e788e6d 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -760,38 +760,15 @@ bool JsonParser::parse_from_IR() { m_archive_writer->add_node(-1, NodeType::Unknown, "root"); for (auto& file_path : m_file_paths) { - std::cout << file_path << std::endl; - std::vector ir_buf; - char temp_ir_buf[10'000]; - FileReader infile; - infile.open(file_path); - if (false == infile.is_open()) { - 
m_archive_writer->close(); - return false; - } int fsize = std::filesystem::file_size(file_path); if (0 == fsize) { m_archive_writer->close(); return false; } - ZstdDecompressor zd; - zd.open(infile, fsize); - size_t num_bytes_read = 0; - do { - num_bytes_read = 0; - zd.try_read(temp_ir_buf, 10'000, num_bytes_read); - if (num_bytes_read != 0) { - ir_buf.insert(ir_buf.end(), temp_ir_buf, temp_ir_buf + num_bytes_read); - } - } while (num_bytes_read == 10'000); - zd.close(); - infile.close(); + clp::streaming_compression::zstd::Decompressor zd; + zd.open(file_path); - BufferReader reader{size_checked_pointer_cast(ir_buf.data()), ir_buf.size()}; - char const* p; - size_t p_size; - - auto deserializer_result = Deserializer::create(reader); + auto deserializer_result = Deserializer::create(zd); if (deserializer_result.has_error()) { m_archive_writer->close(); return false; @@ -800,7 +777,7 @@ bool JsonParser::parse_from_IR() { m_num_messages = 0; do { - auto const kv_log_event_result = deserializer.deserialize_to_next_log_event(reader); + auto const kv_log_event_result = deserializer.deserialize_to_next_log_event(zd); if (kv_log_event_result.has_error()) { if (kv_log_event_result.error() == std::errc::no_message_available @@ -828,6 +805,8 @@ bool JsonParser::parse_from_IR() { } while (1); id_conversion_cache.clear(); + zd.close(); + //infile.close(); } return true; } diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index 1e1aa96fa..62c54df8d 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -31,6 +31,7 @@ #include "TimestampDictionaryWriter.hpp" #include "Utils.hpp" #include "ZstdCompressor.hpp" +#include "../clp/streaming_compression/zstd/Decompressor.hpp" using clp::BufferReader; using clp::ffi::ir_stream::Deserializer; From 1899c619bd1ceb8e1e9476c142bdd4d356fea2af Mon Sep 17 00:00:00 2001 From: Abigail Matthews Date: Tue, 24 Sep 2024 16:20:14 -0400 Subject: [PATCH 
06/15] updates from first round of review, and linting --- components/core/CMakeLists.txt | 6 - components/core/src/clp_s/CMakeLists.txt | 80 ++++----- components/core/src/clp_s/JsonParser.cpp | 199 ++++++++--------------- components/core/src/clp_s/JsonParser.hpp | 2 +- components/core/src/clp_s/clp-s.cpp | 41 ++--- 5 files changed, 123 insertions(+), 205 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 9ff3527ae..a7c6d5a90 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -278,12 +278,6 @@ set(SOURCE_FILES_clp_s_unitTest src/clp_s/TimestampPattern.hpp src/clp_s/Utils.cpp src/clp_s/Utils.hpp - src/clp_s/ZstdCompressor.hpp - src/clp_s/ZstdCompressor.cpp - src/clp_s/ZstdDecompressor.hpp - src/clp_s/ZstdDecompressor.cpp - src/clp_s/FileWriter.cpp - src/clp_s/FileReader.cpp ) set(SOURCE_FILES_unitTest diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt index 81ea74f70..85a93afbb 100644 --- a/components/core/src/clp_s/CMakeLists.txt +++ b/components/core/src/clp_s/CMakeLists.txt @@ -2,12 +2,13 @@ add_subdirectory(search/kql) set( CLP_SOURCES - ../clp/cli_utils.cpp - ../clp/cli_utils.hpp - ../clp/database_utils.cpp - ../clp/database_utils.hpp + ../clp/BufferReader.cpp + ../clp/BufferReader.hpp ../clp/Defs.h ../clp/ErrorCode.hpp + ../clp/ErrorCode.hpp + ../clp/FileDescriptor.cpp + ../clp/FileDescriptor.hpp ../clp/GlobalMetadataDB.hpp ../clp/GlobalMetadataDBConfig.cpp ../clp/GlobalMetadataDBConfig.hpp @@ -19,56 +20,55 @@ set( ../clp/MySQLParamBindings.hpp ../clp/MySQLPreparedStatement.cpp ../clp/MySQLPreparedStatement.hpp - ../clp/networking/socket_utils.cpp - ../clp/networking/socket_utils.hpp + ../clp/ReadOnlyMemoryMappedFile.cpp + ../clp/ReadOnlyMemoryMappedFile.hpp + ../clp/ReaderInterface.cpp ../clp/ReaderInterface.cpp ../clp/ReaderInterface.hpp - ../clp/streaming_archive/ArchiveMetadata.cpp - ../clp/streaming_archive/ArchiveMetadata.hpp + 
../clp/ReaderInterface.hpp ../clp/TraceableException.hpp ../clp/WriterInterface.cpp ../clp/WriterInterface.hpp - ../clp/ffi/ir_stream/Deserializer.hpp - ../clp/ffi/ir_stream/Deserializer.cpp - ../clp/ffi/ir_stream/Serializer.hpp - ../clp/ffi/ir_stream/Serializer.cpp - ../clp/ffi/ir_stream/utils.hpp - ../clp/ffi/ir_stream/utils.cpp - ../clp/BufferReader.hpp - ../clp/BufferReader.cpp - ../clp/type_utils.hpp - ../clp/ffi/Value.hpp - ../clp/ErrorCode.hpp - ../clp/ir/EncodedTextAst.hpp - ../clp/ir/EncodedTextAst.cpp - ../clp/ir/types.hpp - ../clp/ReaderInterface.hpp - ../clp/ReaderInterface.cpp - ../clp/time_types.hpp - ../clp/type_utils.hpp - ../clp/ffi/KeyValuePairLogEvent.hpp + ../clp/cli_utils.cpp + ../clp/cli_utils.hpp + ../clp/database_utils.cpp + ../clp/database_utils.hpp ../clp/ffi/KeyValuePairLogEvent.cpp - ../clp/ffi/SchemaTree.hpp + ../clp/ffi/KeyValuePairLogEvent.hpp ../clp/ffi/SchemaTree.cpp + ../clp/ffi/SchemaTree.hpp ../clp/ffi/SchemaTreeNode.hpp ../clp/ffi/Value.hpp - ../clp/ffi/ir_stream/decoding_methods.hpp + ../clp/ffi/Value.hpp + ../clp/ffi/ir_stream/Deserializer.cpp + ../clp/ffi/ir_stream/Deserializer.hpp + ../clp/ffi/ir_stream/Serializer.cpp + ../clp/ffi/ir_stream/Serializer.hpp ../clp/ffi/ir_stream/decoding_methods.cpp - ../clp/ffi/ir_stream/encoding_methods.hpp + ../clp/ffi/ir_stream/decoding_methods.hpp ../clp/ffi/ir_stream/encoding_methods.cpp - ../clp/ir/parsing.hpp - ../clp/ir/parsing.cpp + ../clp/ffi/ir_stream/encoding_methods.hpp ../clp/ffi/ir_stream/protocol_constants.hpp - ../clp/ffi/utils.hpp + ../clp/ffi/ir_stream/utils.cpp + ../clp/ffi/ir_stream/utils.hpp ../clp/ffi/utils.cpp - ../clp/utf8_utils.hpp - ../clp/utf8_utils.cpp - ../clp/streaming_compression/zstd/Decompressor.hpp + ../clp/ffi/utils.hpp + ../clp/ir/EncodedTextAst.cpp + ../clp/ir/EncodedTextAst.hpp + ../clp/ir/parsing.cpp + ../clp/ir/parsing.hpp + ../clp/ir/types.hpp + ../clp/networking/socket_utils.cpp + ../clp/networking/socket_utils.hpp + 
../clp/streaming_archive/ArchiveMetadata.cpp + ../clp/streaming_archive/ArchiveMetadata.hpp ../clp/streaming_compression/zstd/Decompressor.cpp - ../clp/ReadOnlyMemoryMappedFile.hpp - ../clp/ReadOnlyMemoryMappedFile.cpp - ../clp/FileDescriptor.hpp - ../clp/FileDescriptor.cpp + ../clp/streaming_compression/zstd/Decompressor.hpp + ../clp/time_types.hpp + ../clp/type_utils.hpp + ../clp/type_utils.hpp + ../clp/utf8_utils.cpp + ../clp/utf8_utils.hpp ) set( diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index 95e788e6d..ffa0d840b 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -1,6 +1,5 @@ #include "JsonParser.hpp" -#include #include #include @@ -522,170 +521,109 @@ bool JsonParser::parse() { } NodeType get_archive_node_type( - clp::ffi::SchemaTreeNode const& node, - std::pair> p + clp::ffi::SchemaTreeNode::Type ir_node_type, + bool node_has_value, + std::optional const& node_value ) { - auto const node_type = node.get_type(); // figure out what type the node is in archive node type - NodeType archiveNodeType; - switch (node_type) { + NodeType archive_node_type; + switch (ir_node_type) { case clp::ffi::SchemaTreeNode::Type::Int: - archiveNodeType = NodeType::Integer; + archive_node_type = NodeType::Integer; break; case clp::ffi::SchemaTreeNode::Type::Float: - archiveNodeType = NodeType::Float; + archive_node_type = NodeType::Float; break; case clp::ffi::SchemaTreeNode::Type::Bool: - archiveNodeType = NodeType::Boolean; + archive_node_type = NodeType::Boolean; break; case clp::ffi::SchemaTreeNode::Type::UnstructuredArray: - archiveNodeType = NodeType::UnstructuredArray; + archive_node_type = NodeType::UnstructuredArray; break; case clp::ffi::SchemaTreeNode::Type::Str: - // std::cerr << "In str\n"; - if (p.second.value().is()) { - // maybe special case for date string - archiveNodeType = NodeType::VarString; + if (node_value->is()) { + archive_node_type = 
NodeType::VarString; } else { - archiveNodeType = NodeType::ClpString; + archive_node_type = NodeType::ClpString; } break; case clp::ffi::SchemaTreeNode::Type::Obj: - // std::cerr << "In obj\n"; - if (p.second.has_value()) { - if (p.second.value().is_null()) { - // std::cout << "Found Null\n"; - archiveNodeType = NodeType::NullValue; + if (node_has_value) { + if (node_value->is_null()) { + archive_node_type = NodeType::NullValue; } else { - archiveNodeType = NodeType::Object; + archive_node_type = NodeType::Object; } } else { - archiveNodeType = NodeType::Object; + archive_node_type = NodeType::Object; } break; default: - archiveNodeType = NodeType::Unknown; + archive_node_type = NodeType::Unknown; break; } - return archiveNodeType; + return archive_node_type; } // int JsonParser::get_archive_node_id( - std::map, int>& cache, - int irNodeID, - NodeType archiveNodeType, - clp::ffi::SchemaTree const& irTree + std::map, int32_t>& ir_node_to_archive_node_map, + int ir_node_id, + NodeType archive_node_type, + clp::ffi::SchemaTree const& ir_tree ) { - std::tuple key(irNodeID, archiveNodeType); - if (cache.find(key) != cache.end()) { - return cache[key]; + auto key = std::make_tuple(ir_node_id, archive_node_type); + auto map_location = ir_node_to_archive_node_map.find(key); + if (ir_node_to_archive_node_map.end() != map_location) { + return map_location->second; } - auto& currNode = irTree.get_node(irNodeID); - int parent_node_id; - // Found the root - if (currNode.get_parent_id() == 0) { - parent_node_id = 0; - } else { - parent_node_id - = get_archive_node_id(cache, currNode.get_parent_id(), NodeType::Object, irTree); + auto& curr_node = ir_tree.get_node(ir_node_id); + int32_t parent_node_id{0}; + if (0 != curr_node.get_parent_id()) { + parent_node_id = get_archive_node_id( + ir_node_to_archive_node_map, + curr_node.get_parent_id(), + NodeType::Object, + ir_tree + ); } - std::string nodeKey - = clp::ffi::validate_and_escape_utf8_string(currNode.get_key_name()).value(); 
- int curr_node_archive_id = m_archive_writer->add_node(parent_node_id, archiveNodeType, nodeKey); - cache[key] = curr_node_archive_id; - return curr_node_archive_id; -} - -void print_kv_log_event(KeyValuePairLogEvent const& kv) { - auto const num_kv_pairs = kv.get_node_id_value_pairs().size(); - std::cout << "number of kv pairs: " << num_kv_pairs << std::endl; - auto const& tree = kv.get_schema_tree(); - for (auto const& pair : kv.get_node_id_value_pairs()) { - auto const& tree_node = tree.get_node(pair.first); - auto const node_type = tree_node.get_type(); - switch (node_type) { - case clp::ffi::SchemaTreeNode::Type::Int: - std::cout << "Int" << std::endl; - break; - case clp::ffi::SchemaTreeNode::Type::Float: - std::cout << "Float" << std::endl; - break; - case clp::ffi::SchemaTreeNode::Type::Bool: - std::cout << "Bool" << std::endl; - break; - case clp::ffi::SchemaTreeNode::Type::Str: - std::cout << "Str" << std::endl; - break; - case clp::ffi::SchemaTreeNode::Type::UnstructuredArray: - std::cout << "UArray" << std::endl; - break; - case clp::ffi::SchemaTreeNode::Type::Obj: - std::cout << "Obj" << std::endl; - break; - default: - std::cout << "???" << std::endl; - break; - } - - if (!pair.second.has_value()) { - std::cout << "{??:\t" << pair.first << ": Node doesn't have Value ... 
EMPTY OBJ}\n"; - continue; - } - if (pair.second.value().is()) { - std::cout << "{INT:\t" << pair.first << ": " - << pair.second.value().get_immutable_view() << "}\n"; - } else if (pair.second.value().is()) { - std::cout << "{FLOAT:\t" << pair.first << ": " - << pair.second.value().get_immutable_view() << "}\n"; - } else if (pair.second.value().is()) { - std::cout << "{BOOL:\t" << pair.first << ": " - << pair.second.value().get_immutable_view() << "}\n"; - } else if (pair.second.value().is()) { - std::cout << "{STRING:\t" << pair.first << ": " - << pair.second.value().get_immutable_view() << "}\n"; - } else if (pair.second.value().is()) { - std::cout << "{EIGHTByte:\t" << pair.first << ": \n"; - auto decoded = pair.second.value() - .get_immutable_view() - .decode_and_unparse(); - if (std::nullopt != decoded) { - std::cout << "\t Decoded & Unparsed: " << decoded.value() << std::endl; - } else { - std::cout << "\tNULL\n"; - } - std::cout << "}\n"; - } else if (pair.second.value().is()) { - std::cout << "{FOURByte:\t" << pair.first << ": \n"; - auto decoded = pair.second.value() - .get_immutable_view() - .decode_and_unparse(); - if (std::nullopt != decoded) { - std::cout << "\tDecoded & Unparsed: " << decoded.value() << std::endl; - } else { - std::cout << "\tNULL\n"; - } - std::cout << "}\n"; - } else { - std::cout << "Unknown Type:\t" << pair.first << "\n"; - } + auto validated_escaped_key + = clp::ffi::validate_and_escape_utf8_string(curr_node.get_key_name()); + std::string node_key = ""; + if (validated_escaped_key.has_value()) { + node_key = validated_escaped_key.value(); } - std::cout << "after for loop\n\n\n"; + int curr_node_archive_id + = m_archive_writer->add_node(parent_node_id, archive_node_type, node_key); + ir_node_to_archive_node_map.emplace(std::move(key), curr_node_archive_id); + return curr_node_archive_id; } void JsonParser::parse_kv_log_event( KeyValuePairLogEvent const& kv, - std::map, int>& cache + std::map, int32_t>& ir_node_to_archive_node_map 
) { - auto const num_kv_pairs = kv.get_node_id_value_pairs().size(); clp::ffi::SchemaTree const& tree = kv.get_schema_tree(); for (auto const& pair : kv.get_node_id_value_pairs()) { clp::ffi::SchemaTreeNode const& tree_node = tree.get_node(pair.first); - NodeType archiveNodeType = get_archive_node_type(tree_node, pair); - int node_id = get_archive_node_id(cache, pair.first, archiveNodeType, tree); + clp::ffi::SchemaTreeNode::Type ir_node_type = tree_node.get_type(); + bool node_has_value = pair.second.has_value(); + NodeType archive_node_type = NodeType::Unknown; + if (node_has_value) { + archive_node_type + = get_archive_node_type(ir_node_type, node_has_value, pair.second.value()); + } else { + archive_node_type = get_archive_node_type(ir_node_type, node_has_value, {}); + } + int node_id = get_archive_node_id( + ir_node_to_archive_node_map, + pair.first, + archive_node_type, + tree + ); - switch (archiveNodeType) { + switch (archive_node_type) { case NodeType::Integer: { int64_t i64_value = pair.second.value().get_immutable_view(); m_current_parsed_message.add_value(node_id, i64_value); @@ -756,7 +694,7 @@ void JsonParser::parse_kv_log_event( } bool JsonParser::parse_from_IR() { - std::map, int> id_conversion_cache; + std::map, int32_t> ir_node_to_archive_node_map; m_archive_writer->add_node(-1, NodeType::Unknown, "root"); for (auto& file_path : m_file_paths) { @@ -790,23 +728,20 @@ bool JsonParser::parse_from_IR() { m_current_schema.clear(); auto const& kv_log_event = kv_log_event_result.value(); - // print_kv_log_event(kv_log_event); - parse_kv_log_event(kv_log_event, id_conversion_cache); + parse_kv_log_event(kv_log_event, ir_node_to_archive_node_map); m_num_messages++; if (m_archive_writer->get_data_size() >= m_target_encoded_size) { - std::cerr << "Splitting Archive\n\n"; - id_conversion_cache.clear(); + ir_node_to_archive_node_map.clear(); m_archive_writer->add_node(-1, NodeType::Unknown, "root"); split_archive(); } m_current_parsed_message.clear(); - } 
while (1); - id_conversion_cache.clear(); + } while (true); + ir_node_to_archive_node_map.clear(); zd.close(); - //infile.close(); } return true; } diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index 62c54df8d..46538c176 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -18,6 +18,7 @@ #include "../clp/ffi/Value.hpp" #include "../clp/GlobalMySQLMetadataDB.hpp" #include "../clp/ir/types.hpp" +#include "../clp/streaming_compression/zstd/Decompressor.hpp" #include "../clp/type_utils.hpp" #include "ArchiveWriter.hpp" #include "DictionaryWriter.hpp" @@ -31,7 +32,6 @@ #include "TimestampDictionaryWriter.hpp" #include "Utils.hpp" #include "ZstdCompressor.hpp" -#include "../clp/streaming_compression/zstd/Decompressor.hpp" using clp::BufferReader; using clp::ffi::ir_stream::Deserializer; diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index 911e92c1d..813f314df 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -156,7 +156,6 @@ auto unpack_and_serialize_msgpack_bytes( template auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) { - // std::cout << "Running Serializer\n"; auto result{Serializer::create()}; if (result.has_error()) { SPDLOG_ERROR("Failed to create Serializer"); @@ -166,45 +165,43 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) { std::vector ir_buf; flush_and_clear_serializer_buffer(serializer, ir_buf); - std::ifstream inFile; - inFile.open(path, std::ifstream::in); - // std::cout << "Opened Input file\n"; + std::ifstream in_file; + in_file.open(path, std::ifstream::in); - std::string outPath = ""; + std::string out_path = ""; int index = path.find_last_of('/'); if (std::string::npos == index) { - outPath = option.irs_dir + "/" + path + ".ir"; + out_path = option.irs_dir + "/" + path + ".ir"; } else { - outPath = option.irs_dir 
+ "/" + path.substr(index, path.length() - index) + ".ir"; + out_path = option.irs_dir + "/" + path.substr(index, path.length() - index) + ".ir"; } - clp_s::FileWriter outFile; - // std::cout << outPath << "\n"; - outFile.open(outPath, clp_s::FileWriter::OpenMode::CreateForWriting); + clp_s::FileWriter out_file; + out_file.open(out_path, clp_s::FileWriter::OpenMode::CreateForWriting); clp_s::ZstdCompressor zc; - zc.open(outFile, option.compression_level); + zc.open(out_file, option.compression_level); std::string line; - size_t totalSize = 0; + size_t total_size = 0; - if (inFile.is_open()) { - while (getline(inFile, line)) { + if (in_file.is_open()) { + while (getline(in_file, line)) { auto j_obj = nlohmann::json::parse(line); unpack_and_serialize_msgpack_bytes(nlohmann::json::to_msgpack(j_obj), serializer); flush_and_clear_serializer_buffer(serializer, ir_buf); if (ir_buf.size() >= 1'000'000'000) { - totalSize = totalSize + ir_buf.size(); + total_size = total_size + ir_buf.size(); zc.write(reinterpret_cast(ir_buf.data()), ir_buf.size()); zc.flush(); ir_buf.clear(); } } - totalSize = totalSize + ir_buf.size(); + total_size = total_size + ir_buf.size(); zc.write(reinterpret_cast(ir_buf.data()), ir_buf.size()); zc.flush(); ir_buf.clear(); - inFile.close(); + in_file.close(); zc.close(); - outFile.close(); + out_file.close(); } return true; @@ -223,11 +220,9 @@ bool generate_IR(CommandLineArguments const& command_line_arguments) { clp_s::JsonToIRParserOption option{}; option.file_paths = command_line_arguments.get_file_paths(); option.irs_dir = irs_dir.string(); - // std::cout << "IRs dir: " << option.irs_dir << std::endl; option.max_document_size = command_line_arguments.get_max_document_size(); option.compression_level = command_line_arguments.get_compression_level(); option.encoding = command_line_arguments.get_encoding_type(); - // std::cout << "encoding type: " << static_cast(option.encoding) << std::endl; if (false == 
clp_s::FileUtils::validate_path(option.file_paths)) { exit(1); @@ -241,10 +236,8 @@ bool generate_IR(CommandLineArguments const& command_line_arguments) { for (auto& path : all_file_paths) { bool success; if (option.encoding == 4) { - // std::cout << "four byte\n"; success = run_serializer(option, path); } else { - // std::cout << "eight byte\n"; success = run_serializer(option, path); } if (false == success) { @@ -273,14 +266,10 @@ bool IR_compress(CommandLineArguments const& command_line_arguments) { option.file_paths = command_line_arguments.get_file_paths(); option.archives_dir = archives_dir.string(); option.target_encoded_size = command_line_arguments.get_target_encoded_size(); - // Do I need max_document_size() option.max_document_size = command_line_arguments.get_max_document_size(); option.compression_level = command_line_arguments.get_compression_level(); option.timestamp_key = command_line_arguments.get_timestamp_key(); option.print_archive_stats = command_line_arguments.print_archive_stats(); - // Is this an option they can make after IR or is that made before and has to be what is in the - // IR stream already option.structurize_arrays = - // command_line_arguments.get_structurize_arrays(); auto const& db_config_container = command_line_arguments.get_metadata_db_config(); if (db_config_container.has_value()) { From 2a1b9291f1683b6b0251e56c8b495c1cfc63c0d5 Mon Sep 17 00:00:00 2001 From: Abigail Matthews Date: Wed, 25 Sep 2024 17:27:22 -0400 Subject: [PATCH 07/15] remove implicit root add --- components/core/src/clp_s/JsonParser.cpp | 8 ++++---- components/core/src/clp_s/JsonParser.hpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index ffa0d840b..2bc652ccc 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -578,8 +578,8 @@ int JsonParser::get_archive_node_id( return map_location->second; } auto& 
curr_node = ir_tree.get_node(ir_node_id); - int32_t parent_node_id{0}; - if (0 != curr_node.get_parent_id()) { + int32_t parent_node_id{-1}; + if (ir_node_id != curr_node.get_parent_id()) { parent_node_id = get_archive_node_id( ir_node_to_archive_node_map, curr_node.get_parent_id(), @@ -695,7 +695,7 @@ void JsonParser::parse_kv_log_event( bool JsonParser::parse_from_IR() { std::map, int32_t> ir_node_to_archive_node_map; - m_archive_writer->add_node(-1, NodeType::Unknown, "root"); + //m_archive_writer->add_node(-1, NodeType::Unknown, "root"); for (auto& file_path : m_file_paths) { int fsize = std::filesystem::file_size(file_path); @@ -733,7 +733,7 @@ bool JsonParser::parse_from_IR() { m_num_messages++; if (m_archive_writer->get_data_size() >= m_target_encoded_size) { ir_node_to_archive_node_map.clear(); - m_archive_writer->add_node(-1, NodeType::Unknown, "root"); + //m_archive_writer->add_node(-1, NodeType::Unknown, "root"); split_archive(); } diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index 46538c176..b48d1bbcd 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -86,7 +86,7 @@ class JsonParser { /** * Parses the Key Value IR Stream and stores the data in the archive. 
- * @return whether the IR Stream was parsed succesfully + * @return whether the IR Stream was parsed successfully */ [[nodiscard]] bool parse_from_IR(); From f5005ecb53f0f758a121e805757ccd986ed48618 Mon Sep 17 00:00:00 2001 From: Abigail Matthews Date: Fri, 27 Sep 2024 19:24:26 -0400 Subject: [PATCH 08/15] Added more error handling and handled most of the remaining comments --- components/core/src/clp_s/CMakeLists.txt | 4 -- components/core/src/clp_s/JsonParser.cpp | 83 +++++++++++++++--------- components/core/src/clp_s/clp-s.cpp | 58 ++++++++++++----- 3 files changed, 93 insertions(+), 52 deletions(-) diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt index 85a93afbb..477736362 100644 --- a/components/core/src/clp_s/CMakeLists.txt +++ b/components/core/src/clp_s/CMakeLists.txt @@ -23,8 +23,6 @@ set( ../clp/ReadOnlyMemoryMappedFile.cpp ../clp/ReadOnlyMemoryMappedFile.hpp ../clp/ReaderInterface.cpp - ../clp/ReaderInterface.cpp - ../clp/ReaderInterface.hpp ../clp/ReaderInterface.hpp ../clp/TraceableException.hpp ../clp/WriterInterface.cpp @@ -39,7 +37,6 @@ set( ../clp/ffi/SchemaTree.hpp ../clp/ffi/SchemaTreeNode.hpp ../clp/ffi/Value.hpp - ../clp/ffi/Value.hpp ../clp/ffi/ir_stream/Deserializer.cpp ../clp/ffi/ir_stream/Deserializer.hpp ../clp/ffi/ir_stream/Serializer.cpp @@ -66,7 +63,6 @@ set( ../clp/streaming_compression/zstd/Decompressor.hpp ../clp/time_types.hpp ../clp/type_utils.hpp - ../clp/type_utils.hpp ../clp/utf8_utils.cpp ../clp/utf8_utils.hpp ) diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index 2bc652ccc..32e078f6f 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -541,7 +541,7 @@ NodeType get_archive_node_type( archive_node_type = NodeType::UnstructuredArray; break; case clp::ffi::SchemaTreeNode::Type::Str: - if (node_value->is()) { + if (node_value && node_value->is()) { archive_node_type = 
NodeType::VarString; } else { archive_node_type = NodeType::ClpString; @@ -592,6 +592,8 @@ int JsonParser::get_archive_node_id( std::string node_key = ""; if (validated_escaped_key.has_value()) { node_key = validated_escaped_key.value(); + } else { + throw "Key is not utf8 compliant"; } int curr_node_archive_id = m_archive_writer->add_node(parent_node_id, archive_node_type, node_key); @@ -616,12 +618,17 @@ void JsonParser::parse_kv_log_event( } else { archive_node_type = get_archive_node_type(ir_node_type, node_has_value, {}); } - int node_id = get_archive_node_id( - ir_node_to_archive_node_map, - pair.first, - archive_node_type, - tree - ); + int node_id; + try { + node_id = get_archive_node_id( + ir_node_to_archive_node_map, + pair.first, + archive_node_type, + tree + ); + } catch (...) { + throw; + } switch (archive_node_type) { case NodeType::Integer: { @@ -637,30 +644,38 @@ void JsonParser::parse_kv_log_event( m_current_parsed_message.add_value(node_id, b_value); } break; case NodeType::VarString: { - std::string str = clp::ffi::validate_and_escape_utf8_string( - pair.second.value().get_immutable_view() - ) - .value(); + auto validated_escaped_string = clp::ffi::validate_and_escape_utf8_string( + pair.second.value().get_immutable_view() + ); + std::string str = ""; + if (validated_escaped_string.has_value()) { + str = validated_escaped_string.value(); + } else { + throw "String is not utf8 compliant"; + } m_current_parsed_message.add_value(node_id, str); } break; case NodeType::ClpString: { - std::string encoded_str; + std::string encoded_str = ""; + std::string decodedValue = ""; if (pair.second.value().is()) { - std::string decodedValue - = pair.second.value() - .get_immutable_view() - .decode_and_unparse() - .value(); - encoded_str = clp::ffi::validate_and_escape_utf8_string(decodedValue.c_str()) - .value(); + decodedValue = pair.second.value() + .get_immutable_view() + .decode_and_unparse() + .value(); + } else { - std::string decodedValue - = 
pair.second.value() - .get_immutable_view() - .decode_and_unparse() - .value(); - encoded_str = clp::ffi::validate_and_escape_utf8_string(decodedValue.c_str()) - .value(); + decodedValue = pair.second.value() + .get_immutable_view() + .decode_and_unparse() + .value(); + } + auto validated_escaped_encoded_string + = clp::ffi::validate_and_escape_utf8_string(decodedValue.c_str()); + if (validated_escaped_encoded_string.has_value()) { + encoded_str = validated_escaped_encoded_string.value(); + } else { + throw "Encoded string is not utf8 compliant"; } m_current_parsed_message.add_value(node_id, encoded_str); } break; @@ -695,7 +710,6 @@ void JsonParser::parse_kv_log_event( bool JsonParser::parse_from_IR() { std::map, int32_t> ir_node_to_archive_node_map; - //m_archive_writer->add_node(-1, NodeType::Unknown, "root"); for (auto& file_path : m_file_paths) { int fsize = std::filesystem::file_size(file_path); @@ -727,13 +741,20 @@ bool JsonParser::parse_from_IR() { m_current_schema.clear(); auto const& kv_log_event = kv_log_event_result.value(); - - parse_kv_log_event(kv_log_event, ir_node_to_archive_node_map); - + try { + parse_kv_log_event(kv_log_event, ir_node_to_archive_node_map); + } catch (std::string msg) { + SPDLOG_ERROR("ERROR: {}" + msg); + zd.close(); + return false; + } catch (...) 
{ + SPDLOG_ERROR("ERROR: Encountered error while parsing a kv log event"); + zd.close(); + return false; + } m_num_messages++; if (m_archive_writer->get_data_size() >= m_target_encoded_size) { ir_node_to_archive_node_map.clear(); - //m_archive_writer->add_node(-1, NodeType::Unknown, "root"); split_archive(); } diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index 813f314df..a4bf62825 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -143,15 +143,20 @@ auto unpack_and_serialize_msgpack_bytes( std::vector const& msgpack_bytes, Serializer& serializer ) -> bool { - auto const msgpack_obj_handle{msgpack::unpack( - clp::size_checked_pointer_cast(msgpack_bytes.data()), - msgpack_bytes.size() - )}; - auto const msgpack_obj{msgpack_obj_handle.get()}; - if (msgpack::type::MAP != msgpack_obj.type) { + try { + auto const msgpack_obj_handle{msgpack::unpack( + clp::size_checked_pointer_cast(msgpack_bytes.data()), + msgpack_bytes.size() + )}; + auto const msgpack_obj{msgpack_obj_handle.get()}; + if (msgpack::type::MAP != msgpack_obj.type) { + return false; + } + return serializer.serialize_msgpack_map(msgpack_obj.via.map); + } catch (std::exception const& e) { + SPDLOG_ERROR("Failed to unpack msgpack bytes: {}", e.what()); return false; } - return serializer.serialize_msgpack_map(msgpack_obj.via.map); } template @@ -168,13 +173,17 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) { std::ifstream in_file; in_file.open(path, std::ifstream::in); - std::string out_path = ""; + /* std::string out_path = ""; int index = path.find_last_of('/'); if (std::string::npos == index) { out_path = option.irs_dir + "/" + path + ".ir"; } else { out_path = option.irs_dir + "/" + path.substr(index, path.length() - index) + ".ir"; - } + } */ + std::filesystem::path input_path{path}; + std::string filename = input_path.filename().string(); + std::string out_path = option.irs_dir + "/" + 
filename + ".ir"; + clp_s::FileWriter out_file; out_file.open(out_path, clp_s::FileWriter::OpenMode::CreateForWriting); clp_s::ZstdCompressor zc; @@ -185,14 +194,29 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) { if (in_file.is_open()) { while (getline(in_file, line)) { - auto j_obj = nlohmann::json::parse(line); - unpack_and_serialize_msgpack_bytes(nlohmann::json::to_msgpack(j_obj), serializer); - flush_and_clear_serializer_buffer(serializer, ir_buf); - if (ir_buf.size() >= 1'000'000'000) { - total_size = total_size + ir_buf.size(); - zc.write(reinterpret_cast(ir_buf.data()), ir_buf.size()); - zc.flush(); - ir_buf.clear(); + try { + auto j_obj = nlohmann::json::parse(line); + if (!unpack_and_serialize_msgpack_bytes( + nlohmann::json::to_msgpack(j_obj), + serializer + )) + { + SPDLOG_ERROR("Failed to serialize msgpack bytes for line: {}", line); + return false; + } + flush_and_clear_serializer_buffer(serializer, ir_buf); + if (ir_buf.size() >= 1'000'000'000) { + total_size = total_size + ir_buf.size(); + zc.write(reinterpret_cast(ir_buf.data()), ir_buf.size()); + zc.flush(); + ir_buf.clear(); + } + } catch (nlohmann::json::parse_error const& e) { + SPDLOG_ERROR("JSON parsing error: {}", e.what()); + return false; + } catch (std::exception const& e) { + SPDLOG_ERROR("Error during serialization: {}", e.what()); + return false; } } total_size = total_size + ir_buf.size(); From 5c2866881ae72c6e3911f26955ac71c654d223ed Mon Sep 17 00:00:00 2001 From: Abigail Matthews Date: Thu, 3 Oct 2024 10:31:23 -0400 Subject: [PATCH 09/15] modifications requested by coderabbitai --- components/core/src/clp_s/CMakeLists.txt | 1 - components/core/src/clp_s/clp-s.cpp | 13 +++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt index 477736362..f697db731 100644 --- a/components/core/src/clp_s/CMakeLists.txt +++ b/components/core/src/clp_s/CMakeLists.txt @@ 
-6,7 +6,6 @@ set( ../clp/BufferReader.hpp ../clp/Defs.h ../clp/ErrorCode.hpp - ../clp/ErrorCode.hpp ../clp/FileDescriptor.cpp ../clp/FileDescriptor.hpp ../clp/GlobalMetadataDB.hpp diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index a4bf62825..7bd1519a9 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -58,7 +58,7 @@ bool compress(CommandLineArguments const& command_line_arguments); * @param command_line_arguments * @return Whether compression was successful */ -bool IR_compress(CommandLineArguments const& command_line_arguments); +bool ir_compress(CommandLineArguments const& command_line_arguments); /** * Decompresses the archive specified by the given JsonConstructorOption. @@ -187,7 +187,12 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) { clp_s::FileWriter out_file; out_file.open(out_path, clp_s::FileWriter::OpenMode::CreateForWriting); clp_s::ZstdCompressor zc; - zc.open(out_file, option.compression_level); + try { + zc.open(out_file, option.compression_level); + } catch (clp_s::ZstdCompressor::OperationFailed& error) { + SPDLOG_ERROR("Failed to open ZSTDcompressor - {}", error.what()); + return false; + } std::string line; size_t total_size = 0; @@ -271,7 +276,7 @@ bool generate_IR(CommandLineArguments const& command_line_arguments) { return true; } -bool IR_compress(CommandLineArguments const& command_line_arguments) { +bool ir_compress(CommandLineArguments const& command_line_arguments) { auto archives_dir = std::filesystem::path(command_line_arguments.get_archives_dir()); // Create output directory in case it doesn't exist @@ -465,7 +470,7 @@ int main(int argc, char const* argv[]) { return 1; } } else if (CommandLineArguments::Command::IR_Compress == command_line_arguments.get_command()) { - if (false == IR_compress(command_line_arguments)) { + if (false == ir_compress(command_line_arguments)) { return 1; } } else if 
(CommandLineArguments::Command::Json_To_IR == command_line_arguments.get_command()) { From 2facdd0f99c45193a6fa2f22bf91702d8b746bb8 Mon Sep 17 00:00:00 2001 From: Abigail Matthews Date: Thu, 3 Oct 2024 15:00:42 -0400 Subject: [PATCH 10/15] main merged into branch --- components/core/src/clp_s/CMakeLists.txt | 2 ++ components/core/src/clp_s/clp-s.cpp | 7 ------- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt index f697db731..cca256489 100644 --- a/components/core/src/clp_s/CMakeLists.txt +++ b/components/core/src/clp_s/CMakeLists.txt @@ -44,6 +44,8 @@ set( ../clp/ffi/ir_stream/decoding_methods.hpp ../clp/ffi/ir_stream/encoding_methods.cpp ../clp/ffi/ir_stream/encoding_methods.hpp + ../clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp + ../clp/ffi/ir_stream/ir_unit_deserialization_methods.hpp ../clp/ffi/ir_stream/protocol_constants.hpp ../clp/ffi/ir_stream/utils.cpp ../clp/ffi/ir_stream/utils.hpp diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index 7bd1519a9..49bd148bb 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -173,13 +173,6 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) { std::ifstream in_file; in_file.open(path, std::ifstream::in); - /* std::string out_path = ""; - int index = path.find_last_of('/'); - if (std::string::npos == index) { - out_path = option.irs_dir + "/" + path + ".ir"; - } else { - out_path = option.irs_dir + "/" + path.substr(index, path.length() - index) + ".ir"; - } */ std::filesystem::path input_path{path}; std::string filename = input_path.filename().string(); std::string out_path = option.irs_dir + "/" + filename + ".ir"; From 303d4cbff313f1c52e8325fecc483318cb96da4c Mon Sep 17 00:00:00 2001 From: Abigail Matthews Date: Mon, 7 Oct 2024 15:28:57 -0400 Subject: [PATCH 11/15] map converted to unordered_map and various 
documentation, linting, and clang-tidying --- components/core/src/clp_s/CMakeLists.txt | 44 +++++------ .../core/src/clp_s/CommandLineArguments.cpp | 28 +++---- .../core/src/clp_s/CommandLineArguments.hpp | 8 +- components/core/src/clp_s/JsonParser.cpp | 78 ++++++++++++------- components/core/src/clp_s/JsonParser.hpp | 66 ++++++++-------- components/core/src/clp_s/clp-s.cpp | 59 +++++++++++--- 6 files changed, 172 insertions(+), 111 deletions(-) diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt index cca256489..948a9d701 100644 --- a/components/core/src/clp_s/CMakeLists.txt +++ b/components/core/src/clp_s/CMakeLists.txt @@ -4,32 +4,12 @@ set( CLP_SOURCES ../clp/BufferReader.cpp ../clp/BufferReader.hpp - ../clp/Defs.h - ../clp/ErrorCode.hpp - ../clp/FileDescriptor.cpp - ../clp/FileDescriptor.hpp - ../clp/GlobalMetadataDB.hpp - ../clp/GlobalMetadataDBConfig.cpp - ../clp/GlobalMetadataDBConfig.hpp - ../clp/GlobalMySQLMetadataDB.cpp - ../clp/GlobalMySQLMetadataDB.hpp - ../clp/MySQLDB.cpp - ../clp/MySQLDB.hpp - ../clp/MySQLParamBindings.cpp - ../clp/MySQLParamBindings.hpp - ../clp/MySQLPreparedStatement.cpp - ../clp/MySQLPreparedStatement.hpp - ../clp/ReadOnlyMemoryMappedFile.cpp - ../clp/ReadOnlyMemoryMappedFile.hpp - ../clp/ReaderInterface.cpp - ../clp/ReaderInterface.hpp - ../clp/TraceableException.hpp - ../clp/WriterInterface.cpp - ../clp/WriterInterface.hpp ../clp/cli_utils.cpp ../clp/cli_utils.hpp ../clp/database_utils.cpp ../clp/database_utils.hpp + ../clp/Defs.h + ../clp/ErrorCode.hpp ../clp/ffi/KeyValuePairLogEvent.cpp ../clp/ffi/KeyValuePairLogEvent.hpp ../clp/ffi/SchemaTree.cpp @@ -51,21 +31,41 @@ set( ../clp/ffi/ir_stream/utils.hpp ../clp/ffi/utils.cpp ../clp/ffi/utils.hpp + ../clp/FileDescriptor.cpp + ../clp/FileDescriptor.hpp + ../clp/GlobalMetadataDB.hpp + ../clp/GlobalMetadataDBConfig.cpp + ../clp/GlobalMetadataDBConfig.hpp + ../clp/GlobalMySQLMetadataDB.cpp + ../clp/GlobalMySQLMetadataDB.hpp 
../clp/ir/EncodedTextAst.cpp ../clp/ir/EncodedTextAst.hpp ../clp/ir/parsing.cpp ../clp/ir/parsing.hpp ../clp/ir/types.hpp + ../clp/MySQLDB.cpp + ../clp/MySQLDB.hpp + ../clp/MySQLParamBindings.cpp + ../clp/MySQLParamBindings.hpp + ../clp/MySQLPreparedStatement.cpp + ../clp/MySQLPreparedStatement.hpp ../clp/networking/socket_utils.cpp ../clp/networking/socket_utils.hpp + ../clp/ReadOnlyMemoryMappedFile.cpp + ../clp/ReadOnlyMemoryMappedFile.hpp + ../clp/ReaderInterface.cpp + ../clp/ReaderInterface.hpp ../clp/streaming_archive/ArchiveMetadata.cpp ../clp/streaming_archive/ArchiveMetadata.hpp ../clp/streaming_compression/zstd/Decompressor.cpp ../clp/streaming_compression/zstd/Decompressor.hpp ../clp/time_types.hpp + ../clp/TraceableException.hpp ../clp/type_utils.hpp ../clp/utf8_utils.cpp ../clp/utf8_utils.hpp + ../clp/WriterInterface.cpp + ../clp/WriterInterface.hpp ) set( diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index 530dad3fb..e4e8a837f 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -129,8 +129,8 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { case (char)Command::Compress: case (char)Command::Extract: case (char)Command::Search: - case (char)Command::Json_To_IR: - case (char)Command::IR_Compress: + case (char)Command::JsonToIr: + case (char)Command::IrCompress: m_command = (Command)command_input; break; default: @@ -270,7 +270,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { m_metadata_db_config = std::move(metadata_db_config); } - } else if (Command::IR_Compress == m_command) { + } else if (Command::IrCompress == m_command) { po::options_description compression_positional_options; // clang-format off compression_positional_options.add_options()( @@ -348,11 +348,11 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { 
po::notify(parsed_command_line_options); if (parsed_command_line_options.count("help")) { - print_IR_compression_usage(); + print_ir_compression_usage(); - std::cerr << "Examples:" << std::endl; - std::cerr << " # Compress file1.ir and dir1 into archives-dir" << std::endl; - std::cerr << " " << m_program_name << " i archives-dir file1.ir dir1" << std::endl; + std::cerr << "Examples:\n"; + std::cerr << " # Compress file1.ir and dir1 into archives-dir\n"; + std::cerr << " " << m_program_name << " i archives-dir file1.ir dir1\n"; po::options_description visible_options; visible_options.add(general_options); @@ -398,7 +398,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { m_metadata_db_config = std::move(metadata_db_config); } - } else if ((char)Command::Json_To_IR == command_input) { + } else if ((char)Command::JsonToIr == command_input) { po::options_description compression_positional_options; // clang-format off compression_positional_options.add_options()( @@ -466,11 +466,11 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { po::notify(parsed_command_line_options); if (parsed_command_line_options.count("help")) { - print_json_to_IR_usage(); + print_json_to_ir_usage(); - std::cerr << "Examples:" << std::endl; - std::cerr << " # Parse file1.json and dir1 into irs-dir" << std::endl; - std::cerr << " " << m_program_name << " r irs-dir file1.json dir1" << std::endl; + std::cerr << "Examples:\n"; + std::cerr << " # Parse file1.json and dir1 into irs-dir\n"; + std::cerr << " " << m_program_name << " r irs-dir file1.json dir1\n"; po::options_description visible_options; visible_options.add(general_options); @@ -1039,11 +1039,11 @@ void CommandLineArguments::print_search_usage() const { << std::endl; } -void CommandLineArguments::print_json_to_IR_usage() const { +void CommandLineArguments::print_json_to_ir_usage() const { std::cerr << "Usage: " << m_program_name << " r [OPTIONS] IRS_DIR [FILE/DIR ...]" << std::endl; } -void 
CommandLineArguments::print_IR_compression_usage() const { +void CommandLineArguments::print_ir_compression_usage() const { std::cerr << "Usage: " << m_program_name << " i [OPTIONS] ARCHIVES_DIR [FILE/DIR ...]" << std::endl; } diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp index dedd3bd59..48cdb47d1 100644 --- a/components/core/src/clp_s/CommandLineArguments.hpp +++ b/components/core/src/clp_s/CommandLineArguments.hpp @@ -27,8 +27,8 @@ class CommandLineArguments { Compress = 'c', Extract = 'x', Search = 's', - Json_To_IR = 'r', - IR_Compress = 'i' + JsonToIr = 'r', + IrCompress = 'i' }; enum class OutputHandlerType : uint8_t { @@ -161,9 +161,9 @@ class CommandLineArguments { void print_search_usage() const; - void print_json_to_IR_usage() const; + void print_json_to_ir_usage() const; - void print_IR_compression_usage() const; + void print_ir_compression_usage() const; // Variables std::string m_program_name; diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index 32e078f6f..0185f1305 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -1,15 +1,28 @@ #include "JsonParser.hpp" -#include +#include +#include +#include #include +#include #include #include -#include "archive_constants.hpp" +#include "../clp/ffi/SchemaTree.hpp" +#include "../clp/ffi/SchemaTreeNode.hpp" +#include "../clp/ffi/utils.hpp" +#include "../clp/ffi/Value.hpp" +#include "../clp/ir/types.hpp" +#include "../clp/streaming_compression/zstd/Decompressor.hpp" +#include "DictionaryWriter.hpp" #include "JsonFileIterator.hpp" +#include "ParsedMessage.hpp" + +using namespace simdjson; namespace clp_s { + JsonParser::JsonParser(JsonParserOption const& option) : m_num_messages(0), m_target_encoded_size(option.target_encoded_size), @@ -520,13 +533,13 @@ bool JsonParser::parse() { return true; } -NodeType get_archive_node_type( +auto 
JsonParser::get_archive_node_type( clp::ffi::SchemaTreeNode::Type ir_node_type, bool node_has_value, std::optional const& node_value -) { +) -> NodeType { // figure out what type the node is in archive node type - NodeType archive_node_type; + NodeType archive_node_type = NodeType::Unknown; switch (ir_node_type) { case clp::ffi::SchemaTreeNode::Type::Int: archive_node_type = NodeType::Integer; @@ -559,29 +572,33 @@ NodeType get_archive_node_type( } break; default: - archive_node_type = NodeType::Unknown; break; } return archive_node_type; } -// -int JsonParser::get_archive_node_id( - std::map, int32_t>& ir_node_to_archive_node_map, - int ir_node_id, +auto JsonParser::get_archive_node_id( + std::unordered_map>>& + ir_node_to_archive_node_unordered_map, + int32_t ir_node_id, NodeType archive_node_type, clp::ffi::SchemaTree const& ir_tree -) { - auto key = std::make_tuple(ir_node_id, archive_node_type); - auto map_location = ir_node_to_archive_node_map.find(key); - if (ir_node_to_archive_node_map.end() != map_location) { - return map_location->second; +) -> int { + auto unordered_map_location = ir_node_to_archive_node_unordered_map.find(ir_node_id); + if (ir_node_to_archive_node_unordered_map.end() != unordered_map_location) { + auto translation_vector = unordered_map_location->second; + for (int i = 0; i < translation_vector.size(); i++) { + if (translation_vector[i].first == archive_node_type) { + return translation_vector[i].second; + } + } } - auto& curr_node = ir_tree.get_node(ir_node_id); + + auto const& curr_node = ir_tree.get_node(ir_node_id); int32_t parent_node_id{-1}; if (ir_node_id != curr_node.get_parent_id()) { parent_node_id = get_archive_node_id( - ir_node_to_archive_node_map, + ir_node_to_archive_node_unordered_map, curr_node.get_parent_id(), NodeType::Object, ir_tree @@ -597,16 +614,23 @@ int JsonParser::get_archive_node_id( } int curr_node_archive_id = m_archive_writer->add_node(parent_node_id, archive_node_type, node_key); - 
ir_node_to_archive_node_map.emplace(std::move(key), curr_node_archive_id); + auto p = std::make_pair(archive_node_type, curr_node_archive_id); + if (ir_node_to_archive_node_unordered_map.end() != unordered_map_location) { + unordered_map_location->second.push_back(p); + } else { + std::vector> v; + v.push_back(p); + ir_node_to_archive_node_unordered_map.emplace(ir_node_id, v); + } return curr_node_archive_id; } void JsonParser::parse_kv_log_event( KeyValuePairLogEvent const& kv, - std::map, int32_t>& ir_node_to_archive_node_map + std::unordered_map>>& + ir_node_to_archive_node_unordered_map ) { clp::ffi::SchemaTree const& tree = kv.get_schema_tree(); - for (auto const& pair : kv.get_node_id_value_pairs()) { clp::ffi::SchemaTreeNode const& tree_node = tree.get_node(pair.first); clp::ffi::SchemaTreeNode::Type ir_node_type = tree_node.get_type(); @@ -621,7 +645,7 @@ void JsonParser::parse_kv_log_event( int node_id; try { node_id = get_archive_node_id( - ir_node_to_archive_node_map, + ir_node_to_archive_node_unordered_map, pair.first, archive_node_type, tree @@ -705,11 +729,11 @@ void JsonParser::parse_kv_log_event( int32_t current_schema_id = m_archive_writer->add_schema(m_current_schema); m_current_parsed_message.set_id(current_schema_id); m_archive_writer->append_message(current_schema_id, m_current_schema, m_current_parsed_message); - return; } -bool JsonParser::parse_from_IR() { - std::map, int32_t> ir_node_to_archive_node_map; +auto JsonParser::parse_from_ir() -> bool { + std::unordered_map>> + ir_node_to_archive_node_unordered_map; for (auto& file_path : m_file_paths) { int fsize = std::filesystem::file_size(file_path); @@ -742,7 +766,7 @@ bool JsonParser::parse_from_IR() { m_current_schema.clear(); auto const& kv_log_event = kv_log_event_result.value(); try { - parse_kv_log_event(kv_log_event, ir_node_to_archive_node_map); + parse_kv_log_event(kv_log_event, ir_node_to_archive_node_unordered_map); } catch (std::string msg) { SPDLOG_ERROR("ERROR: {}" + msg); 
zd.close(); @@ -754,14 +778,14 @@ bool JsonParser::parse_from_IR() { } m_num_messages++; if (m_archive_writer->get_data_size() >= m_target_encoded_size) { - ir_node_to_archive_node_map.clear(); + ir_node_to_archive_node_unordered_map.clear(); split_archive(); } m_current_parsed_message.clear(); } while (true); - ir_node_to_archive_node_map.clear(); + ir_node_to_archive_node_unordered_map.clear(); zd.close(); } return true; diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index b48d1bbcd..bd58869ed 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -1,45 +1,32 @@ #ifndef CLP_S_JSONPARSER_HPP #define CLP_S_JSONPARSER_HPP -#include +#include +#include #include +#include #include #include #include -#include #include "../clp/BufferReader.hpp" #include "../clp/ffi/ir_stream/Deserializer.hpp" #include "../clp/ffi/KeyValuePairLogEvent.hpp" #include "../clp/ffi/SchemaTree.hpp" #include "../clp/ffi/SchemaTreeNode.hpp" -#include "../clp/ffi/utils.hpp" #include "../clp/ffi/Value.hpp" #include "../clp/GlobalMySQLMetadataDB.hpp" -#include "../clp/ir/types.hpp" -#include "../clp/streaming_compression/zstd/Decompressor.hpp" #include "../clp/type_utils.hpp" #include "ArchiveWriter.hpp" -#include "DictionaryWriter.hpp" -#include "FileReader.hpp" -#include "FileWriter.hpp" #include "ParsedMessage.hpp" #include "Schema.hpp" -#include "SchemaMap.hpp" -#include "SchemaTree.hpp" -#include "SchemaWriter.hpp" -#include "TimestampDictionaryWriter.hpp" -#include "Utils.hpp" -#include "ZstdCompressor.hpp" using clp::BufferReader; using clp::ffi::ir_stream::Deserializer; using clp::ffi::KeyValuePairLogEvent; using clp::size_checked_pointer_cast; -using namespace simdjson; - namespace clp_s { struct JsonParserOption { std::vector file_paths; @@ -88,7 +75,7 @@ class JsonParser { * Parses the Key Value IR Stream and stores the data in the archive. 
* @return whether the IR Stream was parsed successfully */ - [[nodiscard]] bool parse_from_IR(); + [[nodiscard]] auto parse_from_ir() -> bool; /** * Writes the metadata and archive data to disk. @@ -106,29 +93,44 @@ class JsonParser { void parse_line(ondemand::value line, int32_t parent_node_id, std::string const& key); /** - * Parses a Key Value Log Event - * @param kv the key value log event - * @param cache cache of node id conversions between deserializer schema tree nodes and archive - * schema tree nodes + * Compresses the input files specified by the command line arguments into an archive. + * @param ir_node_type schema node type from the IR stream + * @param node_has_value Boolean that say whether or not the node has value. + * @param node_value The ir schema node value if the node has value + * @return The clp-s archive Node Type that shoudl be used for the archive node */ - void parse_kv_log_event( - KeyValuePairLogEvent const& kv, - std::map, int>& cache - ); + static auto get_archive_node_type( + clp::ffi::SchemaTreeNode::Type ir_node_type, + bool node_has_value, + std::optional const& node_value + ) -> NodeType; /** * Get archive node id for ir node - * @param cache cache of node id conversions between deserializer schema tree nodes and archive - * schema tree nodes + * @param ir_node_to_archive_node_unordered_map cache of node id conversions between + * deserializer schema tree nodes and archive schema tree nodes * @param irNodeID * @param irType * @param irTree */ - int get_archive_node_id( - std::map, int>& cache, - int irNodeID, - NodeType archiveNodeType, - clp::ffi::SchemaTree const& irTree + auto get_archive_node_id( + std::unordered_map>>& + ir_node_to_archive_node_unordered_map, + int32_t ir_node_id, + NodeType archive_node_type, + clp::ffi::SchemaTree const& ir_tree + ) -> int; + + /** + * Parses a Key Value Log Event + * @param kv the key value log event + * @param ir_node_to_archive_node_unordered_map cache of node id conversions between + 
* deserializer schema tree nodes and archive schema tree nodes + */ + void parse_kv_log_event( + KeyValuePairLogEvent const& kv, + std::unordered_map>>& + ir_node_to_archive_node_unordered_map ); /** diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index 49bd148bb..7bdada0a3 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -1,7 +1,7 @@ +#include #include #include #include -#include #include #include #include @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -46,6 +47,9 @@ using clp_s::cEpochTimeMin; using clp_s::CommandLineArguments; namespace { + +size_t max_ir_buffer_size = 1'000'000'000; + /** * Compresses the input files specified by the command line arguments into an archive. * @param command_line_arguments @@ -53,12 +57,42 @@ namespace { */ bool compress(CommandLineArguments const& command_line_arguments); +template +auto flush_and_clear_serializer_buffer( + Serializer& serializer, + std::vector& byte_buf +) -> void; + +template +auto unpack_and_serialize_msgpack_bytes( + std::vector const& msgpack_bytes, + Serializer& serializer +) -> bool; + +/** + * Given user specified options and a file path to a JSON file calls the serailizer one each JSON + * entry to serialize into IR + * @param option + * @param path + * @return Whether serialization was successful + */ +template +auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path); + +/** + * Iterates over the input JSON files specified by the command line arguments to generate and IR + * file for each one. + * @param command_line_arguments + * @return Whether generation was successful + */ +auto generate_ir(CommandLineArguments const& command_line_arguments) -> bool; + /** * Compresses the input IR files specified by the command line arguments into an archive. 
* @param command_line_arguments * @return Whether compression was successful */ -bool ir_compress(CommandLineArguments const& command_line_arguments); +auto ir_compress(CommandLineArguments const& command_line_arguments) -> bool; /** * Decompresses the archive specified by the given JsonConstructorOption. @@ -160,7 +194,7 @@ auto unpack_and_serialize_msgpack_bytes( } template -auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) { +auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path) { auto result{Serializer::create()}; if (result.has_error()) { SPDLOG_ERROR("Failed to create Serializer"); @@ -187,14 +221,15 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) { return false; } - std::string line; + std::string line = ""; size_t total_size = 0; if (in_file.is_open()) { while (getline(in_file, line)) { try { auto j_obj = nlohmann::json::parse(line); - if (!unpack_and_serialize_msgpack_bytes( + if (false + == unpack_and_serialize_msgpack_bytes( nlohmann::json::to_msgpack(j_obj), serializer )) @@ -203,7 +238,7 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) { return false; } flush_and_clear_serializer_buffer(serializer, ir_buf); - if (ir_buf.size() >= 1'000'000'000) { + if (ir_buf.size() >= max_ir_buffer_size) { total_size = total_size + ir_buf.size(); zc.write(reinterpret_cast(ir_buf.data()), ir_buf.size()); zc.flush(); @@ -229,7 +264,7 @@ auto run_serializer(clp_s::JsonToIRParserOption option, std::string path) { return true; } -bool generate_IR(CommandLineArguments const& command_line_arguments) { +auto generate_ir(CommandLineArguments const& command_line_arguments) -> bool { auto irs_dir = std::filesystem::path(command_line_arguments.get_archives_dir()); // Create output directory in case it doesn't exist @@ -269,7 +304,7 @@ bool generate_IR(CommandLineArguments const& command_line_arguments) { return true; } -bool ir_compress(CommandLineArguments const& 
command_line_arguments) { +auto ir_compress(CommandLineArguments const& command_line_arguments) -> bool { auto archives_dir = std::filesystem::path(command_line_arguments.get_archives_dir()); // Create output directory in case it doesn't exist @@ -307,7 +342,7 @@ bool ir_compress(CommandLineArguments const& command_line_arguments) { } clp_s::JsonParser parser(option); - if (false == parser.parse_from_IR()) { + if (false == parser.parse_from_ir()) { SPDLOG_ERROR("Encountered error while parsing input"); return false; } @@ -462,12 +497,12 @@ int main(int argc, char const* argv[]) { if (false == compress(command_line_arguments)) { return 1; } - } else if (CommandLineArguments::Command::IR_Compress == command_line_arguments.get_command()) { + } else if (CommandLineArguments::Command::IrCompress == command_line_arguments.get_command()) { if (false == ir_compress(command_line_arguments)) { return 1; } - } else if (CommandLineArguments::Command::Json_To_IR == command_line_arguments.get_command()) { - if (false == generate_IR(command_line_arguments)) { + } else if (CommandLineArguments::Command::JsonToIr == command_line_arguments.get_command()) { + if (false == generate_ir(command_line_arguments)) { return 1; } } else if (CommandLineArguments::Command::Extract == command_line_arguments.get_command()) { From 96c3ef9eb2baac483ffadaca9ad2dc161aa364ef Mon Sep 17 00:00:00 2001 From: Abigail Matthews Date: Mon, 7 Oct 2024 16:56:50 -0400 Subject: [PATCH 12/15] A bit of refactoring and corrections recommended by coderabitai --- .../core/src/clp_s/CommandLineArguments.cpp | 10 ++++-- .../core/src/clp_s/CommandLineArguments.hpp | 5 ++- components/core/src/clp_s/JsonParser.cpp | 1 + components/core/src/clp_s/JsonParser.hpp | 16 +++++---- components/core/src/clp_s/clp-s.cpp | 35 ++++++++++++++----- 5 files changed, 48 insertions(+), 19 deletions(-) diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index 
e4e8a837f..6b6547fe6 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -426,6 +426,11 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { po::value(&m_max_document_size)->value_name("DOC_SIZE")-> default_value(m_max_document_size), "Maximum allowed size (B) for a single document before ir generation fails." + )( + "max-ir-buffer-size", + po::value(&m_max_ir_buffer_size)->value_name("BUFFER_SIZE")-> + default_value(m_max_ir_buffer_size), + "Maximum allowed size (B) for a in memory IR buffer befroe being written to file." )( "encoding-type", po::value(&m_encoding_type)->value_name("ENCODING_TYPE")-> @@ -1040,11 +1045,10 @@ void CommandLineArguments::print_search_usage() const { } void CommandLineArguments::print_json_to_ir_usage() const { - std::cerr << "Usage: " << m_program_name << " r [OPTIONS] IRS_DIR [FILE/DIR ...]" << std::endl; + std::cerr << "Usage: " << m_program_name << " r [OPTIONS] IRS_DIR [FILE/DIR ...]\n"; } void CommandLineArguments::print_ir_compression_usage() const { - std::cerr << "Usage: " << m_program_name << " i [OPTIONS] ARCHIVES_DIR [FILE/DIR ...]" - << std::endl; + std::cerr << "Usage: " << m_program_name << " i [OPTIONS] ARCHIVES_DIR [FILE/DIR ...]\n"; } } // namespace clp_s diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp index 48cdb47d1..91e0eecb1 100644 --- a/components/core/src/clp_s/CommandLineArguments.hpp +++ b/components/core/src/clp_s/CommandLineArguments.hpp @@ -62,7 +62,9 @@ class CommandLineArguments { size_t get_max_document_size() const { return m_max_document_size; } - int get_encoding_type() const { return m_encoding_type; } + [[nodiscard]] auto get_max_ir_buffer_size() const -> size_t { return m_max_ir_buffer_size; } + + [[nodiscard]] auto get_encoding_type() const -> int { return m_encoding_type; } [[nodiscard]] bool print_archive_stats() const { return 
m_print_archive_stats; } @@ -182,6 +184,7 @@ class CommandLineArguments { bool m_ordered_decompression{false}; size_t m_ordered_chunk_size{0}; int m_encoding_type{8}; + size_t m_max_ir_buffer_size{512ULL * 1024 * 1024}; // Metadata db variables std::optional m_metadata_db_config; diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index 0185f1305..3caadcef8 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -18,6 +18,7 @@ #include "DictionaryWriter.hpp" #include "JsonFileIterator.hpp" #include "ParsedMessage.hpp" +#include "SchemaTree.hpp" using namespace simdjson; diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index bd58869ed..2629f3d0b 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -21,6 +21,7 @@ #include "ArchiveWriter.hpp" #include "ParsedMessage.hpp" #include "Schema.hpp" +#include "SchemaTree.hpp" using clp::BufferReader; using clp::ffi::ir_stream::Deserializer; @@ -44,6 +45,7 @@ struct JsonToIRParserOption { std::vector file_paths; std::string irs_dir; size_t max_document_size; + size_t max_ir_buffer_size; int compression_level; int encoding; }; @@ -93,11 +95,11 @@ class JsonParser { void parse_line(ondemand::value line, int32_t parent_node_id, std::string const& key); /** - * Compresses the input files specified by the command line arguments into an archive. + * Determines the archive node type based on the IR node type and value. * @param ir_node_type schema node type from the IR stream - * @param node_has_value Boolean that say whether or not the node has value. - * @param node_value The ir schema node value if the node has value - * @return The clp-s archive Node Type that shoudl be used for the archive node + * @param node_has_value Boolean that says whether or not the node has value. 
+ * @param node_value The IR schema node value if the node has value + * @return The clp-s archive Node Type that should be used for the archive node */ static auto get_archive_node_type( clp::ffi::SchemaTreeNode::Type ir_node_type, @@ -109,9 +111,9 @@ class JsonParser { * Get archive node id for ir node * @param ir_node_to_archive_node_unordered_map cache of node id conversions between * deserializer schema tree nodes and archive schema tree nodes - * @param irNodeID - * @param irType - * @param irTree + * @param ir_node_id ID of the IR node + * @param archive_node_type Type of the archive node + * @param ir_treeThe IR schema tree */ auto get_archive_node_id( std::unordered_map>>& diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index 7bdada0a3..13fbf7885 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -48,8 +48,6 @@ using clp_s::CommandLineArguments; namespace { -size_t max_ir_buffer_size = 1'000'000'000; - /** * Compresses the input files specified by the command line arguments into an archive. * @param command_line_arguments @@ -87,6 +85,17 @@ auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path) */ auto generate_ir(CommandLineArguments const& command_line_arguments) -> bool; +/** + * Fill in JsonParserOption instance based on command line user input + * @param command_line_arguments + * @param option + * @return Whether setup was successful + */ +auto setup_compression_options( + CommandLineArguments const& command_line_arguments, + clp_s::JsonParserOption& option +) -> bool; + /** * Compresses the input IR files specified by the command line arguments into an archive.
* @param command_line_arguments @@ -238,7 +247,7 @@ auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path) return false; } flush_and_clear_serializer_buffer(serializer, ir_buf); - if (ir_buf.size() >= max_ir_buffer_size) { + if (ir_buf.size() >= option.max_ir_buffer_size) { total_size = total_size + ir_buf.size(); zc.write(reinterpret_cast(ir_buf.data()), ir_buf.size()); zc.flush(); @@ -278,11 +287,13 @@ auto generate_ir(CommandLineArguments const& command_line_arguments) -> bool { option.file_paths = command_line_arguments.get_file_paths(); option.irs_dir = irs_dir.string(); option.max_document_size = command_line_arguments.get_max_document_size(); + option.max_ir_buffer_size = command_line_arguments.get_max_ir_buffer_size(); option.compression_level = command_line_arguments.get_compression_level(); option.encoding = command_line_arguments.get_encoding_type(); if (false == clp_s::FileUtils::validate_path(option.file_paths)) { - exit(1); + SPDLOG_ERROR("Invalid file path(s) provided"); + return false; } std::vector all_file_paths; @@ -304,9 +315,11 @@ auto generate_ir(CommandLineArguments const& command_line_arguments) -> bool { return true; } -auto ir_compress(CommandLineArguments const& command_line_arguments) -> bool { +auto setup_compression_options( + CommandLineArguments const& command_line_arguments, + clp_s::JsonParserOption& option +) -> bool { auto archives_dir = std::filesystem::path(command_line_arguments.get_archives_dir()); - // Create output directory in case it doesn't exist try { std::filesystem::create_directory(archives_dir.string()); @@ -318,8 +331,6 @@ auto ir_compress(CommandLineArguments const& command_line_arguments) -> bool { ); return false; } - - clp_s::JsonParserOption option{}; option.file_paths = command_line_arguments.get_file_paths(); option.archives_dir = archives_dir.string(); option.target_encoded_size = command_line_arguments.get_target_encoded_size(); @@ -340,6 +351,14 @@ auto 
ir_compress(CommandLineArguments const& command_line_arguments) -> bool { db_config.get_metadata_table_prefix() ); } + return true; +} + +auto ir_compress(CommandLineArguments const& command_line_arguments) -> bool { + clp_s::JsonParserOption option{}; + if (false == setup_compression_options(command_line_arguments, option)) { + return false; + } clp_s::JsonParser parser(option); if (false == parser.parse_from_ir()) { From 923b6427760af5ca84dd2b9cb356adbbd656035d Mon Sep 17 00:00:00 2001 From: Abigail Matthews Date: Mon, 7 Oct 2024 17:25:24 -0400 Subject: [PATCH 13/15] More graceful error handling and some typo fixes --- components/core/src/clp_s/CommandLineArguments.cpp | 2 +- components/core/src/clp_s/JsonParser.cpp | 3 ++- components/core/src/clp_s/JsonParser.hpp | 2 +- components/core/src/clp_s/clp-s.cpp | 11 +++++++++++ 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index 6b6547fe6..0b31af3ab 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -430,7 +430,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { "max-ir-buffer-size", po::value(&m_max_ir_buffer_size)->value_name("BUFFER_SIZE")-> default_value(m_max_ir_buffer_size), - "Maximum allowed size (B) for a in memory IR buffer befroe being written to file." + "Maximum allowed size (B) for an in memory IR buffer before being written to file."
)( "encoding-type", po::value(&m_encoding_type)->value_name("ENCODING_TYPE")-> diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index 3caadcef8..eef54019d 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -611,7 +611,7 @@ auto JsonParser::get_archive_node_id( if (validated_escaped_key.has_value()) { node_key = validated_escaped_key.value(); } else { - throw "Key is not utf8 compliant"; + throw "Key is not UTF-8 compliant"; } int curr_node_archive_id = m_archive_writer->add_node(parent_node_id, archive_node_type, node_key); @@ -747,6 +747,7 @@ auto JsonParser::parse_from_ir() -> bool { auto deserializer_result = Deserializer::create(zd); if (deserializer_result.has_error()) { + zd.close(); m_archive_writer->close(); return false; } diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index 2629f3d0b..1ce4ac83e 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -113,7 +113,7 @@ class JsonParser { * deserializer schema tree nodes and archive schema tree nodes * @param ir_node_id ID of the IR node * @param archive_node_type Type of the archive node - * @param ir_treeThe IR schema tree + * @param ir_tree The IR schema tree */ auto get_archive_node_id( std::unordered_map>>& diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index 13fbf7885..8043890a5 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -227,6 +227,8 @@ auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path) zc.open(out_file, option.compression_level); } catch (clp_s::ZstdCompressor::OperationFailed& error) { SPDLOG_ERROR("Failed to open ZSTDcompressor - {}", error.what()); + in_file.close(); + out_file.close(); return false; } @@ -244,6 +246,9 @@ auto run_serializer(clp_s::JsonToIRParserOption const& option, 
std::string path) )) { SPDLOG_ERROR("Failed to serialize msgpack bytes for line: {}", line); + in_file.close(); + out_file.close(); + zc.close(); return false; } flush_and_clear_serializer_buffer(serializer, ir_buf); @@ -255,9 +260,15 @@ auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path) } } catch (nlohmann::json::parse_error const& e) { SPDLOG_ERROR("JSON parsing error: {}", e.what()); + in_file.close(); + out_file.close(); + zc.close(); return false; } catch (std::exception const& e) { SPDLOG_ERROR("Error during serialization: {}", e.what()); + in_file.close(); + out_file.close(); + zc.close(); return false; } } From 36024d8c7c2a38ca48261e9d5e6f617d8eeea631 Mon Sep 17 00:00:00 2001 From: Abigail Matthews Date: Mon, 7 Oct 2024 17:43:22 -0400 Subject: [PATCH 14/15] Small acronym capitalization fix --- components/core/src/clp_s/JsonParser.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index 1ce4ac83e..0e08a39cb 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -41,7 +41,7 @@ struct JsonParserOption { std::shared_ptr metadata_db; }; -struct JsonToIRParserOption { +struct JsonToIrParserOption { std::vector file_paths; std::string irs_dir; size_t max_document_size; From f76863e12fc889b5d3100b1b9984ad2a81b0727e Mon Sep 17 00:00:00 2001 From: Abigail Matthews Date: Mon, 7 Oct 2024 18:22:57 -0400 Subject: [PATCH 15/15] Fix broken struct name change propagation --- components/core/src/clp_s/JsonParser.hpp | 2 +- components/core/src/clp_s/clp-s.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index 0e08a39cb..956373ae2 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -62,7 +62,7 @@ class JsonParser { // Constructor 
explicit JsonParser(JsonParserOption const& option); - JsonParser(JsonToIRParserOption const& option); + JsonParser(JsonToIrParserOption const& option); // Destructor ~JsonParser() = default; diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index 8043890a5..554422adf 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -75,7 +75,7 @@ auto unpack_and_serialize_msgpack_bytes( * @return Whether serialization was successful */ template -auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path); +auto run_serializer(clp_s::JsonToIrParserOption const& option, std::string path); /** * Iterates over the input JSON files specified by the command line arguments to generate and IR @@ -203,7 +203,7 @@ auto unpack_and_serialize_msgpack_bytes( } template -auto run_serializer(clp_s::JsonToIRParserOption const& option, std::string path) { +auto run_serializer(clp_s::JsonToIrParserOption const& option, std::string path) { auto result{Serializer::create()}; if (result.has_error()) { SPDLOG_ERROR("Failed to create Serializer"); @@ -294,7 +294,7 @@ auto generate_ir(CommandLineArguments const& command_line_arguments) -> bool { SPDLOG_ERROR("Failed to create archives directory {} - {}", irs_dir.string(), e.what()); return false; } - clp_s::JsonToIRParserOption option{}; + clp_s::JsonToIrParserOption option{}; option.file_paths = command_line_arguments.get_file_paths(); option.irs_dir = irs_dir.string(); option.max_document_size = command_line_arguments.get_max_document_size();