diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 66901ae6c..eaebf7154 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -424,6 +424,8 @@ set(SOURCE_FILES_unitTest src/clp/Profiler.hpp src/clp/Query.cpp src/clp/Query.hpp + src/clp/QueryInterpretation.cpp + src/clp/QueryInterpretation.hpp src/clp/ReaderInterface.cpp src/clp/ReaderInterface.hpp src/clp/ReadOnlyMemoryMappedFile.cpp @@ -489,6 +491,8 @@ set(SOURCE_FILES_unitTest src/clp/VariableDictionaryWriter.cpp src/clp/VariableDictionaryWriter.hpp src/clp/version.hpp + src/clp/WildcardExpression.cpp + src/clp/WildcardExpression.hpp src/clp/WriterInterface.cpp src/clp/WriterInterface.hpp submodules/sqlite3/sqlite3.c diff --git a/components/core/src/clp/Grep.cpp b/components/core/src/clp/Grep.cpp index c59e21ca1..c1e4a4e9f 100644 --- a/components/core/src/clp/Grep.cpp +++ b/components/core/src/clp/Grep.cpp @@ -1,16 +1,16 @@ #include "Grep.hpp" #include +#include -#include +#include +#include #include #include "EncodedVariableInterpreter.hpp" #include "ir/parsing.hpp" #include "ir/types.hpp" -#include "LogSurgeonReader.hpp" #include "StringReader.hpp" -#include "Utils.hpp" using clp::ir::is_delim; using clp::streaming_archive::reader::Archive; @@ -20,7 +20,20 @@ using clp::string_utils::clean_up_wildcard_search_string; using clp::string_utils::is_alphabet; using clp::string_utils::is_wildcard; using clp::string_utils::wildcard_match_unsafe; +using log_surgeon::finite_automata::RegexDFA; +using log_surgeon::finite_automata::RegexDFAByteState; +using log_surgeon::finite_automata::RegexNFA; +using log_surgeon::finite_automata::RegexNFAByteState; +using log_surgeon::lexers::ByteLexer; +using log_surgeon::ParserAST; +using log_surgeon::SchemaAST; +using log_surgeon::SchemaVarAST; +using std::set; using std::string; +using std::string_view; +using std::tuple; +using std::unique_ptr; +using std::variant; using std::vector; namespace clp { @@ -251,15 +264,6 @@ 
bool QueryToken::change_to_next_possible_type() { } } -/** - * Wraps the tokens returned from the log_surgeon lexer, and stores the variable ids of the tokens - * in a search query in a set. This allows for optimized search performance. - */ -class SearchToken : public log_surgeon::Token { -public: - std::set m_type_ids_set; -}; - // Local prototypes /** * Process a QueryToken that is definitely a variable @@ -277,6 +281,7 @@ bool process_var_token( SubQuery& sub_query, string& logtype ); + /** * Finds a message matching the given query * @param query @@ -417,7 +422,7 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery( size_t last_token_end_pos = 0; string logtype; auto escape_handler - = [](std::string_view constant, size_t char_to_escape_pos, string& logtype) -> void { + = [](string_view constant, size_t char_to_escape_pos, string& logtype) -> void { auto const escape_char{enum_to_underlying_type(ir::VariablePlaceholder::Escape)}; auto const next_char_pos{char_to_escape_pos + 1}; // NOTE: We don't want to add additional escapes for wildcards that have been escaped. 
E.g., @@ -432,7 +437,7 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery( for (auto const& query_token : query_tokens) { // Append from end of last token to beginning of this token, to logtype ir::append_constant_to_logtype( - static_cast(processed_search_string) + static_cast(processed_search_string) .substr(last_token_end_pos, query_token.get_begin_pos() - last_token_end_pos), escape_handler, @@ -466,7 +471,7 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery( if (last_token_end_pos < processed_search_string.length()) { // Append from end of last token to end ir::append_constant_to_logtype( - static_cast(processed_search_string) + static_cast(processed_search_string) .substr(last_token_end_pos, string::npos), escape_handler, logtype @@ -502,8 +507,7 @@ std::optional Grep::process_raw_query( epochtime_t search_begin_ts, epochtime_t search_end_ts, bool ignore_case, - log_surgeon::lexers::ByteLexer& forward_lexer, - log_surgeon::lexers::ByteLexer& reverse_lexer, + ByteLexer& lexer, bool use_heuristic ) { // Add prefix and suffix '*' to make the search a sub-string match @@ -512,13 +516,14 @@ std::optional Grep::process_raw_query( processed_search_string += '*'; processed_search_string = clean_up_wildcard_search_string(processed_search_string); - // Split search_string into tokens with wildcards - vector query_tokens; - size_t begin_pos = 0; - size_t end_pos = 0; - bool is_var; - string search_string_for_sub_queries{processed_search_string}; + vector sub_queries; if (use_heuristic) { + // Split search_string into tokens with wildcards + vector query_tokens; + size_t begin_pos = 0; + size_t end_pos = 0; + bool is_var; + string search_string_for_sub_queries{processed_search_string}; // Replace '?' wildcards with '*' wildcards since we currently have no support for // generating sub-queries with '?' wildcards. The final wildcard match on the decompressed // message uses the original wildcards, so correctness will be maintained. 
@@ -540,78 +545,87 @@ std::optional Grep::process_raw_query( { query_tokens.emplace_back(search_string_for_sub_queries, begin_pos, end_pos, is_var); } - } else { - while (get_bounds_of_next_potential_var( - search_string_for_sub_queries, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - )) - { - query_tokens.emplace_back(search_string_for_sub_queries, begin_pos, end_pos, is_var); - } - } - - // Get pointers to all ambiguous tokens. Exclude tokens with wildcards in the middle since we - // fall-back to decompression + wildcard matching for those. - vector ambiguous_tokens; - for (auto& query_token : query_tokens) { - if (!query_token.has_greedy_wildcard_in_middle() && query_token.is_ambiguous_token()) { - ambiguous_tokens.push_back(&query_token); + // Get pointers to all ambiguous tokens. Exclude tokens with wildcards in the middle since + // we fall-back to decompression + wildcard matching for those. + vector ambiguous_tokens; + for (auto& query_token : query_tokens) { + if (!query_token.has_greedy_wildcard_in_middle() && query_token.is_ambiguous_token()) { + ambiguous_tokens.push_back(&query_token); + } } - } - - // Generate a sub-query for each combination of ambiguous tokens - // E.g., if there are two ambiguous tokens each of which could be a logtype or variable, we need - // to create: - // - (token1 as logtype) (token2 as logtype) - // - (token1 as logtype) (token2 as var) - // - (token1 as var) (token2 as logtype) - // - (token1 as var) (token2 as var) - vector sub_queries; - string logtype; - bool type_of_one_token_changed = true; - while (type_of_one_token_changed) { - SubQuery sub_query; - // Compute logtypes and variables for query - auto matchability = generate_logtypes_and_vars_for_subquery( - archive, - search_string_for_sub_queries, - query_tokens, - ignore_case, - sub_query - ); - switch (matchability) { - case SubQueryMatchabilityResult::SupercedesAllSubQueries: - // Since other sub-queries will be superceded by this one, we can 
stop processing - // now - return Query{ - search_begin_ts, - search_end_ts, - ignore_case, - processed_search_string, - {} - }; - case SubQueryMatchabilityResult::MayMatch: - sub_queries.push_back(std::move(sub_query)); - break; - case SubQueryMatchabilityResult::WontMatch: - default: - // Do nothing - break; - } + // Generate a sub-query for each combination of ambiguous tokens + // E.g., if there are two ambiguous tokens each of which could be a logtype or variable, we + // need to create: + // - (token1 as logtype) (token2 as logtype) + // - (token1 as logtype) (token2 as var) + // - (token1 as var) (token2 as logtype) + // - (token1 as var) (token2 as var) + string logtype; + bool type_of_one_token_changed = true; + while (type_of_one_token_changed) { + SubQuery sub_query; + + // Compute logtypes and variables for query + auto matchability = generate_logtypes_and_vars_for_subquery( + archive, + search_string_for_sub_queries, + query_tokens, + ignore_case, + sub_query + ); + switch (matchability) { + case SubQueryMatchabilityResult::SupercedesAllSubQueries: + // Since other sub-queries will be superceded by this one, we can stop + // processing now + return Query{ + search_begin_ts, + search_end_ts, + ignore_case, + processed_search_string, + {} + }; + case SubQueryMatchabilityResult::MayMatch: + sub_queries.push_back(std::move(sub_query)); + break; + case SubQueryMatchabilityResult::WontMatch: + default: + // Do nothing + break; + } - // Update combination of ambiguous tokens - type_of_one_token_changed = false; - for (auto* ambiguous_token : ambiguous_tokens) { - if (ambiguous_token->change_to_next_possible_type()) { - type_of_one_token_changed = true; - break; + // Update combination of ambiguous tokens + type_of_one_token_changed = false; + for (auto* ambiguous_token : ambiguous_tokens) { + if (ambiguous_token->change_to_next_possible_type()) { + type_of_one_token_changed = true; + break; + } } } + } else { + // Use the schema dynamic programming approach 
to perform the search. This iteratively + // creates all possible logtypes that can match substring(0,n) of the query, which includes + // all possible logtypes that can match the query itself. Then these logtypes, and their + // corresponding variables are compared against the archive. + WildcardExpression search_string_for_sub_queries{processed_search_string}; + + // Get the possible logtypes for the query (but only do it once across all archives). + static bool query_substr_interpretations_is_set = false; + static set query_interpretations; + // TODO: until we have per schema logic, we need to do everything for every archive, + // but this only needs to be redone if the schema changes. + constexpr bool execute_for_every_archive = true; + if (execute_for_every_archive || false == query_substr_interpretations_is_set) { + query_interpretations.clear(); + query_interpretations = generate_query_substring_interpretations( + search_string_for_sub_queries, + lexer + ); + query_substr_interpretations_is_set = true; + } + // Use the logtypes to determine all subqueries that may match against the current archive. 
+ generate_sub_queries(query_interpretations, archive, lexer, ignore_case, sub_queries); } if (sub_queries.empty()) { @@ -713,7 +727,7 @@ bool Grep::get_bounds_of_next_potential_var( // - it could be a multi-digit hex value, or // - it's directly preceded by an equals sign and contains an alphabet without a wildcard // between the equals sign and the first alphabet of the token - auto variable = static_cast(value).substr(begin_pos, end_pos - begin_pos); + auto variable = static_cast(value).substr(begin_pos, end_pos - begin_pos); if (contains_decimal_digit || ir::could_be_multi_digit_hex_value(variable)) { is_var = true; } else if (begin_pos > 0 && '=' == value[begin_pos - 1] && contains_alphabet) { @@ -747,149 +761,6 @@ bool Grep::get_bounds_of_next_potential_var( return (value_length != begin_pos); } -bool Grep::get_bounds_of_next_potential_var( - string const& value, - size_t& begin_pos, - size_t& end_pos, - bool& is_var, - log_surgeon::lexers::ByteLexer& forward_lexer, - log_surgeon::lexers::ByteLexer& reverse_lexer -) { - size_t const value_length = value.length(); - if (end_pos >= value_length) { - return false; - } - - is_var = false; - bool contains_wildcard = false; - while (false == is_var && false == contains_wildcard && begin_pos < value_length) { - // Start search at end of last token - begin_pos = end_pos; - - // Find variable begin or wildcard - bool is_escaped = false; - for (; begin_pos < value_length; ++begin_pos) { - char c = value[begin_pos]; - - if (is_escaped) { - is_escaped = false; - - if (false == forward_lexer.is_delimiter(c)) { - // Found escaped non-delimiter, so reverse the index to retain the escape - // character - --begin_pos; - break; - } - } else if ('\\' == c) { - // Escape character - is_escaped = true; - } else { - if (is_wildcard(c)) { - contains_wildcard = true; - break; - } - if (false == forward_lexer.is_delimiter(c)) { - break; - } - } - } - - // Find next delimiter - is_escaped = false; - end_pos = begin_pos; - for (; 
end_pos < value_length; ++end_pos) { - char c = value[end_pos]; - - if (is_escaped) { - is_escaped = false; - - if (forward_lexer.is_delimiter(c)) { - // Found escaped delimiter, so reverse the index to retain the escape character - --end_pos; - break; - } - } else if ('\\' == c) { - // Escape character - is_escaped = true; - } else { - if (is_wildcard(c)) { - contains_wildcard = true; - } else if (forward_lexer.is_delimiter(c)) { - // Found delimiter that's not also a wildcard - break; - } - } - } - - if (end_pos > begin_pos) { - bool has_prefix_wildcard = ('*' == value[begin_pos]) || ('?' == value[begin_pos]); - bool has_suffix_wildcard = ('*' == value[end_pos - 1]) || ('?' == value[begin_pos]); - bool has_wildcard_in_middle = false; - for (size_t i = begin_pos + 1; i < end_pos - 1; ++i) { - if (('*' == value[i] || '?' == value[i]) && value[i - 1] != '\\') { - has_wildcard_in_middle = true; - break; - } - } - SearchToken search_token; - if (has_wildcard_in_middle || (has_prefix_wildcard && has_suffix_wildcard)) { - // DO NOTHING - } else { - StringReader string_reader; - LogSurgeonReader reader_wrapper(string_reader); - log_surgeon::ParserInputBuffer parser_input_buffer; - if (has_suffix_wildcard) { // text* - // TODO: creating a string reader, setting it equal to a string, to read it into - // the ParserInputBuffer, seems like a convoluted way to set a string equal to a - // string, should be improved when adding a SearchParser to log_surgeon - string_reader.open(value.substr(begin_pos, end_pos - begin_pos - 1)); - parser_input_buffer.read_if_safe(reader_wrapper); - forward_lexer.reset(); - forward_lexer.scan_with_wildcard( - parser_input_buffer, - value[end_pos - 1], - search_token - ); - } else if (has_prefix_wildcard) { // *text - std::string value_reverse - = value.substr(begin_pos + 1, end_pos - begin_pos - 1); - std::reverse(value_reverse.begin(), value_reverse.end()); - string_reader.open(value_reverse); - parser_input_buffer.read_if_safe(reader_wrapper); 
- reverse_lexer.reset(); - reverse_lexer.scan_with_wildcard( - parser_input_buffer, - value[begin_pos], - search_token - ); - } else { // no wildcards - string_reader.open(value.substr(begin_pos, end_pos - begin_pos)); - parser_input_buffer.read_if_safe(reader_wrapper); - forward_lexer.reset(); - forward_lexer.scan(parser_input_buffer, search_token); - search_token.m_type_ids_set.insert(search_token.m_type_ids_ptr->at(0)); - } - // TODO: use a set so its faster - // auto const& set = search_token.m_type_ids_set; - // if (set.find(static_cast(log_surgeon::SymbolID::TokenUncaughtStringID)) - // == set.end() - // && set.find(static_cast(log_surgeon::SymbolID::TokenEndID)) - // == set.end()) - // { - // is_var = true; - // } - auto const& type = search_token.m_type_ids_ptr->at(0); - if (type != static_cast(log_surgeon::SymbolID::TokenUncaughtStringID) - && type != static_cast(log_surgeon::SymbolID::TokenEndID)) - { - is_var = true; - } - } - } - } - return (value_length != begin_pos); -} - void Grep::calculate_sub_queries_relevant_to_file( File const& compressed_file, vector& queries @@ -1063,4 +934,365 @@ size_t Grep::search(Query const& query, size_t limit, Archive& archive, File& co return num_matches; } + +set Grep::generate_query_substring_interpretations( + WildcardExpression const& processed_search_string, + ByteLexer& lexer +) { + // Store substring logtypes in a set to avoid duplicates + vector> query_substr_interpretations(processed_search_string.length()); + + // Consider each substr(begin_idx,end_idx) of the processed_search_string and determine if it + // could have been compressed as static-text, a variable, or some combination of + // variables/static-text Then we populate each entry in query_substr_interpretations which + // corresponds to the logtype for substr(0,n). 
To do this, for each combination of + // substr(begin_idx,end_idx) that reconstructs substr(0,n) (e.g., substring "*1 34", can be + // reconstructed from substrings "*1", " ", "34"), store all possible logtypes (e.g. "* + // , "* , etc.) that are unique from any previously checked combination. Each + // entry in query_substr_interpretations is used to build the following entry, with the last + // entry having all possible logtypes for the full query itself. + for (size_t end_idx = 1; end_idx <= processed_search_string.length(); ++end_idx) { + // Skip strings that end with an escape character (e.g., substring " text\" from string + // "* text\* *"). + if (processed_search_string.char_is_escape(end_idx - 1)) { + continue; + } + for (size_t begin_idx = 0; begin_idx < end_idx; ++begin_idx) { + // Skip strings that begin with an incorrectly unescaped wildcard (e.g., substring + // "*text" from string "* \*text *"). + if (begin_idx > 0 && processed_search_string.char_is_escape(begin_idx - 1)) { + continue; + } + auto possible_substr_types = get_interpretations_for_whole_wildcard_expr( + WildcardExpressionView{processed_search_string, begin_idx, end_idx}, + lexer + ); + if (possible_substr_types.empty()) { + continue; + } + + // Use the completed set of variable types for each substr(begin_idx,end_idx) to + // construct all possible logtypes for each substr(0,n), for all n. + if (begin_idx > 0) { + // Handle the case where substr(0,n) is composed of multiple + // substr(begin_idx,end_idx). 
+ for (auto const& prefix : query_substr_interpretations[begin_idx - 1]) { + for (auto& suffix : possible_substr_types) { + QueryInterpretation query_interpretation = prefix; + query_interpretation.append_logtype(suffix); + + if (false + == query_substr_interpretations[end_idx - 1].contains( + query_interpretation + )) + { + // For the interpretations of the query itself we need the logtype + // strings + // TODO: this is doing 2^n the work for cases with encoded variables + if (end_idx == processed_search_string.length()) { + query_interpretation.generate_logtype_string(lexer); + } + + query_substr_interpretations[end_idx - 1].insert(query_interpretation); + } + } + } + } else { + // Handle the case where substr(0,n) == substr(begin_idx,end_idx). + while (false == possible_substr_types.empty()) { + auto possible_substr_type{std::move(possible_substr_types.back())}; + possible_substr_types.pop_back(); + + if (false + == query_substr_interpretations[end_idx - 1].contains(possible_substr_type)) + { + // For the interpretations of the query itself we need the logtype strings + // TODO: this is doing 2^n the work for cases with encoded variables + if (end_idx == processed_search_string.length()) { + possible_substr_type.generate_logtype_string(lexer); + } + + query_substr_interpretations[end_idx - 1].insert(possible_substr_type); + } + } + } + } + } + // The last entry of the query_substr_interpretations is the logtypes for the query itself. 
+ return query_substr_interpretations.back(); +} + +vector Grep::get_interpretations_for_whole_wildcard_expr( + WildcardExpressionView const& wildcard_expr, + ByteLexer& lexer +) { + vector interpretations; + + // Don't allow an isolated greedy wildcard to be considered a variable + if (wildcard_expr.is_greedy_wildcard()) { + interpretations.emplace_back("*"); + return interpretations; + } + + // As we extend substrings adjacent to wildcards, the substrings that begin or end with + // wildcards are redundant (e.g., for string "a*b", a decomposition of the form "a*" + "b" is a + // subset of the more general "a*" + "*" + "*b". Note, as this needs "*", the "*" substring is + // not redundant. This is already handled above). More detail about this is given below. + if (wildcard_expr.starts_or_ends_with_greedy_wildcard()) { + return interpretations; + } + + if (false == wildcard_expr.surrounded_by_delims_or_wildcards(lexer)) { + // Variables must be surrounded by delimiters or wildcards, so this wildcard expression can + // only match static text. + interpretations.emplace_back(wildcard_expr.get_value()); + return interpretations; + } + + // If the substring is preceded or followed by a greedy wildcard then it's possible the + // substring could be extended to match a var, so the wildcards are added to the substring. + // If we don't consider this case we could miss combinations. Take for example "a*b", "a*" + // and "*b" can both match a has# style variable ("\w*\d+\w*"). If we decompose the string + // into either substrings "a*" + "b" or "a" + "*b", neither would capture the possibility of + // a logtype with the form "<has#>*<has#>", which is a valid possibility during compression. + // Instead we desire to decompose the string into "a*" + "*" + "*b". Note, non-greedy + // wildcards do not need to be considered, for example "a?b" can never match "<has#>?<has#>" + // or "<has#>".
+ auto extended_wildcard_expr = wildcard_expr.extend_to_adjacent_greedy_wildcards(); + + set matching_variable_type_ids; + // If the substring contains a wildcard, we need to consider the case that it can simultaneously + // match multiple variables and static text, and we need a different approach to compare against + // the archive. + bool contains_wildcard = false; + std::tie(matching_variable_type_ids, contains_wildcard) + = get_matching_variable_types(extended_wildcard_expr, lexer); + if (matching_variable_type_ids.empty() || contains_wildcard) { + // The wildcard expression doesn't match any variable types, or it contains a wildcard, so + // we must consider that it could match static text. + interpretations.emplace_back(wildcard_expr.get_value()); + } + + bool already_added_dict_var = false; + // Use the variable types to determine the possible_substr_types + for (uint32_t const variable_type_id : matching_variable_type_ids) { + // clp supports three types of variables---int encoded variables, float encoded variables, + // and dictionary variables---whereas log-surgeon (in combination with the schema file) can + // support more, meaning we need to somehow project the variable types found by log-surgeon + // (schema variables) to the variable types that clp supports (clp variables). At present, + // clp's encoded variables have a one-to-one mapping since a variable will only be encoded + // if it's named `QueryInterpretation::cIntVarName` or `QueryInterpretation::cFloatVarName`. + // Thus, any other schema variables need to be treated as clp dictionary variables. + // + // TODO We shouldn't hardcode the type names for encoded variables, but to support that, we + // need to improve our schema file syntax. 
+ auto& variable_type_name = lexer.m_id_symbol[variable_type_id]; + auto is_encoded_variable_type = QueryInterpretation::cIntVarName == variable_type_name + || QueryInterpretation::cFloatVarName == variable_type_name; + if (false == is_encoded_variable_type) { + if (already_added_dict_var) { + // The current variable type is not an encoded variable, so it should be treated as + // a dictionary variable; but we've already added a dictionary variable to the + // current `QueryInterpretation`, so adding another would result in a duplicate + // interpretation. + continue; + } + already_added_dict_var = true; + } else { + if (contains_wildcard) { + // Since the wildcard expression matches one of the encodable variable types and + // contains a wildcard, we need to consider two cases: + // - It could match an encoded variable. + // - It could match a dictionary variable that is the result of failing to encode + // a variable, where that variable seems encodable (e.g., an integer that's too + // large to be encoded). + // On the default code path, we create a query interpretation that interprets the + // expression as a dictionary variable, so here we add another interpretation that + // interprets the expression as an encoded variable. + interpretations.emplace_back( + variable_type_id, + extended_wildcard_expr.get_value(), + contains_wildcard, + true + ); + } + } + interpretations.emplace_back( + variable_type_id, + extended_wildcard_expr.get_value(), + contains_wildcard, + false + ); + + // If the substring has no wildcards, we can safely exclude lower priority variable + // types. + if (false == contains_wildcard) { + break; + } + } + + return interpretations; +} + +/** + * To determine what variable types the wildcard expression could match, we convert the expression + * into a DFA (wildcard expression -> regex -> NFA -> DFA) and compute its intersection with the + * schema's DFA. 
+ */ +tuple, bool> Grep::get_matching_variable_types( + WildcardExpressionView const& wildcard_expr, + ByteLexer const& lexer +) { + // Convert the wildcard expression into an equivalent regex + string regex_search_string; + bool contains_wildcard = false; + for (uint32_t idx = 0; idx < wildcard_expr.length(); idx++) { + if (wildcard_expr.char_is_escape(idx)) { + continue; + } + + auto const c = wildcard_expr.get_char(idx); + if (wildcard_expr.char_is_greedy_wildcard(idx)) { + contains_wildcard = true; + regex_search_string += ".*"; + } else if (wildcard_expr.char_is_non_greedy_wildcard(idx)) { + contains_wildcard = true; + regex_search_string += "."; + } else if (log_surgeon::SchemaParser::get_special_regex_characters().contains(c)) { + regex_search_string += "\\"; + regex_search_string += c; + } else { + regex_search_string += c; + } + } + + // Convert regex to NFA + log_surgeon::Schema substring_schema; + // TODO: log-surgeon should handle resetting this value. + log_surgeon::NonTerminal::m_next_children_start = 0; + // TODO: Optimize NFA creation. + substring_schema.add_variable("search", regex_search_string, -1); + RegexNFA nfa; + auto schema_ast = substring_schema.release_schema_ast_ptr(); + for (auto const& parser_ast : schema_ast->m_schema_vars) { + auto* schema_var_ast = dynamic_cast(parser_ast.get()); + ByteLexer::Rule const rule{0, std::move(schema_var_ast->m_regex_ptr)}; + rule.add_ast(&nfa); + } + + // Convert NFA to DFA + // TODO: Refactor log-surgeon to allow direct usage of DFA/NFA. + // TODO: Optimize DFA creation. + auto const search_string_dfa = ByteLexer::nfa_to_dfa(nfa); + auto const& schema_dfa = lexer.get_dfa(); + + // TODO: Could use a forward/reverse lexer instead of an intersection in a lot of cases. 
+ auto var_types = schema_dfa->get_intersect(search_string_dfa); + return {var_types, contains_wildcard}; +} + +void Grep::generate_sub_queries( + set const& query_interpretations, + Archive const& archive, + ByteLexer& lexer, + bool const ignore_case, + vector& sub_queries +) { + for (auto const& query_interpretation : query_interpretations) { + auto const& logtype_string = query_interpretation.get_logtype_string(); + // Check if the logtype string exists in the logtype dictionary. If not, then this logtype + // string does not form a useful sub query. + std::unordered_set possible_logtype_entries; + archive.get_logtype_dictionary().get_entries_matching_wildcard_string( + logtype_string, + ignore_case, + possible_logtype_entries + ); + if (possible_logtype_entries.empty()) { + continue; + } + + // Check if the variables associated with the logtype string exist in the variable + // dictionary. If not, then this does not form a useful sub query. If the variable is + // encoded in the segment, we just assume it exists in the segment, as we estimate that + // checking is slower than decompressing. 
+ SubQuery sub_query; + bool has_vars = true; + for (uint32_t i = 0; i < query_interpretation.get_logtype_size(); i++) { + if (auto const& logtype_token = query_interpretation.get_logtype_token(i); + std::holds_alternative(logtype_token)) + { + auto const& variable_token = std::get(logtype_token); + auto const variable_type = variable_token.get_variable_type(); + auto const& raw_string = variable_token.get_query_substring(); + auto const is_encoded_with_wildcard = variable_token.get_is_encoded_with_wildcard(); + auto const var_has_wildcard = variable_token.get_has_wildcard(); + auto& schema_type = lexer.m_id_symbol[variable_type]; + encoded_variable_t encoded_var; + if (is_encoded_with_wildcard) { + sub_query.mark_wildcard_match_required(); + } else if (false == var_has_wildcard + && ((schema_type == QueryInterpretation::cIntVarName + && EncodedVariableInterpreter:: + convert_string_to_representable_integer_var( + raw_string, + encoded_var + )) + || (schema_type == QueryInterpretation::cFloatVarName + && EncodedVariableInterpreter:: + convert_string_to_representable_float_var( + raw_string, + encoded_var + )))) + { + sub_query.add_non_dict_var(encoded_var); + } else { + auto& var_dict = archive.get_var_dictionary(); + if (var_has_wildcard) { + // Find matches + std::unordered_set var_dict_entries; + var_dict.get_entries_matching_wildcard_string( + raw_string, + ignore_case, + var_dict_entries + ); + if (var_dict_entries.empty()) { + // Not in dictionary + has_vars = false; + } else { + // Encode matches + std::unordered_set encoded_vars; + for (auto entry : var_dict_entries) { + encoded_vars.insert(EncodedVariableInterpreter::encode_var_dict_id( + entry->get_id() + )); + } + sub_query.add_imprecise_dict_var(encoded_vars, var_dict_entries); + } + } else { + auto entry = var_dict.get_entry_matching_value(raw_string, ignore_case); + if (nullptr == entry) { + // Not in dictionary + has_vars = false; + } else { + encoded_var + = 
EncodedVariableInterpreter::encode_var_dict_id(entry->get_id() + ); + sub_query.add_dict_var(encoded_var, entry); + } + } + } + } + } + if (false == has_vars) { + continue; + } + sub_query.set_possible_logtypes(possible_logtype_entries); + + // Calculate the IDs of the segments that may contain results for the sub-query now that + // we've calculated the matching logtypes and variables + sub_query.calculate_ids_of_matching_segments(); + sub_queries.push_back(std::move(sub_query)); + } +} } // namespace clp diff --git a/components/core/src/clp/Grep.hpp b/components/core/src/clp/Grep.hpp index f520af212..f832b58ca 100644 --- a/components/core/src/clp/Grep.hpp +++ b/components/core/src/clp/Grep.hpp @@ -8,10 +8,13 @@ #include "Defs.h" #include "Query.hpp" +#include "QueryInterpretation.hpp" #include "streaming_archive/reader/Archive.hpp" #include "streaming_archive/reader/File.hpp" +#include "WildcardExpression.hpp" namespace clp { + class Grep { public: // Types @@ -37,8 +40,7 @@ class Grep { * @param search_begin_ts * @param search_end_ts * @param ignore_case - * @param forward_lexer DFA for determining if input is in the schema - * @param reverse_lexer DFA for determining if reverse of input is in the schema + * @param lexer DFA for determining if input is in the schema * @param use_heuristic * @return Query if it may match a message, std::nullopt otherwise */ @@ -48,8 +50,7 @@ class Grep { epochtime_t search_begin_ts, epochtime_t search_end_ts, bool ignore_case, - log_surgeon::lexers::ByteLexer& forward_lexer, - log_surgeon::lexers::ByteLexer& reverse_lexer, + log_surgeon::lexers::ByteLexer& lexer, bool use_heuristic ); @@ -69,25 +70,6 @@ class Grep { bool& is_var ); - /** - * Returns bounds of next potential variable (either a definite variable or a token with - * wildcards) - * @param value String containing token - * @param begin_pos Begin position of last token, changes to begin position of next token - * @param end_pos End position of last token, changes to 
end position of next token - * @param is_var Whether the token is definitely a variable - * @param forward_lexer DFA for determining if input is in the schema - * @param reverse_lexer DFA for determining if reverse of input is in the schema - * @return true if another potential variable was found, false otherwise - */ - static bool get_bounds_of_next_potential_var( - std::string const& value, - size_t& begin_pos, - size_t& end_pos, - bool& is_var, - log_surgeon::lexers::ByteLexer& forward_lexer, - log_surgeon::lexers::ByteLexer& reverse_lexer - ); /** * Marks which sub-queries in each query are relevant to the given file * @param compressed_file @@ -126,6 +108,7 @@ class Grep { streaming_archive::reader::Message& compressed_msg, std::string& decompressed_msg ); + /** * Searches a file with the given query without outputting the results * @param query @@ -143,6 +126,63 @@ class Grep { streaming_archive::reader::Archive& archive, streaming_archive::reader::File& compressed_file ); + + /** + * Generates all possible logtypes that can match each substr(0,n) of the search string. + * Requires that processed_search_string is valid, meaning that only wildcards are escaped + * and the string does not end with an escape character. + * @param processed_search_string + * @param lexer + * @return A set of all QueryInterpretations that can match the query in + * processed_search_string. + */ + static std::set generate_query_substring_interpretations( + WildcardExpression const& processed_search_string, + log_surgeon::lexers::ByteLexer& lexer + ); + + /** + * Computes the tokens (static text or different types of variables) that the given wildcard + * expression (as a whole) could be interpreted as, generates a `QueryInterpretation` for each + * one, and returns the `QueryInterpretation`s. + * @param wildcard_expr + * @param lexer + * @return The `QueryInterpretation`s.
+ */ + static std::vector get_interpretations_for_whole_wildcard_expr( + WildcardExpressionView const& wildcard_expr, + log_surgeon::lexers::ByteLexer& lexer + ); + + /** + * Gets the variable types that the given wildcard expression could match. + * @param wildcard_expr + * @param lexer + * @return A tuple: + * - The set of variable types that the wildcard expression could match. + * - Whether the wildcard expression contains a wildcard. + */ + static std::tuple, bool> get_matching_variable_types( + WildcardExpressionView const& wildcard_expr, + log_surgeon::lexers::ByteLexer const& lexer + ); + + /** + * Compare all possible query logtypes against the archive to determine all possible sub queries + * that can match against messages in the archive. + * @param query_interpretations + * @param archive + * @param lexer + * @param ignore_case + * @param sub_queries + */ + static void generate_sub_queries( + std::set const& query_interpretations, + streaming_archive::reader::Archive const& archive, + log_surgeon::lexers::ByteLexer& lexer, + bool ignore_case, + std::vector& sub_queries + ); }; } // namespace clp diff --git a/components/core/src/clp/Query.hpp b/components/core/src/clp/Query.hpp index e38ec9efb..2f429987c 100644 --- a/components/core/src/clp/Query.hpp +++ b/components/core/src/clp/Query.hpp @@ -135,6 +135,7 @@ class SubQuery { * @return true if matched, false otherwise */ bool matches_logtype(logtype_dictionary_id_t logtype) const; + /** * Whether the given variables contain the subquery's variables in order (but not necessarily * contiguously) diff --git a/components/core/src/clp/QueryInterpretation.cpp b/components/core/src/clp/QueryInterpretation.cpp new file mode 100644 index 000000000..6aa24fdc8 --- /dev/null +++ b/components/core/src/clp/QueryInterpretation.cpp @@ -0,0 +1,195 @@ +#include "QueryInterpretation.hpp" + +#include +#include +#include +#include +#include +#include + +#include "Defs.h" +#include "EncodedVariableInterpreter.hpp" +#include 
"log_surgeon/Lexer.hpp" +#include "LogTypeDictionaryEntry.hpp" +#include "string_utils/string_utils.hpp" + +using log_surgeon::lexers::ByteLexer; +using std::string; + +namespace clp { +auto VariableQueryToken::operator<(VariableQueryToken const& rhs) const -> bool { + if (m_variable_type < rhs.m_variable_type) { + return true; + } + if (m_variable_type > rhs.m_variable_type) { + return false; + } + if (m_query_substring < rhs.m_query_substring) { + return true; + } + if (m_query_substring > rhs.m_query_substring) { + return false; + } + if (m_has_wildcard != rhs.m_has_wildcard) { + return rhs.m_has_wildcard; + } + if (m_is_encoded != rhs.m_is_encoded) { + return rhs.m_is_encoded; + } + return false; +} + +auto VariableQueryToken::operator>(VariableQueryToken const& rhs) const -> bool { + if (m_variable_type > rhs.m_variable_type) { + return true; + } + if (m_variable_type < rhs.m_variable_type) { + return false; + } + if (m_query_substring > rhs.m_query_substring) { + return true; + } + if (m_query_substring < rhs.m_query_substring) { + return false; + } + if (m_has_wildcard != rhs.m_has_wildcard) { + return m_has_wildcard; + } + if (m_is_encoded != rhs.m_is_encoded) { + return m_is_encoded; + } + return false; +} + +void QueryInterpretation::append_logtype(QueryInterpretation& suffix) { + auto const& first_new_token = suffix.m_logtype[0]; + if (auto& prev_token = m_logtype.back(); + false == m_logtype.empty() && std::holds_alternative(prev_token) + && false == suffix.m_logtype.empty() + && std::holds_alternative(first_new_token)) + { + std::get(prev_token).append(std::get(first_new_token)); + m_logtype.insert(m_logtype.end(), suffix.m_logtype.begin() + 1, suffix.m_logtype.end()); + } else { + m_logtype.insert(m_logtype.end(), suffix.m_logtype.begin(), suffix.m_logtype.end()); + } +} + +void QueryInterpretation::generate_logtype_string(ByteLexer& lexer) { + // Convert each query logtype into a set of logtype strings. 
Logtype strings are used in the + // sub query as they have the correct format for comparing against the archive. Also, a + // single query logtype might represent multiple logtype strings. While static text converts + // one-to-one, wildcard variables that may be encoded have different logtype strings when + // comparing against the dictionary than they do when comparing against the segment. + + // Reserve size for m_logtype_string + uint32_t logtype_string_size = 0; + for (uint32_t i = 0; i < get_logtype_size(); i++) { + if (auto const& logtype_token = get_logtype_token(i); + std::holds_alternative(logtype_token)) + { + logtype_string_size + += std::get(logtype_token).get_query_substring().size(); + } else { + logtype_string_size++; + } + } + m_logtype_string.reserve(logtype_string_size); + + for (uint32_t i = 0; i < get_logtype_size(); i++) { + if (auto const& logtype_token = get_logtype_token(i); + std::holds_alternative(logtype_token)) + { + m_logtype_string += std::get(logtype_token).get_query_substring(); + } else { + auto const& variable_token = std::get(logtype_token); + auto const variable_type = variable_token.get_variable_type(); + auto const& raw_string = variable_token.get_query_substring(); + auto const is_encoded_with_wildcard = variable_token.get_is_encoded_with_wildcard(); + auto const var_has_wildcard = variable_token.get_has_wildcard(); + auto& schema_type = lexer.m_id_symbol[variable_type]; + encoded_variable_t encoded_var = 0; + if (is_encoded_with_wildcard) { + if (cIntVarName == schema_type) { + LogTypeDictionaryEntry::add_int_var(m_logtype_string); + } else if (cFloatVarName == schema_type) { + LogTypeDictionaryEntry::add_float_var(m_logtype_string); + } + } else if (false == var_has_wildcard && cIntVarName == schema_type + && EncodedVariableInterpreter::convert_string_to_representable_integer_var( + raw_string, + encoded_var + )) + { + LogTypeDictionaryEntry::add_int_var(m_logtype_string); + } else if (false == var_has_wildcard && 
cFloatVarName == schema_type + && EncodedVariableInterpreter::convert_string_to_representable_float_var( + raw_string, + encoded_var + )) + { + LogTypeDictionaryEntry::add_float_var(m_logtype_string); + } else { + LogTypeDictionaryEntry::add_dict_var(m_logtype_string); + } + } + } +} + +auto QueryInterpretation::operator<(QueryInterpretation const& rhs) const -> bool { + if (m_logtype.size() < rhs.m_logtype.size()) { + return true; + } + if (m_logtype.size() > rhs.m_logtype.size()) { + return false; + } + for (uint32_t i = 0; i < m_logtype.size(); i++) { + if (m_logtype[i] < rhs.m_logtype[i]) { + return true; + } + if (m_logtype[i] > rhs.m_logtype[i]) { + return false; + } + } + return false; +} + +auto operator<<(std::ostream& os, QueryInterpretation const& query_logtype) -> std::ostream& { + os << "logtype='"; + for (uint32_t idx = 0; idx < query_logtype.get_logtype_size(); idx++) { + if (auto const& query_token = query_logtype.get_logtype_token(idx); + std::holds_alternative(query_token)) + { + os << std::get(query_token).get_query_substring(); + } else { + auto const& variable_token = std::get(query_token); + os << "<" << variable_token.get_variable_type() << ">(" + << variable_token.get_query_substring() << ")"; + } + } + os << "', has_wildcard='"; + for (uint32_t idx = 0; idx < query_logtype.get_logtype_size(); idx++) { + if (auto const& query_token = query_logtype.get_logtype_token(idx); + std::holds_alternative(query_token)) + { + os << 0; + } else { + auto const& variable_token = std::get(query_token); + os << variable_token.get_has_wildcard(); + } + } + os << "', is_encoded_with_wildcard='"; + for (uint32_t idx = 0; idx < query_logtype.get_logtype_size(); idx++) { + if (auto const& query_token = query_logtype.get_logtype_token(idx); + std::holds_alternative(query_token)) + { + os << 0; + } else { + auto const& variable_token = std::get(query_token); + os << variable_token.get_is_encoded_with_wildcard(); + } + } + os << "', logtype_string='" << 
query_logtype.get_logtype_string() << "'"; + return os; +} +} // namespace clp diff --git a/components/core/src/clp/QueryInterpretation.hpp b/components/core/src/clp/QueryInterpretation.hpp new file mode 100644 index 000000000..3f8f4fdac --- /dev/null +++ b/components/core/src/clp/QueryInterpretation.hpp @@ -0,0 +1,204 @@ +#ifndef CLP_GREP_QUERY_INTERPRETATION_HPP +#define CLP_GREP_QUERY_INTERPRETATION_HPP + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace clp { +/** + * Represents a static substring in the query string as a token. + */ +class StaticQueryToken { +public: + explicit StaticQueryToken(std::string query_substring) + : m_query_substring(std::move(query_substring)) {} + + auto operator==(StaticQueryToken const& rhs) const -> bool = default; + + auto operator!=(StaticQueryToken const& rhs) const -> bool = default; + + auto operator<(StaticQueryToken const& rhs) const -> bool { + return m_query_substring < rhs.m_query_substring; + } + + auto operator>(StaticQueryToken const& rhs) const -> bool { + return m_query_substring > rhs.m_query_substring; + } + + auto append(StaticQueryToken const& rhs) -> void { + m_query_substring += rhs.get_query_substring(); + } + + [[nodiscard]] auto get_query_substring() const -> std::string const& { + return m_query_substring; + } + +private: + std::string m_query_substring; +}; + +/** + * Represents variable substring in the query string as a token. 
+ */ +class VariableQueryToken { +public: + VariableQueryToken( + uint32_t const variable_type, + std::string query_substring, + bool const has_wildcard, + bool const is_encoded + ) + : m_variable_type(variable_type), + m_query_substring(std::move(query_substring)), + m_has_wildcard(has_wildcard), + m_is_encoded(is_encoded) {} + + auto operator==(VariableQueryToken const& rhs) const -> bool = default; + + auto operator!=(VariableQueryToken const& rhs) const -> bool = default; + + auto operator<(VariableQueryToken const& rhs) const -> bool; + + auto operator>(VariableQueryToken const& rhs) const -> bool; + + [[nodiscard]] auto get_variable_type() const -> uint32_t { return m_variable_type; } + + [[nodiscard]] auto get_query_substring() const -> std::string const& { + return m_query_substring; + } + + [[nodiscard]] auto get_has_wildcard() const -> bool { return m_has_wildcard; } + + [[nodiscard]] auto get_is_encoded_with_wildcard() const -> bool { + return m_is_encoded && m_has_wildcard; + } + +private: + uint32_t m_variable_type; + std::string m_query_substring; + bool m_has_wildcard{false}; + bool m_is_encoded{false}; +}; + +/** + * Represents a logtype that would match the given search query. The logtype is a sequence + * containing values, where each value is either a static character or an integer representing + * a variable type id. Also indicates if an integer/float variable is potentially in the dictionary + * to handle cases containing wildcards. Note: long float and integers that cannot be encoded do not + * fall under this case, as they are not potentially, but definitely in the dictionary, so will be + * searched for in the dictionary regardless. 
+ */ +class QueryInterpretation { +public: + QueryInterpretation() = default; + + explicit QueryInterpretation(std::string const& query_substring) { + append_static_token(query_substring); + } + + QueryInterpretation( + uint32_t const variable_type, + std::string query_substring, + bool const contains_wildcard, + bool const is_encoded + ) { + append_variable_token( + variable_type, + std::move(query_substring), + contains_wildcard, + is_encoded + ); + } + + /** + * Ignores m_logtype_string. + * @param rhs + * @return if m_logtype is equal + */ + auto operator==(QueryInterpretation const& rhs) const -> bool { + return m_logtype == rhs.m_logtype; + } + + /** + * @param rhs + * @return true if the current logtype is shorter than rhs, false if the current logtype + * is longer. If equally long, true if the current logtype is lexicographically smaller than + * rhs, false if bigger. If the logtypes are identical, true if the current search query is + * lexicographically smaller than rhs, false if bigger. If the search queries are identical, + * true if the first mismatch in special character locations is a non-special character for the + * current logtype, false otherwise. Ignores m_logtype_string. 
+ */ + auto operator<(QueryInterpretation const& rhs) const -> bool; + + auto clear() -> void { + m_logtype.clear(); + m_logtype_string = ""; + } + + auto append_logtype(QueryInterpretation& suffix) -> void; + + auto append_static_token(std::string const& query_substring) -> void { + StaticQueryToken static_query_token(query_substring); + if (auto& prev_token = m_logtype.back(); + false == m_logtype.empty() && std::holds_alternative(prev_token)) + { + std::get(prev_token).append(static_query_token); + } else { + m_logtype.emplace_back(static_query_token); + } + } + + auto append_variable_token( + uint32_t const variable_type, + std::string query_substring, + bool const contains_wildcard, + bool const is_encoded + ) -> void { + m_logtype.emplace_back(VariableQueryToken( + variable_type, + std::move(query_substring), + contains_wildcard, + is_encoded + )); + } + + /** + * Generates the logtype string to compare against the logtype dictionary in the archive. + * @param lexer + */ + auto generate_logtype_string(log_surgeon::lexers::ByteLexer& lexer) -> void; + + [[nodiscard]] auto get_logtype_size() const -> uint32_t { return m_logtype.size(); } + + [[nodiscard]] auto get_logtype_token(uint32_t const i + ) const -> std::variant const& { + return m_logtype[i]; + } + + [[nodiscard]] auto get_logtype_string() const -> std::string const& { return m_logtype_string; } + + static constexpr std::string_view cIntVarName = "int"; + static constexpr std::string_view cFloatVarName = "float"; + +private: + std::vector> m_logtype; + std::string m_logtype_string; +}; + +/** + * Convert input query logtype to string for output + * @param os + * @param query_logtype + * @return output stream with the query logtype + */ +auto operator<<(std::ostream& os, QueryInterpretation const& query_logtype) -> std::ostream&; +} // namespace clp + +#endif // CLP_GREP_QUERY_INTERPRETATION_HPP diff --git a/components/core/src/clp/StringReader.cpp b/components/core/src/clp/StringReader.cpp index 
9fa2c27d3..716a400d1 100644 --- a/components/core/src/clp/StringReader.cpp +++ b/components/core/src/clp/StringReader.cpp @@ -14,45 +14,44 @@ using std::string; namespace clp { StringReader::~StringReader() { close(); - free(m_getdelim_buf); } ErrorCode StringReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { - if (input_string.empty()) { + if (m_input_string.empty()) { return ErrorCode_NotInit; } if (nullptr == buf) { return ErrorCode_BadParam; } - if (pos == input_string.size()) { + if (m_pos == m_input_string.size()) { return ErrorCode_EndOfFile; } - if (pos + num_bytes_to_read > input_string.size()) { - num_bytes_to_read = input_string.size() - pos; + if (m_pos + num_bytes_to_read > m_input_string.size()) { + num_bytes_to_read = m_input_string.size() - m_pos; } for (int i = 0; i < num_bytes_to_read; i++) { - buf[i] = input_string[i + pos]; + buf[i] = m_input_string[i + m_pos]; } num_bytes_read = num_bytes_to_read; - pos += num_bytes_read; + m_pos += num_bytes_read; return ErrorCode_Success; } ErrorCode StringReader::try_seek_from_begin(size_t pos) { - this->pos = pos; + m_pos = pos; return ErrorCode_Success; } ErrorCode StringReader::try_get_pos(size_t& pos) { - pos = this->pos; + pos = m_pos; return ErrorCode_Success; } ErrorCode StringReader::try_open(string const& input_string) { - this->input_string = input_string; - string_is_set = true; + m_input_string = input_string; + m_string_is_set = true; return ErrorCode_Success; } @@ -60,5 +59,9 @@ void StringReader::open(string const& input_string) { try_open(input_string); } -void StringReader::close() {} +void StringReader::close() { + m_input_string.clear(); + m_string_is_set = false; + m_pos = 0; +} } // namespace clp diff --git a/components/core/src/clp/StringReader.hpp b/components/core/src/clp/StringReader.hpp index 5f3c4a73d..160580d4c 100644 --- a/components/core/src/clp/StringReader.hpp +++ b/components/core/src/clp/StringReader.hpp @@ -23,7 +23,7 @@ class StringReader : 
public ReaderInterface { char const* what() const noexcept override { return "StringReader operation failed"; } }; - StringReader() : pos(0), m_getdelim_buf_len(0), m_getdelim_buf(nullptr), string_is_set(false) {} + StringReader() = default; ~StringReader(); @@ -59,7 +59,7 @@ class StringReader : public ReaderInterface { ErrorCode try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) override; // Methods - bool is_open() const { return string_is_set; } + bool is_open() const { return m_string_is_set; } /** * Tries to open a file @@ -79,18 +79,11 @@ class StringReader : public ReaderInterface { * Closes the file if it's open */ void close(); - /** - * Tries to stat the current file - * @param stat_buffer - * @return ErrorCode_errno on error - * @return ErrorCode_Success on success - */ + private: - size_t m_getdelim_buf_len; - char* m_getdelim_buf; - std::string input_string; - uint32_t pos; - bool string_is_set; + std::string m_input_string; + uint32_t m_pos{0}; + bool m_string_is_set{false}; }; } // namespace clp diff --git a/components/core/src/clp/Utils.cpp b/components/core/src/clp/Utils.cpp index f487a3880..65c1c88ef 100644 --- a/components/core/src/clp/Utils.cpp +++ b/components/core/src/clp/Utils.cpp @@ -176,7 +176,6 @@ void load_lexer_from_file( bool reverse, log_surgeon::lexers::ByteLexer& lexer ) { - log_surgeon::SchemaParser sp; std::unique_ptr schema_ast = log_surgeon::SchemaParser::try_schema_file(schema_file_path); if (!lexer.m_symbol_id.empty()) { @@ -240,10 +239,6 @@ void load_lexer_from_file( for (std::unique_ptr const& parser_ast : schema_ast->m_schema_vars) { auto* rule = dynamic_cast(parser_ast.get()); - if ("timestamp" == rule->m_name) { - continue; - } - if (lexer.m_symbol_id.find(rule->m_name) == lexer.m_symbol_id.end()) { lexer.m_symbol_id[rule->m_name] = lexer.m_symbol_id.size(); lexer.m_id_symbol[lexer.m_symbol_id[rule->m_name]] = rule->m_name; @@ -264,7 +259,7 @@ void load_lexer_from_file( } } - if (contains_delimiter) { + 
if (contains_delimiter && "timestamp" != rule->m_name) { FileReader schema_reader{schema_ast->m_file_path}; // more detailed debugging based on looking at the file string line; diff --git a/components/core/src/clp/WildcardExpression.cpp b/components/core/src/clp/WildcardExpression.cpp new file mode 100644 index 000000000..85092b9ee --- /dev/null +++ b/components/core/src/clp/WildcardExpression.cpp @@ -0,0 +1,112 @@ +#include "WildcardExpression.hpp" + +#include +#include +#include +#include + +#include +#include + +namespace clp { +WildcardExpression::WildcardExpression(std::string processed_search_string) + : m_processed_search_string(std::move(processed_search_string)) { + m_is_greedy_wildcard.reserve(m_processed_search_string.size()); + m_is_non_greedy_wildcard.reserve(m_processed_search_string.size()); + m_is_escape.reserve(m_processed_search_string.size()); + bool is_escaped = false; + for (auto const& c : m_processed_search_string) { + if (is_escaped) { + m_is_greedy_wildcard.push_back(false); + m_is_non_greedy_wildcard.push_back(false); + m_is_escape.push_back(false); + is_escaped = false; + } else { + if ('\\' == c) { + m_is_greedy_wildcard.push_back(false); + m_is_non_greedy_wildcard.push_back(false); + m_is_escape.push_back(true); + is_escaped = true; + } else if ('*' == c) { + m_is_greedy_wildcard.push_back(true); + m_is_non_greedy_wildcard.push_back(false); + m_is_escape.push_back(false); + } else if ('?' 
== c) { + m_is_greedy_wildcard.push_back(false); + m_is_non_greedy_wildcard.push_back(true); + m_is_escape.push_back(false); + } else { + m_is_greedy_wildcard.push_back(false); + m_is_non_greedy_wildcard.push_back(false); + m_is_escape.push_back(false); + } + } + } +} + +WildcardExpressionView::WildcardExpressionView( + WildcardExpression const& wildcard_expression, + size_t const begin_idx, + size_t const end_idx +) + : m_expression{&wildcard_expression}, + m_begin_idx{begin_idx}, + m_end_idx{end_idx} { + m_end_idx = std::min(m_end_idx, wildcard_expression.length()); + m_begin_idx = std::min(m_begin_idx, m_end_idx); +} + +auto WildcardExpressionView::extend_to_adjacent_greedy_wildcards() const -> WildcardExpressionView { + auto extended_view = *this; + bool const prev_char_is_greedy_wildcard + = m_begin_idx > 0 && m_expression->char_is_greedy_wildcard(m_begin_idx - 1); + if (prev_char_is_greedy_wildcard) { + --extended_view.m_begin_idx; + } + bool const next_char_is_greedy_wildcard = m_end_idx < m_expression->length() + && m_expression->char_is_greedy_wildcard(m_end_idx); + if (next_char_is_greedy_wildcard) { + ++extended_view.m_end_idx; + } + return extended_view; +} + +auto WildcardExpressionView::surrounded_by_delims_or_wildcards( + log_surgeon::lexers::ByteLexer const& lexer +) const -> bool { + bool has_preceding_delim{}; + if (0 == m_begin_idx) { + has_preceding_delim = true; + } else { + bool const preceded_by_greedy_wildcard + = m_expression->char_is_greedy_wildcard(m_begin_idx - 1); + bool const preceded_by_non_greedy_wildcard + = m_expression->char_is_non_greedy_wildcard(m_begin_idx - 1); + bool const preceded_by_delimiter + = lexer.is_delimiter(m_expression->get_char(m_begin_idx - 1)); + has_preceding_delim = preceded_by_greedy_wildcard || preceded_by_non_greedy_wildcard + || preceded_by_delimiter; + } + + bool has_succeeding_delim{}; + if (m_expression->length() == m_end_idx) { + has_succeeding_delim = true; + } else { + bool const 
succeeded_by_greedy_wildcard = m_expression->char_is_greedy_wildcard(m_end_idx); + bool const succeeded_by_non_greedy_wildcard + = m_expression->char_is_non_greedy_wildcard(m_end_idx); + // E.g. "foo:", where ':' is a delimiter + bool const succeeded_by_unescaped_delim + = false == m_expression->char_is_escape(m_end_idx) + && lexer.is_delimiter(m_expression->get_char(m_end_idx)); + // E.g. "foo\\", where '\' is a delimiter + bool const succeeded_by_escaped_delim + = m_expression->char_is_escape(m_end_idx) + && lexer.is_delimiter(m_expression->get_char(m_end_idx + 1)); + has_succeeding_delim = succeeded_by_greedy_wildcard || succeeded_by_non_greedy_wildcard + || succeeded_by_unescaped_delim || succeeded_by_escaped_delim; + } + + return has_preceding_delim && has_succeeding_delim; +} +} // namespace clp diff --git a/components/core/src/clp/WildcardExpression.hpp b/components/core/src/clp/WildcardExpression.hpp new file mode 100644 index 000000000..c3de2e43b --- /dev/null +++ b/components/core/src/clp/WildcardExpression.hpp @@ -0,0 +1,125 @@ +#ifndef CLP_WILDCARDEXPRESSION_HPP +#define CLP_WILDCARDEXPRESSION_HPP + +#include +#include +#include + +#include + +namespace clp { +/** + * A pattern for matching strings. The pattern supports two types of wildcards: + * - '*' matches zero or more characters + * - '?' matches any single character + * + * To match a literal '*' or '?', the pattern should escape it with a backslash (`\`). 
+ */ +class WildcardExpression { +public: + explicit WildcardExpression(std::string processed_search_string); + + [[nodiscard]] auto substr(size_t const begin_idx, size_t const length) const -> std::string { + return m_processed_search_string.substr(begin_idx, length); + } + + [[nodiscard]] auto length() const -> size_t { return m_processed_search_string.size(); } + + [[nodiscard]] auto char_is_greedy_wildcard(size_t const idx) const -> bool { + return m_is_greedy_wildcard[idx]; + } + + [[nodiscard]] auto char_is_non_greedy_wildcard(size_t const idx) const -> bool { + return m_is_non_greedy_wildcard[idx]; + } + + [[nodiscard]] auto char_is_escape(size_t const idx) const -> bool { return m_is_escape[idx]; } + + [[nodiscard]] auto get_char(size_t const idx) const -> char { + return m_processed_search_string[idx]; + } + +private: + std::vector m_is_greedy_wildcard; + std::vector m_is_non_greedy_wildcard; + std::vector m_is_escape; + std::string m_processed_search_string; +}; + +/** + * A view of a WildcardExpression. + */ +class WildcardExpressionView { +public: + /** + * Creates a view of the range [begin_idx, end_idx) in the given wildcard expression. + * + * NOTE: To ensure validity, end_idx is limited to wildcard_expression.length(), and then + * begin_idx is limited to end_idx. + * @param wildcard_expression + * @param begin_idx + * @param end_idx + */ + WildcardExpressionView( + WildcardExpression const& wildcard_expression, + size_t begin_idx, + size_t end_idx + ); + + /** + * @return A copy of this view, but extended to include adjacent greedy wildcards. 
+ */ + [[nodiscard]] auto extend_to_adjacent_greedy_wildcards() const -> WildcardExpressionView; + + [[nodiscard]] auto is_greedy_wildcard() const -> bool { + return 1 == length() && m_expression->char_is_greedy_wildcard(m_begin_idx); + } + + [[nodiscard]] auto is_non_greedy_wildcard() const -> bool { + return 1 == length() && m_expression->char_is_non_greedy_wildcard(m_begin_idx); + } + + [[nodiscard]] auto starts_or_ends_with_greedy_wildcard() const -> bool { + return length() > 0 + && (m_expression->char_is_greedy_wildcard(m_begin_idx) + || m_expression->char_is_greedy_wildcard(m_end_idx - 1)); + } + + /** + * @param lexer + * @return Whether the substring in view is surrounded by delimiters or unescaped wildcards. + * NOTE: This method assumes that the viewed string is preceded and succeeded by a delimiter. + */ + [[nodiscard]] auto surrounded_by_delims_or_wildcards(log_surgeon::lexers::ByteLexer const& lexer + ) const -> bool; + + [[nodiscard]] auto length() const -> size_t { return m_end_idx - m_begin_idx; } + + [[nodiscard]] auto char_is_greedy_wildcard(size_t const idx) const -> bool { + return m_expression->char_is_greedy_wildcard(m_begin_idx + idx); + } + + [[nodiscard]] auto char_is_non_greedy_wildcard(size_t const idx) const -> bool { + return m_expression->char_is_non_greedy_wildcard(m_begin_idx + idx); + } + + [[nodiscard]] auto char_is_escape(size_t const idx) const -> bool { + return m_expression->char_is_escape(m_begin_idx + idx); + } + + [[nodiscard]] auto get_char(size_t const idx) const -> char { + return m_expression->get_char(m_begin_idx + idx); + } + + [[nodiscard]] auto get_value() const -> std::string { + return m_expression->substr(m_begin_idx, m_end_idx - m_begin_idx); + } + +private: + WildcardExpression const* m_expression; + size_t m_begin_idx; + size_t m_end_idx; +}; +} // namespace clp + +#endif // CLP_WILDCARDEXPRESSION_HPP diff --git a/components/core/src/clp/clg/CMakeLists.txt b/components/core/src/clp/clg/CMakeLists.txt index 
a0ca5e9d0..1498fa5f5 100644 --- a/components/core/src/clp/clg/CMakeLists.txt +++ b/components/core/src/clp/clg/CMakeLists.txt @@ -59,6 +59,8 @@ set( ../Profiler.hpp ../Query.cpp ../Query.hpp + ../QueryInterpretation.cpp + ../QueryInterpretation.hpp ../ReaderInterface.cpp ../ReaderInterface.hpp ../ReadOnlyMemoryMappedFile.cpp @@ -115,6 +117,8 @@ set( ../VariableDictionaryWriter.cpp ../VariableDictionaryWriter.hpp ../version.hpp + ../WildcardExpression.cpp + ../WildcardExpression.hpp ../WriterInterface.cpp ../WriterInterface.hpp "${PROJECT_SOURCE_DIR}/submodules/sqlite3/sqlite3.c" diff --git a/components/core/src/clp/clg/clg.cpp b/components/core/src/clp/clg/clg.cpp index dd35a3283..0ce526b4f 100644 --- a/components/core/src/clp/clg/clg.cpp +++ b/components/core/src/clp/clg/clg.cpp @@ -53,12 +53,15 @@ static bool open_archive(string const& archive_path, Archive& archive_reader); * @param search_strings * @param command_line_args * @param archive + * @param lexer + * @param use_heuristic * @return true on success, false otherwise */ static bool search( vector const& search_strings, CommandLineArguments& command_line_args, Archive& archive, + log_surgeon::lexers::ByteLexer& lexer, bool use_heuristic ); /** @@ -205,8 +208,7 @@ static bool search( vector const& search_strings, CommandLineArguments& command_line_args, Archive& archive, - log_surgeon::lexers::ByteLexer& forward_lexer, - log_surgeon::lexers::ByteLexer& reverse_lexer, + log_surgeon::lexers::ByteLexer& lexer, bool use_heuristic ) { ErrorCode error_code; @@ -225,8 +227,7 @@ static bool search( search_begin_ts, search_end_ts, command_line_args.ignore_case(), - forward_lexer, - reverse_lexer, + lexer, use_heuristic ); if (query_processing_result.has_value()) { @@ -545,12 +546,9 @@ int main(int argc, char const* argv[]) { // TODO: if performance is too slow, can make this more efficient by only diffing files with the // same checksum uint32_t const max_map_schema_length = 100'000; - std::map forward_lexer_map; - 
std::map reverse_lexer_map; - log_surgeon::lexers::ByteLexer one_time_use_forward_lexer; - log_surgeon::lexers::ByteLexer one_time_use_reverse_lexer; - log_surgeon::lexers::ByteLexer* forward_lexer_ptr; - log_surgeon::lexers::ByteLexer* reverse_lexer_ptr; + std::map lexer_map; + log_surgeon::lexers::ByteLexer one_time_use_lexer; + log_surgeon::lexers::ByteLexer* lexer_ptr{nullptr}; string archive_id; Archive archive_reader; @@ -592,46 +590,27 @@ int main(int argc, char const* argv[]) { size_t num_bytes_read; file_reader.read(buf, max_map_schema_length, num_bytes_read); if (num_bytes_read < max_map_schema_length) { - auto forward_lexer_map_it = forward_lexer_map.find(buf); - auto reverse_lexer_map_it = reverse_lexer_map.find(buf); + auto lexer_map_it = lexer_map.find(buf); // if there is a chance there might be a difference make a new lexer as it's pretty // fast to create - if (forward_lexer_map_it == forward_lexer_map.end()) { + if (lexer_map_it == lexer_map.end()) { // Create forward lexer - auto insert_result - = forward_lexer_map.emplace(buf, log_surgeon::lexers::ByteLexer()); - forward_lexer_ptr = &insert_result.first->second; - load_lexer_from_file(schema_file_path, false, *forward_lexer_ptr); - - // Create reverse lexer - insert_result - = reverse_lexer_map.emplace(buf, log_surgeon::lexers::ByteLexer()); - reverse_lexer_ptr = &insert_result.first->second; - load_lexer_from_file(schema_file_path, true, *reverse_lexer_ptr); + auto insert_result = lexer_map.emplace(buf, log_surgeon::lexers::ByteLexer()); + lexer_ptr = &insert_result.first->second; + load_lexer_from_file(schema_file_path, false, *lexer_ptr); } else { - // load the lexers if they already exist - forward_lexer_ptr = &forward_lexer_map_it->second; - reverse_lexer_ptr = &reverse_lexer_map_it->second; + // load the lexer if it already exists + lexer_ptr = &lexer_map_it->second; } } else { - // Create forward lexer - forward_lexer_ptr = &one_time_use_forward_lexer; - 
load_lexer_from_file(schema_file_path, false, one_time_use_forward_lexer); - - // Create reverse lexer - reverse_lexer_ptr = &one_time_use_reverse_lexer; - load_lexer_from_file(schema_file_path, false, one_time_use_reverse_lexer); + // Create lexer + lexer_ptr = &one_time_use_lexer; + load_lexer_from_file(schema_file_path, false, one_time_use_lexer); } } // Perform search - if (!search(search_strings, - command_line_args, - archive_reader, - *forward_lexer_ptr, - *reverse_lexer_ptr, - use_heuristic)) - { + if (!search(search_strings, command_line_args, archive_reader, *lexer_ptr, use_heuristic)) { return -1; } archive_reader.close(); diff --git a/components/core/src/clp/clo/CMakeLists.txt b/components/core/src/clp/clo/CMakeLists.txt index 931bffeaf..ce814e8d4 100644 --- a/components/core/src/clp/clo/CMakeLists.txt +++ b/components/core/src/clp/clo/CMakeLists.txt @@ -59,6 +59,8 @@ set( ../Profiler.hpp ../Query.cpp ../Query.hpp + ../QueryInterpretation.cpp + ../QueryInterpretation.hpp ../ReaderInterface.cpp ../ReaderInterface.hpp ../ReadOnlyMemoryMappedFile.cpp @@ -117,6 +119,8 @@ set( ../VariableDictionaryWriter.cpp ../VariableDictionaryWriter.hpp ../version.hpp + ../WildcardExpression.cpp + ../WildcardExpression.hpp ../WriterInterface.cpp ../WriterInterface.hpp "${PROJECT_SOURCE_DIR}/submodules/sqlite3/sqlite3.c" diff --git a/components/core/src/clp/clo/clo.cpp b/components/core/src/clp/clo/clo.cpp index f29df0306..07c29c308 100644 --- a/components/core/src/clp/clo/clo.cpp +++ b/components/core/src/clp/clo/clo.cpp @@ -467,17 +467,13 @@ static bool search_archive( // Load lexers from schema file if it exists auto schema_file_path = archive_path / clp::streaming_archive::cSchemaFileName; - unique_ptr forward_lexer, reverse_lexer; + unique_ptr lexer; bool use_heuristic = true; if (std::filesystem::exists(schema_file_path)) { use_heuristic = false; // Create forward lexer - forward_lexer.reset(new log_surgeon::lexers::ByteLexer()); - 
load_lexer_from_file(schema_file_path.string(), false, *forward_lexer); - - // Create reverse lexer - reverse_lexer.reset(new log_surgeon::lexers::ByteLexer()); - load_lexer_from_file(schema_file_path.string(), true, *reverse_lexer); + lexer = std::make_unique(); + load_lexer_from_file(schema_file_path.string(), false, *lexer); } Archive archive_reader; @@ -493,8 +489,7 @@ static bool search_archive( search_begin_ts, search_end_ts, command_line_args.ignore_case(), - *forward_lexer, - *reverse_lexer, + *lexer, use_heuristic ); if (false == query_processing_result.has_value()) { diff --git a/components/core/src/clp/clp/FileCompressor.hpp b/components/core/src/clp/clp/FileCompressor.hpp index b8b6c55fd..47a46550c 100644 --- a/components/core/src/clp/clp/FileCompressor.hpp +++ b/components/core/src/clp/clp/FileCompressor.hpp @@ -38,6 +38,7 @@ class FileCompressor { * @param target_encoded_file_size * @param file_to_compress * @param archive_writer + * @param use_heuristic * @return true if the file was compressed successfully, false otherwise */ bool compress_file( diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index 6804fac7a..4e6ec554b 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -329,6 +329,9 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) { change_ts_pattern(timestamp_pattern); m_old_ts_pattern = timestamp_pattern; } + } else if (nullptr != m_old_ts_pattern) { + change_ts_pattern(nullptr); + m_old_ts_pattern = nullptr; } if (get_data_size_of_dictionaries() >= m_target_data_size_of_dicts) { split_file_and_archive( diff --git a/components/core/submodules/log-surgeon b/components/core/submodules/log-surgeon index 895f46489..0b9e45cf2 160000 --- a/components/core/submodules/log-surgeon +++ b/components/core/submodules/log-surgeon @@ -1 +1 @@ -Subproject commit 
895f46489b1911ab3b3aac3202afd56c96e8cd98 +Subproject commit 0b9e45cf286c2aed6ab06840592e90f73a75a3e3 diff --git a/components/core/tests/test-Grep.cpp b/components/core/tests/test-Grep.cpp index 6d0603787..45c825cd6 100644 --- a/components/core/tests/test-Grep.cpp +++ b/components/core/tests/test-Grep.cpp @@ -1,27 +1,172 @@ +#include #include +#include +#include #include +#include #include #include #include "../src/clp/Grep.hpp" +#include "../src/clp/ir/types.hpp" +#include "../src/clp/QueryInterpretation.hpp" +#include "../src/clp/type_utils.hpp" +#include "log_surgeon/LogParser.hpp" +using clp::enum_to_underlying_type; using clp::Grep; +using clp::ir::VariablePlaceholder; using clp::load_lexer_from_file; +using clp::QueryInterpretation; +using clp::WildcardExpression; +using clp::WildcardExpressionView; +using fmt::format; +using fmt::join; +using fmt::make_format_args; +using fmt::vformat; using log_surgeon::DelimiterStringAST; using log_surgeon::lexers::ByteLexer; using log_surgeon::ParserAST; using log_surgeon::SchemaAST; using log_surgeon::SchemaParser; using log_surgeon::SchemaVarAST; +using std::apply; +using std::back_inserter; +using std::forward; +using std::index_sequence; +using std::make_index_sequence; +using std::ostream; +using std::ranges::transform; +using std::set; +using std::size_t; using std::string; +using std::string_view; +using std::unordered_map; +using std::vector; -TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var]") { - ByteLexer forward_lexer; - load_lexer_from_file("../tests/test_schema_files/search_schema.txt", false, forward_lexer); - ByteLexer reverse_lexer; - load_lexer_from_file("../tests/test_schema_files/search_schema.txt", true, reverse_lexer); +auto operator<<(ostream& os, unordered_map const& map) -> ostream& { + os << "{ "; + for (auto const& [key, value] : map) { + os << "{" << key << ": " << value << "} "; + } + os << "}"; + return os; +} + +class ExpectedInterpretation { +public: + 
explicit ExpectedInterpretation(ByteLexer& lexer) : lexer(lexer) {} + + // Handles the case where `force_add_to_dictionary_list` is empty + static auto get_placeholder(string const& variable_type_name) -> char { + if (variable_type_name == "int") { + return enum_to_underlying_type(VariablePlaceholder::Integer); + } + if (variable_type_name == "float") { + return enum_to_underlying_type(VariablePlaceholder::Float); + } + return enum_to_underlying_type(VariablePlaceholder::Dictionary); + } + + static auto + get_placeholder(string const& variable_type_name, bool const force_add_to_dictionary) -> char { + if (force_add_to_dictionary) { + return enum_to_underlying_type(VariablePlaceholder::Dictionary); + } + return get_placeholder(variable_type_name); + } + + // Handles the case where there are no variable types because we can't call `get_placeholder`. + auto add_string( + string const& logtype, + string const& has_wildcard, + string const& is_encoded_with_wildcard, + string const& logtype_string + ) -> void { + expected_strings.insert( + format("logtype='{}', has_wildcard='{}', is_encoded_with_wildcard='{}', " + "logtype_string='{}'", + logtype, + has_wildcard, + is_encoded_with_wildcard, + logtype_string) + ); + } + + // TODO: Fix this so you can omit force_add_to_dictionary_list for multiple variable types. + template + auto add_string( + string const& logtype, + string const& has_wildcard, + string const& is_encoded_with_wildcard, + string const& logtype_string, + VariableTypeNames... variable_type_names, + ForceAddToDictionaryList... 
force_add_to_dictionary_list + ) -> void { + auto formatted_logtype + = vformat(logtype, make_format_args(lexer.m_symbol_id[variable_type_names]...)); + string formatted_logtype_string; + if constexpr (0 == sizeof...(force_add_to_dictionary_list)) { + formatted_logtype_string = vformat( + logtype_string, + make_format_args((get_placeholder(variable_type_names), ...)) + ); + } else { + formatted_logtype_string = vformat( + logtype_string, + make_format_args(get_placeholder( + variable_type_names, + force_add_to_dictionary_list + + )...) + ); + } + add_string( + formatted_logtype, + has_wildcard, + is_encoded_with_wildcard, + formatted_logtype_string + ); + } + + auto compare(string const& search_query_string) -> void { + WildcardExpression search_query(search_query_string); + set const& query_interpretations + = Grep::generate_query_substring_interpretations(search_query, lexer); + std::set actual_strings; + for (auto const& query_logtype : query_interpretations) { + std::ostringstream oss; + oss << query_logtype; + actual_strings.insert(oss.str()); + } + + // Compare element by element. 
+ std::ostringstream oss; + oss << lexer.m_id_symbol; + CAPTURE(oss.str()); + CAPTURE(actual_strings); + CAPTURE(expected_strings); + + while (false == actual_strings.empty() && false == expected_strings.empty()) { + auto it_actual = actual_strings.begin(); + auto it_expected = expected_strings.begin(); + REQUIRE(*it_actual == *it_expected); + actual_strings.erase(it_actual); + expected_strings.erase(it_expected); + } + + // Make sure all the elements of both sets were used + REQUIRE(actual_strings == expected_strings); + } + +private: + set expected_strings; + ByteLexer& lexer; +}; + +TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var]") { string str; size_t begin_pos; size_t end_pos; @@ -31,130 +176,50 @@ TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var str = ""; begin_pos = string::npos; end_pos = string::npos; - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == false); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == false); // Empty string str = ""; begin_pos = 0; end_pos = 0; - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == false); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == false); // No tokens str = "="; begin_pos = 0; end_pos = 0; - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == false); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == false); // No wildcards str = " MAC address 95: ad ff 95 24 0d ff =-abc- "; begin_pos = 0; end_pos = 0; - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == true); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, 
end_pos, is_var) == true); REQUIRE("95" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == true); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("ad" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == true); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("ff" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == true); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("95" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == true); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("24" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == true); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("0d" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == true); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("ff" == str.substr(begin_pos, end_pos - begin_pos)); 
REQUIRE(true == is_var); @@ -162,15 +227,7 @@ TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var REQUIRE("-abc-" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == false); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == false); REQUIRE(str.length() == begin_pos); // With wildcards @@ -178,75 +235,856 @@ TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var begin_pos = 0; end_pos = 0; - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == true); - REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "1\\*x"); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); + REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "1"); REQUIRE(is_var == true); - // REQUIRE(is_var == true); - - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == true); + + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "abc*123"); - REQUIRE(is_var == false); - // REQUIRE(is_var == true); - - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == true); + REQUIRE(is_var == true); + + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "1.2"); REQUIRE(is_var == true); - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == true); - REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "+394/-"); + 
REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); + REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "+394"); REQUIRE(is_var == true); - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == true); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "-*abc-"); REQUIRE(is_var == false); - REQUIRE(Grep::get_bounds_of_next_potential_var( - str, - begin_pos, - end_pos, - is_var, - forward_lexer, - reverse_lexer - ) - == false); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == false); +} + +TEST_CASE("SearchString", "[SearchString][schema_search]") { + ByteLexer lexer; + load_lexer_from_file("../tests/test_schema_files/search_schema.txt", false, lexer); + + WildcardExpression const search_string("* test\\* *"); + REQUIRE(search_string.substr(0, search_string.length()) == "* test\\* *"); + for (uint32_t idx = 0; idx < search_string.length(); idx++) { + CAPTURE(idx); + if (idx == 6) { + REQUIRE(search_string.char_is_escape(idx)); + } else { + REQUIRE(false == search_string.char_is_escape(idx)); + } + } + + SECTION("surrounded_by_delims_or_wildcards and starts_or_ends_with_greedy_wildcard") { + auto search_string_view1 = WildcardExpressionView{search_string, 0, search_string.length()}; + REQUIRE(search_string_view1.surrounded_by_delims_or_wildcards(lexer)); + REQUIRE(search_string_view1.starts_or_ends_with_greedy_wildcard()); + auto search_string_view2 = WildcardExpressionView{search_string, 1, search_string.length()}; + REQUIRE(search_string_view2.surrounded_by_delims_or_wildcards(lexer)); + REQUIRE(search_string_view2.starts_or_ends_with_greedy_wildcard()); + auto search_string_view3 + = WildcardExpressionView{search_string, 0, search_string.length() - 1}; + 
REQUIRE(search_string_view3.surrounded_by_delims_or_wildcards(lexer)); + REQUIRE(search_string_view3.starts_or_ends_with_greedy_wildcard()); + auto search_string_view4 + = WildcardExpressionView{search_string, 2, search_string.length() - 2}; + REQUIRE(search_string_view4.surrounded_by_delims_or_wildcards(lexer)); + REQUIRE(false == search_string_view4.starts_or_ends_with_greedy_wildcard()); + auto search_string_view5 + = WildcardExpressionView{search_string, 3, search_string.length() - 3}; + REQUIRE(false == search_string_view5.surrounded_by_delims_or_wildcards(lexer)); + REQUIRE(false == search_string_view5.starts_or_ends_with_greedy_wildcard()); + auto search_string_view6 + = WildcardExpressionView{search_string, 1, search_string.length() - 1}; + REQUIRE(search_string_view6.surrounded_by_delims_or_wildcards(lexer)); + REQUIRE(false == search_string_view6.starts_or_ends_with_greedy_wildcard()); + } + + SECTION("extend_to_adjacent_greedy_wildcards") { + auto search_string_view + = WildcardExpressionView{search_string, 1, search_string.length() - 1}; + REQUIRE(8 == search_string_view.length()); + auto extended_search_string_view = search_string_view.extend_to_adjacent_greedy_wildcards(); + REQUIRE(extended_search_string_view.surrounded_by_delims_or_wildcards(lexer)); + REQUIRE(10 == extended_search_string_view.length()); + REQUIRE(extended_search_string_view.get_value() == "* test\\* *"); + + auto search_string_view2 + = WildcardExpressionView{search_string, 2, search_string.length() - 2}; + REQUIRE(6 == search_string_view2.length()); + auto extended_search_string_view2 + = search_string_view2.extend_to_adjacent_greedy_wildcards(); + REQUIRE(extended_search_string_view2.surrounded_by_delims_or_wildcards(lexer)); + REQUIRE(6 == extended_search_string_view2.length()); + REQUIRE(extended_search_string_view2.get_value() == "test\\*"); + } + + SECTION("getters") { + auto search_string_view = WildcardExpressionView{search_string, 2, search_string.length()}; + 
REQUIRE(false == search_string_view.is_greedy_wildcard()); + REQUIRE(false == search_string_view.is_non_greedy_wildcard()); + REQUIRE('t' == search_string_view.get_char(0)); + REQUIRE(false == search_string_view.char_is_escape(0)); + REQUIRE(false == search_string_view.char_is_greedy_wildcard(0)); + REQUIRE(false == search_string_view.char_is_non_greedy_wildcard(0)); + REQUIRE('\\' == search_string_view.get_char(4)); + REQUIRE(search_string_view.char_is_escape(4)); + REQUIRE(false == search_string_view.char_is_greedy_wildcard(4)); + REQUIRE(false == search_string_view.char_is_non_greedy_wildcard(4)); + REQUIRE('*' == search_string_view.get_char(5)); + REQUIRE(false == search_string_view.char_is_escape(5)); + REQUIRE(false == search_string_view.char_is_greedy_wildcard(5)); + REQUIRE(false == search_string_view.char_is_non_greedy_wildcard(5)); + REQUIRE('*' == search_string_view.get_char(7)); + REQUIRE(false == search_string_view.char_is_escape(7)); + REQUIRE(search_string_view.char_is_greedy_wildcard(7)); + REQUIRE(false == search_string_view.char_is_non_greedy_wildcard(7)); + } + + SECTION("Greedy Wildcard") { + auto search_string_view = WildcardExpressionView{search_string, 0, 1}; + REQUIRE(search_string_view.is_greedy_wildcard()); + REQUIRE(false == search_string_view.is_non_greedy_wildcard()); + } +} + +TEST_CASE("get_matching_variable_types", "[get_matching_variable_types][schema_search]") { + ByteLexer lexer; + load_lexer_from_file("../tests/test_schema_files/search_schema.txt", false, lexer); + + SECTION("Non-wildcard search query") { + constexpr std::string_view cWildcardExprValue("* 10000 reply: *"); + constexpr std::string_view cNumber = "10000"; + constexpr size_t cFirstGreedyWildcardIdx = cWildcardExprValue.find_first_of('*'); + constexpr size_t cLastGreedyWildcardIdx = cWildcardExprValue.find_last_of('*'); + constexpr size_t cECharIdx = cWildcardExprValue.find('e'); + constexpr size_t cNumberBeginIdx = cWildcardExprValue.find(cNumber); + constexpr 
size_t cNumberEndIdx = cNumberBeginIdx + cNumber.length(); + WildcardExpression const wildcard_expr{string{cWildcardExprValue}}; + + // Test all subexpressions of `wildcard_expr` + for (uint32_t end_idx = 1; end_idx <= wildcard_expr.length(); end_idx++) { + for (uint32_t begin_idx = 0; begin_idx < end_idx; begin_idx++) { + auto [variable_types, contains_wildcard] = Grep::get_matching_variable_types( + WildcardExpressionView{wildcard_expr, begin_idx, end_idx}, + lexer + ); + + std::set expected_variable_types; + if ((cFirstGreedyWildcardIdx == begin_idx && cFirstGreedyWildcardIdx + 1 == end_idx) + || (cLastGreedyWildcardIdx == begin_idx && cLastGreedyWildcardIdx + 1 == end_idx + )) + { + // "*" + expected_variable_types + = {lexer.m_symbol_id["timestamp"], + lexer.m_symbol_id["int"], + lexer.m_symbol_id["float"], + lexer.m_symbol_id["hex"], + lexer.m_symbol_id["hasNumber"], + lexer.m_symbol_id["uniqueVariable"], + lexer.m_symbol_id["test"]}; + } else if (cNumberBeginIdx <= begin_idx && end_idx <= cNumberEndIdx) { + // Substrings of "10000" + expected_variable_types + = {lexer.m_symbol_id["int"], lexer.m_symbol_id["hasNumber"]}; + } else if (cECharIdx == begin_idx && cECharIdx + 1 == end_idx) { + // "e" + expected_variable_types = {lexer.m_symbol_id["hex"]}; + } + + bool expected_contains_wildcard = false; + if (cFirstGreedyWildcardIdx == begin_idx || cLastGreedyWildcardIdx + 1 == end_idx) { + expected_contains_wildcard = true; + } + + CAPTURE(wildcard_expr.substr(begin_idx, end_idx - begin_idx)); + CAPTURE(begin_idx); + CAPTURE(end_idx); + REQUIRE(variable_types == expected_variable_types); + REQUIRE(contains_wildcard == expected_contains_wildcard); + } + } + } + + SECTION("Non-greedy wildcard followed by a greedy wildcard") { + constexpr std::string_view cWildcardExprValue("?*"); + + WildcardExpression const wildcard_expr{string{cWildcardExprValue}}; + auto [variable_types, contains_wildcard] = Grep::get_matching_variable_types( + 
WildcardExpressionView{wildcard_expr, 0, wildcard_expr.length()}, + lexer + ); + + set expected_variable_types + = {lexer.m_symbol_id["timestamp"], + lexer.m_symbol_id["int"], + lexer.m_symbol_id["float"], + lexer.m_symbol_id["hex"], + lexer.m_symbol_id["hasNumber"], + lexer.m_symbol_id["uniqueVariable"], + lexer.m_symbol_id["test"]}; + bool expected_contains_wildcard = true; + + REQUIRE(variable_types == expected_variable_types); + REQUIRE(contains_wildcard == expected_contains_wildcard); + } +} + +TEST_CASE( + "get_interpretations_for_whole_wildcard_expr", + "[get_interpretations_for_whole_wildcard_expr][schema_search]" +) { + ByteLexer lexer; + load_lexer_from_file("../tests/test_schema_files/search_schema.txt", false, lexer); + + SECTION("Non-wildcard search query") { + constexpr string_view cWildcardExprValue("* 10000 reply: *"); + constexpr string_view cNumber = "10000"; + constexpr size_t cNumberBeginIdx = cWildcardExprValue.find(cNumber); + constexpr size_t cNumberEndIdx = cNumberBeginIdx + cNumber.length(); + WildcardExpression const wildcard_expr{string{cWildcardExprValue}}; + + for (uint32_t end_idx = 1; end_idx <= wildcard_expr.length(); end_idx++) { + for (uint32_t begin_idx = 0; begin_idx < end_idx; begin_idx++) { + auto interpretations = Grep::get_interpretations_for_whole_wildcard_expr( + WildcardExpressionView{wildcard_expr, begin_idx, end_idx}, + lexer + ); + + vector expected_interpretations(0); + if (cNumberBeginIdx == begin_idx && cNumberEndIdx == end_idx) { + QueryInterpretation expected_interpretation; + expected_interpretation.append_variable_token( + static_cast(lexer.m_symbol_id["int"]), + string{cNumber}, + false, + false + ); + expected_interpretations.emplace_back(expected_interpretation); + } else if ((0 != begin_idx && wildcard_expr.length() != end_idx) + || (end_idx - begin_idx == 1)) + { + QueryInterpretation expected_interpretation; + for (uint32_t idx = begin_idx; idx < end_idx; idx++) { + 
expected_interpretation.append_static_token(wildcard_expr.substr(idx, 1)); + } + expected_interpretations.emplace_back(expected_interpretation); + } + + CAPTURE(begin_idx); + CAPTURE(end_idx); + REQUIRE(interpretations == expected_interpretations); + } + } + } + + SECTION("Non-greedy wildcard followed by a greedy wildcard") { + constexpr string_view cWildcardExprValue(" ?* "); + WildcardExpression const wildcard_expr{string{cWildcardExprValue}}; + + auto interpretations = Grep::get_interpretations_for_whole_wildcard_expr( + WildcardExpressionView{wildcard_expr, 1, 2}, + lexer + ); + vector expected_interpretations(0); + + { + QueryInterpretation expected_interpretation; + expected_interpretation.append_static_token("?"); + expected_interpretations.emplace_back(expected_interpretation); + } + + for (auto const& var_type : {"int", "float"}) { + for (auto const encoded : {true, false}) { + QueryInterpretation expected_interpretation; + expected_interpretation.append_variable_token( + static_cast(lexer.m_symbol_id[var_type]), + string{"?*"}, + true, + encoded + ); + expected_interpretations.emplace_back(expected_interpretation); + } + } + + // Note: all the other non-encodable variable types are ignored because CLP considers them + // to be the same as timestamp (i.e., they're all stored in the dictionary). 
+ for (auto const& var_type : {"timestamp"}) { + QueryInterpretation expected_interpretation; + expected_interpretation.append_variable_token( + static_cast(lexer.m_symbol_id[var_type]), + string{"?*"}, + true, + false + ); + expected_interpretations.emplace_back(expected_interpretation); + } + + std::ostringstream oss; + oss << lexer.m_id_symbol; + CAPTURE(oss.str()); + REQUIRE(interpretations == expected_interpretations); + } +} + +TEST_CASE( + "generate_query_substring_interpretations", + "[generate_query_substring_interpretations][schema_search]" +) { + ByteLexer lexer; + load_lexer_from_file("../tests/test_schema_files/search_schema.txt", false, lexer); + + SECTION("Query with static text") { + ExpectedInterpretation exp_interp(lexer); + + exp_interp.add_string("* z *", "0", "0", "* z *"); + + exp_interp.compare("* z *"); + } + SECTION("Query with a hex value") { + ExpectedInterpretation exp_interp(lexer); + + // "* a *" + exp_interp.add_string("* a *", "0", "0", "* a *"); + // "* (a) *" + exp_interp.add_string("* <{}>(a) *", "000", "000", "* {} *", "hex"); + + exp_interp.compare("* a *"); + } + SECTION("Query with an integer") { + ExpectedInterpretation exp_interp(lexer); + + // "* 10000 reply: *" + exp_interp.add_string("* 10000 reply: *", "0", "0", "* 10000 reply: *"); + // "* (10000) reply: *" + exp_interp + .add_string("* <{}>(10000) reply: *", "000", "000", "* {} reply: *", "int"); + + exp_interp.compare("* 10000 reply: *"); + } + SECTION("Query with a non-greedy wildcard at the start of a variable") { + ExpectedInterpretation exp_interp(lexer); + + // "* ?10000 *" + exp_interp.add_string("* ?10000 *", "0", "0", "* ?10000 *"); + // "* ?(10000) *" + exp_interp.add_string("* ?<{}>(10000) *", "000", "000", "* ?{} *", "int"); + // "* (?10000) *" + // TODO: Add logic to determine this case is impossible. 
+ exp_interp.add_string("* <{}>(?10000) *", "010", "000", "* {} *", "int", true); + exp_interp.add_string("* <{}>(?10000) *", "010", "010", "* {} *", "int", false); + // "* (?10000) *" + exp_interp.add_string("* <{}>(?10000) *", "010", "000", "* {} *", "hasNumber"); + + exp_interp.compare("* ?10000 *"); + } + SECTION("Query with a non-greedy wildcard at the end of a variable") { + ExpectedInterpretation exp_interp(lexer); + + // "* 10000? *" + exp_interp.add_string("* 10000? *", "0", "0", "* 10000? *"); + // "* (10000)? *" + exp_interp.add_string("* <{}>(10000)? *", "000", "000", "* {}? *", "int"); + // "* (10000?) *" + exp_interp.add_string("* <{}>(10000?) *", "010", "000", "* {} *", "int", true); + exp_interp.add_string("* <{}>(10000?) *", "010", "010", "* {} *", "int", false); + // "* (10000?) *" + exp_interp.add_string("* <{}>(10000?) *", "010", "000", "* {} *", "hasNumber"); + + exp_interp.compare("* 10000? *"); + } + SECTION("Query with a non-greedy wildcard in the middle of a variable") { + ExpectedInterpretation exp_interp(lexer); + + // "* 100?00 *" + exp_interp.add_string("* 100?00 *", "0", "0", "* 100?00 *"); + // "* (100?00) *" + exp_interp.add_string("* <{}>(100?00) *", "010", "010", "* {} *", "int", false); + // TODO: add logic to determine this case is impossible + exp_interp.add_string("* <{}>(100?00) *", "010", "000", "* {} *", "int", true); + // "* (100?00) *" + exp_interp.add_string("* <{}>(100?00) *", "010", "010", "* {} *", "float", false); + // TODO: add logic to determine this case is impossible + exp_interp.add_string("* <{}>(100?00) *", "010", "000", "* {} *", "float", true); + // "* (100?00) *" + exp_interp.add_string("* <{}>(100?00) *", "010", "000", "* {} *", "hasNumber"); + // "* (100)?00 *" + // TODO: Add logic to determine this case is impossible. + exp_interp.add_string("* <{}>(100)?00 *", "000", "000", "* {}?00 *", "int"); + // "* 100?(00) *" + // TODO: Add logic to determine this case is impossible. 
+ exp_interp.add_string("* 100?<{}>(00) *", "000", "000", "* 100?{} *", "int", true); + // "* (100)?(00) *" + exp_interp.add_string( + "* <{}>(100)?<{}>(00) *", + "00000", + "00000", + "* {}?{} *", + "int", + "int", + false, + true + ); + + exp_interp.compare("* 100?00 *"); + } + SECTION("Query with a non-greedy wildcard and escaped wildcard") { + ExpectedInterpretation exp_interp(lexer); + + // "* 10\\?000? *" + exp_interp.add_string("* 10\\?000? *", "0", "0", "* 10\\?000? *"); + // "* (10)\\?000? *" + exp_interp.add_string( + "* <{}>(10)\\?000? *", + "000", + "000", + "* {}\\?000? *", + "int", + false + ); + // "* (10)\\?(000)? *" + exp_interp.add_string( + "* <{}>(10)\\?<{}>(000)? *", + "00000", + "00000", + "* {}\\?{}? *", + "int", + "int", + false, + true + ); + // "* (10)\\?(000?) *" + exp_interp.add_string( + "* <{}>(10)\\?<{}>(000?) *", + "00010", + "00010", + "* {}\\?{} *", + "int", + "int", + false, + false + ); + exp_interp.add_string( + "* <{}>(10)\\?<{}>(000?) *", + "00010", + "00000", + "* {}\\?{} *", + "int", + "int", + false, + true + ); + // "* (10)\\?(000?) *" + exp_interp.add_string( + "* <{}>(10)\\?<{}>(000?) *", + "00010", + "00000", + "* {}\\?{} *", + "int", + "hasNumber", + false, + true + ); + // "* 10\\?(000)? *" + exp_interp.add_string( + "* 10\\?<{}>(000)? *", + "000", + "000", + "* 10\\?{}? *", + "int", + true + ); + // "* 10\\?(000?) *" + exp_interp.add_string( + "* 10\\?<{}>(000?) *", + "010", + "000", + "* 10\\?{} *", + "int", + true + ); + exp_interp.add_string( + "* 10\\?<{}>(000?) *", + "010", + "010", + "* 10\\?{} *", + "int", + false + ); + // "* 10\\?(000?) *" + exp_interp.add_string( + "* 10\\?<{}>(000?) *", + "010", + "000", + "* 10\\?{} *", + "hasNumber", + false + ); + + exp_interp.compare("* 10\\?000? 
*"); + } + SECTION("Query with greedy wildcard") { + ExpectedInterpretation exp_interp(lexer); + + // "* *10000 *" + exp_interp.add_string("* *10000 *", "0", "0", "* *10000 *"); + // "*(* *)*10000 *" + exp_interp.add_string( + "*<{}>(* *)*10000 *", + "010", + "000", + "*{}*10000 *", + "timestamp", + false + ); + // "* *(*10000) *" + exp_interp.add_string("* *<{}>(*10000) *", "010", "000", "* *{} *", "int", true); + exp_interp.add_string("* *<{}>(*10000) *", "010", "010", "* *{} *", "int", false); + // "* *(*10000) *" + exp_interp.add_string("* *<{}>(*10000) *", "010", "000", "* *{} *", "float", true); + exp_interp.add_string("* *<{}>(*10000) *", "010", "010", "* *{} *", "float", false); + // "* *(*10000) *" + exp_interp.add_string("* *<{}>(*10000) *", "010", "000", "* *{} *", "hasNumber"); + // "*(* *)*(*10000) *" + exp_interp.add_string( + "*<{}>(* *)*<{}>(*10000) *", + "01010", + "00000", + "*{}*{} *", + "timestamp", + "int", + false, + true + ); + exp_interp.add_string( + "*<{}>(* *)*<{}>(*10000) *", + "01010", + "00010", + "*{}*{} *", + "timestamp", + "int", + false, + false + ); + // "*(* *)*(*10000) *" + exp_interp.add_string( + "*<{}>(* *)*<{}>(*10000) *", + "01010", + "00000", + "*{}*{} *", + "timestamp", + "float", + false, + true + ); + exp_interp.add_string( + "*<{}>(* *)*<{}>(*10000) *", + "01010", + "00010", + "*{}*{} *", + "timestamp", + "float", + false, + false + ); + // "*(* *)*(*10000) *" + exp_interp.add_string( + "*<{}>(* *)*<{}>(*10000) *", + "01010", + "00000", + "*{}*{} *", + "timestamp", + "hasNumber", + false, + false + ); + + exp_interp.compare("* *10000 *"); + } + SECTION("Query with greedy wildcard followed by non-greedy wildcard") { + ExpectedInterpretation exp_interp(lexer); + + // "* *?10000 *" + exp_interp.add_string("* *?10000 *", "0", "0", "* *?10000 *"); + // "*(* *)*?10000 *" + exp_interp.add_string( + "*<{}>(* *)*?10000 *", + "010", + "000", + "*{}*?10000 *", + "timestamp" + ); + // "*(* *)*(*?10000) *" + exp_interp.add_string( 
+ "*<{}>(* *)*<{}>(*?10000) *", + "01010", + "00000", + "*{}*{} *", + "timestamp", + "int", + false, + true + ); + exp_interp.add_string( + "*<{}>(* *)*<{}>(*?10000) *", + "01010", + "00010", + "*{}*{} *", + "timestamp", + "int", + false, + false + ); + // "*(* *)*(*?10000) *" + exp_interp.add_string( + "*<{}>(* *)*<{}>(*?10000) *", + "01010", + "00000", + "*{}*{} *", + "timestamp", + "float", + false, + true + ); + exp_interp.add_string( + "*<{}>(* *)*<{}>(*?10000) *", + "01010", + "00010", + "*{}*{} *", + "timestamp", + "float", + false, + false + ); + // "*(* *)*(*?10000) *" + exp_interp.add_string( + "*<{}>(* *)*<{}>(*?10000) *", + "01010", + "00000", + "*{}*{} *", + "timestamp", + "hasNumber", + false, + false + ); + // "*(* *)*?(10000) *" + exp_interp.add_string( + "*<{}>(* *)*?<{}>(10000) *", + "01000", + "00000", + "*{}*?{} *", + "timestamp", + "int", + false, + false + ); + // "* *(*?10000) *" + exp_interp.add_string("* *<{}>(*?10000) *", "010", "000", "* *{} *", "int", true); + exp_interp.add_string("* *<{}>(*?10000) *", "010", "010", "* *{} *", "int", false); + // "* *(*?10000) *" + exp_interp.add_string("* *<{}>(*?10000) *", "010", "000", "* *{} *", "float", true); + exp_interp + .add_string("* *<{}>(*?10000) *", "010", "010", "* *{} *", "float", false); + // "* *(*?10000) *" + exp_interp.add_string("* *<{}>(*?10000) *", "010", "000", "* *{} *", "hasNumber"); + // "* *?(10000) *" + exp_interp.add_string("* *?<{}>(10000) *", "000", "000", "* *?{} *", "int"); + + exp_interp.compare("* *?10000 *"); + } + SECTION("Query with non-greedy wildcard followed by greedy wildcard") { + ExpectedInterpretation exp_interp(lexer); + + // "* ?*10000 *" + exp_interp.add_string("* ?*10000 *", "0", "0", "* ?*10000 *"); + // "*(* ?*)*10000 *" + exp_interp.add_string( + "*<{}>(* ?*)*10000 *", + "010", + "000", + "*{}*10000 *", + "timestamp" + ); + // "*(* ?*)*(*10000) *" + exp_interp.add_string( + "*<{}>(* ?*)*<{}>(*10000) *", + "01010", + "00000", + "*{}*{} *", + 
"timestamp", + "hasNumber", + false, + false + ); + // "* (?*10000) *" + exp_interp.add_string("* <{}>(?*10000) *", "010", "000", "* {} *", "hasNumber"); + // "* (*10000) *" + exp_interp.add_string("* ?*<{}>(*10000) *", "010", "000", "* ?*{} *", "hasNumber"); + // Note: all the other non-encodable variable types are ignored because CLP considers them + // to be the same as timestamp (i.e., they're all stored in the dictionary). + for (auto type1 : {"timestamp"}) { + // "* (?*)*10000 *" + exp_interp + .add_string("* <{}>(?*)*10000 *", "010", "000", "* {}*10000 *", type1); + for (auto type2 : {"int", "float"}) { + // "* (?*)*(*10000) *" + exp_interp.add_string( + "* <{}>(?*)*<{}>(*10000) *", + "01010", + "00000", + "* {}*{} *", + type1, + type2, + false, + true + ); + exp_interp.add_string( + "* <{}>(?*)*<{}>(*10000) *", + "01010", + "00010", + "* {}*{} *", + type1, + type2, + false, + false + ); + } + // "* (?*)*(*10000) *" + exp_interp.add_string( + "* <{}>(?*)*<{}>(*10000) *", + "01010", + "00000", + "* {}*{} *", + type1, + "hasNumber", + false, + false + ); + } + for (auto type1 : {"int", "float"}) { + // "*(* ?*)*(*10000) *" + exp_interp.add_string( + "*<{}>(* ?*)*<{}>(*10000) *", + "01010", + "00000", + "*{}*{} *", + "timestamp", + type1, + false, + true + ); + exp_interp.add_string( + "*<{}>(* ?*)*<{}>(*10000) *", + "01010", + "00010", + "*{}*{} *", + "timestamp", + type1, + false, + false + ); + // "* ?*(*10000) *" + exp_interp.add_string( + "* ?*<{}>(*10000) *", + "010", + "000", + "* ?*{} *", + type1, + true + ); + exp_interp.add_string( + "* ?*<{}>(*10000) *", + "010", + "010", + "* ?*{} *", + type1, + false + ); + // "* (?*10000) *" + exp_interp.add_string("* <{}>(?*10000) *", "010", "000", "* {} *", type1, true); + exp_interp + .add_string("* <{}>(?*10000) *", "010", "010", "* {} *", type1, false); + // "* (?*)*10000 *" + exp_interp.add_string( + "* <{}>(?*)*10000 *", + "010", + "000", + "* {}*10000 *", + type1, + true + ); + exp_interp.add_string( + "* 
<{}>(?*)*10000 *", + "010", + "010", + "* {}*10000 *", + type1, + false + ); + for (auto type2 : {"int", "float"}) { + // "* (?*)*(*10000) *" + exp_interp.add_string( + "* <{}>(?*)*<{}>(*10000) *", + "01010", + "00000", + "* {}*{} *", + type1, + type2, + true, + true + ); + exp_interp.add_string( + "* <{}>(?*)*<{}>(*10000) *", + "01010", + "00010", + "* {}*{} *", + type1, + type2, + true, + false + ); + exp_interp.add_string( + "* <{}>(?*)*<{}>(*10000) *", + "01010", + "01000", + "* {}*{} *", + type1, + type2, + false, + true + ); + exp_interp.add_string( + "* <{}>(?*)*<{}>(*10000) *", + "01010", + "01010", + "* {}*{} *", + type1, + type2, + false, + false + ); + } + // "* (?*)*(*10000) *" + exp_interp.add_string( + "* <{}>(?*)*<{}>(*10000) *", + "01010", + "00000", + "* {}*{} *", + type1, + "hasNumber", + true, + false + ); + exp_interp.add_string( + "* <{}>(?*)*<{}>(*10000) *", + "01010", + "01000", + "* {}*{} *", + type1, + "hasNumber", + false, + false + ); + } + exp_interp.compare("* ?*10000 *"); + } } diff --git a/components/core/tests/test-ParserWithUserSchema.cpp b/components/core/tests/test-ParserWithUserSchema.cpp index 3689c69e8..d73260d9d 100644 --- a/components/core/tests/test-ParserWithUserSchema.cpp +++ b/components/core/tests/test-ParserWithUserSchema.cpp @@ -29,7 +29,6 @@ using log_surgeon::SchemaVarAST; using log_surgeon::Token; std::unique_ptr generate_schema_ast(std::string const& schema_file) { - SchemaParser schema_parser; std::unique_ptr schema_ast = SchemaParser::try_schema_file(schema_file); REQUIRE(schema_ast.get() != nullptr); return schema_ast;