From 1a17b8575cf1c43b91fefb27f0f06f0046d244a1 Mon Sep 17 00:00:00 2001 From: wraymo <37269683+wraymo@users.noreply.github.com> Date: Tue, 9 Jan 2024 16:44:18 -0500 Subject: [PATCH] Add clp-s for compressing and searching semi-structured logs. (#217) --- .gitmodules | 6 + components/core/.clang-format | 4 +- components/core/.gitignore | 1 + components/core/CMakeLists.txt | 17 + components/core/README.md | 119 +- .../cmake/Modules/ExternalAntlr4Cpp.cmake | 180 +++ components/core/cmake/Modules/FindANTLR.cmake | 139 ++ components/core/src/clp_s/ArchiveReader.cpp | 82 ++ components/core/src/clp_s/ArchiveReader.hpp | 71 + components/core/src/clp_s/ArchiveWriter.cpp | 124 ++ components/core/src/clp_s/ArchiveWriter.hpp | 94 ++ components/core/src/clp_s/CMakeLists.txt | 137 ++ components/core/src/clp_s/ColumnReader.cpp | 177 +++ components/core/src/clp_s/ColumnReader.hpp | 265 ++++ components/core/src/clp_s/ColumnWriter.cpp | 142 ++ components/core/src/clp_s/ColumnWriter.hpp | 232 ++++ .../core/src/clp_s/CommandLineArguments.cpp | 298 +++++ .../core/src/clp_s/CommandLineArguments.hpp | 74 ++ components/core/src/clp_s/Compressor.hpp | 51 + components/core/src/clp_s/Decompressor.hpp | 64 + components/core/src/clp_s/Defs.hpp | 44 + components/core/src/clp_s/DictionaryEntry.cpp | 257 ++++ components/core/src/clp_s/DictionaryEntry.hpp | 290 ++++ .../core/src/clp_s/DictionaryReader.hpp | 210 +++ .../core/src/clp_s/DictionaryWriter.cpp | 67 + .../core/src/clp_s/DictionaryWriter.hpp | 158 +++ components/core/src/clp_s/ErrorCode.hpp | 31 + components/core/src/clp_s/FileReader.cpp | 150 +++ components/core/src/clp_s/FileReader.hpp | 166 +++ components/core/src/clp_s/FileWriter.cpp | 165 +++ components/core/src/clp_s/FileWriter.hpp | 122 ++ components/core/src/clp_s/JsonConstructor.cpp | 72 + components/core/src/clp_s/JsonConstructor.hpp | 59 + .../core/src/clp_s/JsonFileIterator.cpp | 129 ++ .../core/src/clp_s/JsonFileIterator.hpp | 75 ++ components/core/src/clp_s/JsonParser.cpp | 
298 +++++ components/core/src/clp_s/JsonParser.hpp | 101 ++ components/core/src/clp_s/JsonSerializer.hpp | 83 ++ components/core/src/clp_s/ParsedMessage.hpp | 54 + components/core/src/clp_s/ReaderUtils.cpp | 231 ++++ components/core/src/clp_s/ReaderUtils.hpp | 118 ++ components/core/src/clp_s/SchemaMap.cpp | 37 + components/core/src/clp_s/SchemaMap.hpp | 48 + components/core/src/clp_s/SchemaReader.cpp | 288 ++++ components/core/src/clp_s/SchemaReader.hpp | 153 +++ components/core/src/clp_s/SchemaTree.cpp | 25 + components/core/src/clp_s/SchemaTree.hpp | 99 ++ components/core/src/clp_s/SchemaWriter.cpp | 56 + components/core/src/clp_s/SchemaWriter.hpp | 61 + .../src/clp_s/TimestampDictionaryReader.cpp | 91 ++ .../src/clp_s/TimestampDictionaryReader.hpp | 99 ++ .../src/clp_s/TimestampDictionaryWriter.cpp | 146 ++ .../src/clp_s/TimestampDictionaryWriter.hpp | 95 ++ components/core/src/clp_s/TimestampEntry.cpp | 345 +++++ components/core/src/clp_s/TimestampEntry.hpp | 101 ++ .../core/src/clp_s/TimestampPattern.cpp | 1008 ++++++++++++++ .../core/src/clp_s/TimestampPattern.hpp | 166 +++ .../core/src/clp_s/TraceableException.hpp | 49 + components/core/src/clp_s/Utils.cpp | 431 ++++++ components/core/src/clp_s/Utils.hpp | 273 ++++ components/core/src/clp_s/VariableDecoder.cpp | 118 ++ components/core/src/clp_s/VariableDecoder.hpp | 61 + components/core/src/clp_s/VariableEncoder.cpp | 184 +++ components/core/src/clp_s/VariableEncoder.hpp | 71 + components/core/src/clp_s/ZstdCompressor.cpp | 120 ++ components/core/src/clp_s/ZstdCompressor.hpp | 98 ++ .../core/src/clp_s/ZstdDecompressor.cpp | 238 ++++ .../core/src/clp_s/ZstdDecompressor.hpp | 146 ++ components/core/src/clp_s/clp-s.cpp | 125 ++ components/core/src/clp_s/search/AndExpr.cpp | 57 + components/core/src/clp_s/search/AndExpr.hpp | 58 + .../core/src/clp_s/search/BooleanLiteral.cpp | 44 + .../core/src/clp_s/search/BooleanLiteral.hpp | 58 + .../src/clp_s/search/ColumnDescriptor.cpp | 90 ++ 
.../src/clp_s/search/ColumnDescriptor.hpp | 214 +++ .../core/src/clp_s/search/ConstantProp.cpp | 43 + .../core/src/clp_s/search/ConstantProp.hpp | 23 + .../core/src/clp_s/search/ConvertToExists.cpp | 116 ++ .../core/src/clp_s/search/ConvertToExists.hpp | 29 + .../core/src/clp_s/search/DateLiteral.cpp | 92 ++ .../core/src/clp_s/search/DateLiteral.hpp | 65 + .../core/src/clp_s/search/EmptyExpr.cpp | 27 + .../core/src/clp_s/search/EmptyExpr.hpp | 37 + .../clp_s/search/EvaluateTimestampIndex.cpp | 103 ++ .../clp_s/search/EvaluateTimestampIndex.hpp | 31 + .../core/src/clp_s/search/Expression.cpp | 35 + .../core/src/clp_s/search/Expression.hpp | 118 ++ .../core/src/clp_s/search/FilterExpr.cpp | 106 ++ .../core/src/clp_s/search/FilterExpr.hpp | 100 ++ .../core/src/clp_s/search/FilterOperation.hpp | 20 + components/core/src/clp_s/search/Integral.cpp | 96 ++ components/core/src/clp_s/search/Integral.hpp | 84 ++ components/core/src/clp_s/search/Literal.hpp | 115 ++ .../core/src/clp_s/search/NarrowTypes.cpp | 76 ++ .../core/src/clp_s/search/NarrowTypes.hpp | 22 + .../core/src/clp_s/search/NullLiteral.cpp | 32 + .../core/src/clp_s/search/NullLiteral.hpp | 54 + components/core/src/clp_s/search/OrExpr.cpp | 55 + components/core/src/clp_s/search/OrExpr.hpp | 53 + .../core/src/clp_s/search/OrOfAndForm.cpp | 179 +++ .../core/src/clp_s/search/OrOfAndForm.hpp | 66 + components/core/src/clp_s/search/Output.cpp | 1182 +++++++++++++++++ components/core/src/clp_s/search/Output.hpp | 338 +++++ .../core/src/clp_s/search/SchemaMatch.cpp | 452 +++++++ .../core/src/clp_s/search/SchemaMatch.hpp | 172 +++ .../core/src/clp_s/search/SearchUtils.cpp | 87 ++ .../core/src/clp_s/search/SearchUtils.hpp | 48 + .../core/src/clp_s/search/StringLiteral.cpp | 95 ++ .../core/src/clp_s/search/StringLiteral.hpp | 78 ++ .../core/src/clp_s/search/Transformation.hpp | 21 + components/core/src/clp_s/search/Value.hpp | 33 + .../clp_search/EncodedVariableInterpreter.cpp | 75 ++ 
.../clp_search/EncodedVariableInterpreter.hpp | 84 ++ .../core/src/clp_s/search/clp_search/Grep.cpp | 639 +++++++++ .../core/src/clp_s/search/clp_search/Grep.hpp | 54 + .../src/clp_s/search/clp_search/Query.cpp | 150 +++ .../src/clp_s/search/clp_search/Query.hpp | 192 +++ .../core/src/clp_s/search/kql/CMakeLists.txt | 28 + components/core/src/clp_s/search/kql/Kql.g4 | 107 ++ components/core/src/clp_s/search/kql/kql.cpp | 248 ++++ components/core/src/clp_s/search/kql/kql.hpp | 17 + components/core/submodules/abseil-cpp | 1 + components/core/submodules/simdjson | 1 + .../scripts/deps-download/abseil-cpp.json | 10 + .../tools/scripts/deps-download/antlr4.json | 14 + .../scripts/deps-download/download-all.sh | 3 + .../tools/scripts/deps-download/simdjson.json | 11 + .../centos7.4/install-prebuilt-packages.sh | 1 + .../lib_install/macos-12/install-all.sh | 1 + .../ubuntu-focal/install-prebuilt-packages.sh | 1 + .../ubuntu-jammy/install-prebuilt-packages.sh | 1 + docs/core/clp-structured.md | 125 ++ docs/core/clp-unstructured.md | 157 +++ 133 files changed, 16503 insertions(+), 110 deletions(-) create mode 100644 components/core/cmake/Modules/ExternalAntlr4Cpp.cmake create mode 100644 components/core/cmake/Modules/FindANTLR.cmake create mode 100644 components/core/src/clp_s/ArchiveReader.cpp create mode 100644 components/core/src/clp_s/ArchiveReader.hpp create mode 100644 components/core/src/clp_s/ArchiveWriter.cpp create mode 100644 components/core/src/clp_s/ArchiveWriter.hpp create mode 100644 components/core/src/clp_s/CMakeLists.txt create mode 100644 components/core/src/clp_s/ColumnReader.cpp create mode 100644 components/core/src/clp_s/ColumnReader.hpp create mode 100644 components/core/src/clp_s/ColumnWriter.cpp create mode 100644 components/core/src/clp_s/ColumnWriter.hpp create mode 100644 components/core/src/clp_s/CommandLineArguments.cpp create mode 100644 components/core/src/clp_s/CommandLineArguments.hpp create mode 100644 
components/core/src/clp_s/Compressor.hpp create mode 100644 components/core/src/clp_s/Decompressor.hpp create mode 100644 components/core/src/clp_s/Defs.hpp create mode 100644 components/core/src/clp_s/DictionaryEntry.cpp create mode 100644 components/core/src/clp_s/DictionaryEntry.hpp create mode 100644 components/core/src/clp_s/DictionaryReader.hpp create mode 100644 components/core/src/clp_s/DictionaryWriter.cpp create mode 100644 components/core/src/clp_s/DictionaryWriter.hpp create mode 100644 components/core/src/clp_s/ErrorCode.hpp create mode 100644 components/core/src/clp_s/FileReader.cpp create mode 100644 components/core/src/clp_s/FileReader.hpp create mode 100644 components/core/src/clp_s/FileWriter.cpp create mode 100644 components/core/src/clp_s/FileWriter.hpp create mode 100644 components/core/src/clp_s/JsonConstructor.cpp create mode 100644 components/core/src/clp_s/JsonConstructor.hpp create mode 100644 components/core/src/clp_s/JsonFileIterator.cpp create mode 100644 components/core/src/clp_s/JsonFileIterator.hpp create mode 100644 components/core/src/clp_s/JsonParser.cpp create mode 100644 components/core/src/clp_s/JsonParser.hpp create mode 100644 components/core/src/clp_s/JsonSerializer.hpp create mode 100644 components/core/src/clp_s/ParsedMessage.hpp create mode 100644 components/core/src/clp_s/ReaderUtils.cpp create mode 100644 components/core/src/clp_s/ReaderUtils.hpp create mode 100644 components/core/src/clp_s/SchemaMap.cpp create mode 100644 components/core/src/clp_s/SchemaMap.hpp create mode 100644 components/core/src/clp_s/SchemaReader.cpp create mode 100644 components/core/src/clp_s/SchemaReader.hpp create mode 100644 components/core/src/clp_s/SchemaTree.cpp create mode 100644 components/core/src/clp_s/SchemaTree.hpp create mode 100644 components/core/src/clp_s/SchemaWriter.cpp create mode 100644 components/core/src/clp_s/SchemaWriter.hpp create mode 100644 components/core/src/clp_s/TimestampDictionaryReader.cpp create mode 100644 
components/core/src/clp_s/TimestampDictionaryReader.hpp create mode 100644 components/core/src/clp_s/TimestampDictionaryWriter.cpp create mode 100644 components/core/src/clp_s/TimestampDictionaryWriter.hpp create mode 100644 components/core/src/clp_s/TimestampEntry.cpp create mode 100644 components/core/src/clp_s/TimestampEntry.hpp create mode 100644 components/core/src/clp_s/TimestampPattern.cpp create mode 100644 components/core/src/clp_s/TimestampPattern.hpp create mode 100644 components/core/src/clp_s/TraceableException.hpp create mode 100644 components/core/src/clp_s/Utils.cpp create mode 100644 components/core/src/clp_s/Utils.hpp create mode 100644 components/core/src/clp_s/VariableDecoder.cpp create mode 100644 components/core/src/clp_s/VariableDecoder.hpp create mode 100644 components/core/src/clp_s/VariableEncoder.cpp create mode 100644 components/core/src/clp_s/VariableEncoder.hpp create mode 100644 components/core/src/clp_s/ZstdCompressor.cpp create mode 100644 components/core/src/clp_s/ZstdCompressor.hpp create mode 100644 components/core/src/clp_s/ZstdDecompressor.cpp create mode 100644 components/core/src/clp_s/ZstdDecompressor.hpp create mode 100644 components/core/src/clp_s/clp-s.cpp create mode 100644 components/core/src/clp_s/search/AndExpr.cpp create mode 100644 components/core/src/clp_s/search/AndExpr.hpp create mode 100644 components/core/src/clp_s/search/BooleanLiteral.cpp create mode 100644 components/core/src/clp_s/search/BooleanLiteral.hpp create mode 100644 components/core/src/clp_s/search/ColumnDescriptor.cpp create mode 100644 components/core/src/clp_s/search/ColumnDescriptor.hpp create mode 100644 components/core/src/clp_s/search/ConstantProp.cpp create mode 100644 components/core/src/clp_s/search/ConstantProp.hpp create mode 100644 components/core/src/clp_s/search/ConvertToExists.cpp create mode 100644 components/core/src/clp_s/search/ConvertToExists.hpp create mode 100644 components/core/src/clp_s/search/DateLiteral.cpp create mode 
100644 components/core/src/clp_s/search/DateLiteral.hpp create mode 100644 components/core/src/clp_s/search/EmptyExpr.cpp create mode 100644 components/core/src/clp_s/search/EmptyExpr.hpp create mode 100644 components/core/src/clp_s/search/EvaluateTimestampIndex.cpp create mode 100644 components/core/src/clp_s/search/EvaluateTimestampIndex.hpp create mode 100644 components/core/src/clp_s/search/Expression.cpp create mode 100644 components/core/src/clp_s/search/Expression.hpp create mode 100644 components/core/src/clp_s/search/FilterExpr.cpp create mode 100644 components/core/src/clp_s/search/FilterExpr.hpp create mode 100644 components/core/src/clp_s/search/FilterOperation.hpp create mode 100644 components/core/src/clp_s/search/Integral.cpp create mode 100644 components/core/src/clp_s/search/Integral.hpp create mode 100644 components/core/src/clp_s/search/Literal.hpp create mode 100644 components/core/src/clp_s/search/NarrowTypes.cpp create mode 100644 components/core/src/clp_s/search/NarrowTypes.hpp create mode 100644 components/core/src/clp_s/search/NullLiteral.cpp create mode 100644 components/core/src/clp_s/search/NullLiteral.hpp create mode 100644 components/core/src/clp_s/search/OrExpr.cpp create mode 100644 components/core/src/clp_s/search/OrExpr.hpp create mode 100644 components/core/src/clp_s/search/OrOfAndForm.cpp create mode 100644 components/core/src/clp_s/search/OrOfAndForm.hpp create mode 100644 components/core/src/clp_s/search/Output.cpp create mode 100644 components/core/src/clp_s/search/Output.hpp create mode 100644 components/core/src/clp_s/search/SchemaMatch.cpp create mode 100644 components/core/src/clp_s/search/SchemaMatch.hpp create mode 100644 components/core/src/clp_s/search/SearchUtils.cpp create mode 100644 components/core/src/clp_s/search/SearchUtils.hpp create mode 100644 components/core/src/clp_s/search/StringLiteral.cpp create mode 100644 components/core/src/clp_s/search/StringLiteral.hpp create mode 100644 
components/core/src/clp_s/search/Transformation.hpp create mode 100644 components/core/src/clp_s/search/Value.hpp create mode 100644 components/core/src/clp_s/search/clp_search/EncodedVariableInterpreter.cpp create mode 100644 components/core/src/clp_s/search/clp_search/EncodedVariableInterpreter.hpp create mode 100644 components/core/src/clp_s/search/clp_search/Grep.cpp create mode 100644 components/core/src/clp_s/search/clp_search/Grep.hpp create mode 100644 components/core/src/clp_s/search/clp_search/Query.cpp create mode 100644 components/core/src/clp_s/search/clp_search/Query.hpp create mode 100644 components/core/src/clp_s/search/kql/CMakeLists.txt create mode 100644 components/core/src/clp_s/search/kql/Kql.g4 create mode 100644 components/core/src/clp_s/search/kql/kql.cpp create mode 100644 components/core/src/clp_s/search/kql/kql.hpp create mode 160000 components/core/submodules/abseil-cpp create mode 160000 components/core/submodules/simdjson create mode 100644 components/core/tools/scripts/deps-download/abseil-cpp.json create mode 100644 components/core/tools/scripts/deps-download/antlr4.json create mode 100644 components/core/tools/scripts/deps-download/simdjson.json create mode 100644 docs/core/clp-structured.md create mode 100644 docs/core/clp-unstructured.md diff --git a/.gitmodules b/.gitmodules index 4b3b13551..614f0871e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -17,3 +17,9 @@ [submodule "components/core/submodules/boost-outcome"] path = components/core/submodules/boost-outcome url = https://github.com/boostorg/outcome.git +[submodule "components/core/submodules/simdjson"] + path = components/core/submodules/simdjson + url = https://github.com/simdjson/simdjson.git +[submodule "components/core/submodules/abseil-cpp"] + path = components/core/submodules/abseil-cpp + url = https://github.com/abseil/abseil-cpp.git diff --git a/components/core/.clang-format b/components/core/.clang-format index fed2096cb..fbaf8f62e 100644 --- 
a/components/core/.clang-format +++ b/components/core/.clang-format @@ -72,8 +72,8 @@ IncludeCategories: # NOTE: A header is grouped by first matching regex # Library headers. Update when adding new libraries. # NOTE: clang-format retains leading white-space on a line in violation of the YAML spec. - - Regex: "^<(archive|boost|catch2|date|fmt|json|log_surgeon|mariadb|spdlog|sqlite3|string_utils\ -|yaml-cpp|zstd)" + - Regex: "^<(absl|antlr4|archive|boost|catch2|date|fmt|json|log_surgeon|mariadb|simdjson|spdlog\ +|sqlite3|string_utils|yaml-cpp|zstd)" Priority: 3 # C system headers - Regex: "^<.+\\.h>" diff --git a/components/core/.gitignore b/components/core/.gitignore index a47a8cbd7..18670ec68 100644 --- a/components/core/.gitignore +++ b/components/core/.gitignore @@ -1,2 +1,3 @@ build/** submodules/sqlite3/* +third-party/** diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 35ebc84ac..9007f9328 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -75,6 +75,15 @@ elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") endif () endif () +# Find and setup ANTLR Library +# We build and link to the static library +find_package(ANTLR REQUIRED) +if (ANTLR_FOUND) + message(STATUS "Found ANTLR ${ANTLR_VERSION}") +else() + message(FATAL_ERROR "Could not find libraries for ANTLR ${ANTLR4_TAG}") +endif() + # Find and setup Boost Library if(CLP_USE_STATIC_LIBS) set(Boost_USE_STATIC_LIBS ON) @@ -142,6 +151,13 @@ else() message(FATAL_ERROR "Could not find msgpack-cxx") endif() +# Add abseil-cpp +set(ABSL_PROPAGATE_CXX_STD ON) +add_subdirectory(submodules/abseil-cpp EXCLUDE_FROM_ALL) + +# Add simdjson +add_subdirectory(submodules/simdjson EXCLUDE_FROM_ALL) + # Add yaml-cpp add_subdirectory(submodules/yaml-cpp EXCLUDE_FROM_ALL) @@ -167,6 +183,7 @@ add_subdirectory(src/clp/clg) add_subdirectory(src/clp/clo) add_subdirectory(src/clp/clp) add_subdirectory(src/clp/make_dictionaries_readable) +add_subdirectory(src/clp_s) 
set(SOURCE_FILES_unitTest src/clp/BufferedFileReader.cpp diff --git a/components/core/README.md b/components/core/README.md index cd2721d0e..b8d283e4f 100644 --- a/components/core/README.md +++ b/components/core/README.md @@ -12,10 +12,7 @@ CLP core is the low-level component that performs compression, decompression, an * [Docker Environment](#docker-environment) * [Build](#build) * [Running](#running) - * [`clp`](#clp) - * [`clg`](#clg) - * [`make-dictionaries-readable`](#make-dictionaries-readable) -* [Parallel Compression](#parallel-compression) + ## Requirements @@ -36,10 +33,13 @@ tools/scripts/deps-download/download-all.sh ``` This will download: +* [abseil-cpp](https://github.com/abseil/abseil-cpp) (20230802.1) +* [ANTLR](https://www.antlr.org) (v4.13.1) * [Catch2](https://github.com/catchorg/Catch2.git) (v2.13.7) * [date](https://github.com/HowardHinnant/date.git) (v3.0.1) * [json](https://github.com/nlohmann/json.git) (v3.10.4) * [log-surgeon](https://github.com/y-scope/log-surgeon) (895f464) +* [simdjson](https://github.com/simdjson/simdjson) (v3.6.3) * [SQLite3](https://www.sqlite.org/download.html) (v3.36.0) * [yaml-cpp](https://github.com/jbeder/yaml-cpp.git) (v0.7.0) @@ -98,108 +98,11 @@ the relevant paths on your machine. ## Running -* CLP contains two core executables: `clp` and `clg` - * `clp` is used for compressing and extracting logs - * `clg` is used for performing wildcard searches on the compressed logs - -### `clp` - -To compress some logs without a schema file: -```shell -./clp c archives-dir /home/my/logs -``` -* `archives-dir` is where compressed logs should be output - * `clp` will create a number of files and directories within, so it's best if this directory is empty - * You can use the same directory repeatedly and `clp` will add to the compressed logs within. 
-* `/home/my/logs` is any log file or directory containing log files -* In this mode, `clp` will use heuristics to determine what are the variables in - each uncompressed message. - * The heuristics roughly correspond to the example schema file in - `config/schemas.txt`. - -To compress with a user-defined schema file: -```shell -./clp c --schema-path path-to-schema-file archives-dir /home/my/logs -``` -* `path-to-schema-file` is the location of a schema file. For more details on - schema files, see README-Schema.md. +* CLP contains three core executables: `clp`, `clg`, and `clp-s`. + * `clp` is used for compressing and extracting unstructured (plain text) logs. + * `clg` is used for performing wildcard searches on the compressed unstructured logs. + * `clp-s` is used for compressing and searching semi-structured logs (e.g., JSON) with support for + handling highly dynamic schemas. -To decompress those logs: -```shell -./clp x archive-dir decompressed -``` -* `archives-dir` is where the compressed logs were previously stored -* `decompressed` is a directory where they will be decompressed to - -You can also decompress a specific file: -```shell -./clp x archive-dir decompressed /my/file/path.log -``` -* `/my/file/path.log` is the uncompressed file's path (the one that was passed to `clp` for compression) - -More usage instructions can be found by running: -```shell -./clp --help -``` - -### `clg` - -To search the compressed logs: -```shell -./clg archives-dir " a *wildcard* search phrase " -``` -* `archives-dir` is where the compressed logs were previously stored -* For archives compressed without a schema file: - * The search phrase can contain the `*` wildcard which matches 0 or more - characters, or the `?` wildcard which matches any single character. -* For archives compressed using a schema file: - * `*` may only represent non-delimiter characters. 
- -Similar to `clp`, `clg` can search a single file: -```shell -./clg archives-dir " a *wildcard* search phrase " /my/file/path.log -``` -* `/my/file/path.log` is the uncompressed file's path (the one that was passed to `clp` for compression) - -More usage instructions can be found by running: -```shell -./clg --help -``` - -### `make-dictionaries-readable` - -If you'd like to convert the dictionaries of an individual archive into a human-readable form, you -can use `make-dictionaries-readable`. - -```shell -./make-dictionaries-readable archive-path -``` -* `archive-path` is a path to a specific archive (inside `archives-dir`) - -See the `make-dictionaries-readable` [README](src/clp/make_dictionaries_readable/README.md) for -details on the output format. - -## Parallel Compression - -By default, `clp` uses an embedded SQLite database, so each directory containing archives can only -be accessed by a single `clp` instance. - -To enable parallel compression to the same archives directory, `clp`/`clg` can be configured to -use a MySQL-type database (MariaDB) as follows: - -* Install and configure MariaDB using the instructions for your platform -* Create a user that has privileges to create databases, create tables, insert records, and delete - records. -* Copy and change `config/metadata-db.yml`, setting the type to `mysql` and uncommenting the MySQL - parameters. -* Install the MariaDB and PyYAML Python packages `pip3 install mariadb PyYAML` - * This is necessary to run the database initialization script. If you prefer, you can run the - SQL statements in `tools/scripts/db/init-db.py` directly. -* Run `tools/scripts/db/init-db.py` with the updated config file. This will initialize the - database CLP requires. -* Run `clp` or `clg` as before, with the addition of the `--db-config-file` option pointing at - the updated config file. -* To compress in parallel, simply run another instance of `clp` concurrently. 
- -Note that currently, decompression (`clp x`) and search (`clg`) can only be run with a single -instance. We are in the process of open-sourcing parallelized versions of these as well. +See [Using CLP for unstructured logs](../../docs/core/clp-unstructured.md) and +[Using CLP for semi-structured logs](../../docs/core/clp-structured.md) for usage instructions. diff --git a/components/core/cmake/Modules/ExternalAntlr4Cpp.cmake b/components/core/cmake/Modules/ExternalAntlr4Cpp.cmake new file mode 100644 index 000000000..9c12ee3f3 --- /dev/null +++ b/components/core/cmake/Modules/ExternalAntlr4Cpp.cmake @@ -0,0 +1,180 @@ +# NOTE: ExternalAntlr4Cpp.cmake taken from +# https://github.com/antlr/antlr4/blob/4.13.1/runtime/Cpp/cmake/ExternalAntlr4Cpp.cmake + +cmake_minimum_required(VERSION 3.7) + +if(POLICY CMP0114) + cmake_policy(SET CMP0114 NEW) +endif() + +include(ExternalProject) + +set(ANTLR4_ROOT ${CMAKE_CURRENT_BINARY_DIR}/antlr4_runtime/src/antlr4_runtime) +set(ANTLR4_INCLUDE_DIRS ${ANTLR4_ROOT}/runtime/Cpp/runtime/src) +set(ANTLR4_GIT_REPOSITORY https://github.com/antlr/antlr4.git) +if(NOT DEFINED ANTLR4_TAG) + # Set to branch name to keep library updated at the cost of needing to rebuild after 'clean' + # Set to commit hash to keep the build stable and does not need to rebuild after 'clean' + set(ANTLR4_TAG master) +endif() + +# Ensure that the include dir already exists at configure time (to avoid cmake erroring +# on non-existent include dirs) +file(MAKE_DIRECTORY "${ANTLR4_INCLUDE_DIRS}") + +if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*") + set(ANTLR4_OUTPUT_DIR ${ANTLR4_ROOT}/runtime/Cpp/dist/$(Configuration)) +elseif(${CMAKE_GENERATOR} MATCHES "Xcode.*") + set(ANTLR4_OUTPUT_DIR ${ANTLR4_ROOT}/runtime/Cpp/dist/$(CONFIGURATION)) +else() + set(ANTLR4_OUTPUT_DIR ${ANTLR4_ROOT}/runtime/Cpp/dist) +endif() + +if(MSVC) + set(ANTLR4_STATIC_LIBRARIES + ${ANTLR4_OUTPUT_DIR}/antlr4-runtime-static.lib) + set(ANTLR4_SHARED_LIBRARIES + 
${ANTLR4_OUTPUT_DIR}/antlr4-runtime.lib) + set(ANTLR4_RUNTIME_LIBRARIES + ${ANTLR4_OUTPUT_DIR}/antlr4-runtime.dll) +else() + set(ANTLR4_STATIC_LIBRARIES + ${ANTLR4_OUTPUT_DIR}/libantlr4-runtime.a) + if(MINGW) + set(ANTLR4_SHARED_LIBRARIES + ${ANTLR4_OUTPUT_DIR}/libantlr4-runtime.dll.a) + set(ANTLR4_RUNTIME_LIBRARIES + ${ANTLR4_OUTPUT_DIR}/libantlr4-runtime.dll) + elseif(CYGWIN) + set(ANTLR4_SHARED_LIBRARIES + ${ANTLR4_OUTPUT_DIR}/libantlr4-runtime.dll.a) + set(ANTLR4_RUNTIME_LIBRARIES + ${ANTLR4_OUTPUT_DIR}/cygantlr4-runtime-${ANTLR4_TAG}.dll) + elseif(APPLE) + set(ANTLR4_RUNTIME_LIBRARIES + ${ANTLR4_OUTPUT_DIR}/libantlr4-runtime.dylib) + else() + set(ANTLR4_RUNTIME_LIBRARIES + ${ANTLR4_OUTPUT_DIR}/libantlr4-runtime.so) + endif() +endif() + +if(${CMAKE_GENERATOR} MATCHES ".* Makefiles") + # This avoids + # 'warning: jobserver unavailable: using -j1. Add '+' to parent make rule.' + set(ANTLR4_BUILD_COMMAND $(MAKE)) +elseif(${CMAKE_GENERATOR} MATCHES "Visual Studio.*") + set(ANTLR4_BUILD_COMMAND + ${CMAKE_COMMAND} + --build . + --config $(Configuration) + --target) +elseif(${CMAKE_GENERATOR} MATCHES "Xcode.*") + set(ANTLR4_BUILD_COMMAND + ${CMAKE_COMMAND} + --build . + --config $(CONFIGURATION) + --target) +else() + set(ANTLR4_BUILD_COMMAND + ${CMAKE_COMMAND} + --build . 
+ --target) +endif() + +if(NOT DEFINED ANTLR4_WITH_STATIC_CRT) + set(ANTLR4_WITH_STATIC_CRT ON) +endif() + +if(ANTLR4_ZIP_REPOSITORY) + ExternalProject_Add( + antlr4_runtime + PREFIX antlr4_runtime + URL ${ANTLR4_ZIP_REPOSITORY} + DOWNLOAD_DIR ${CMAKE_CURRENT_BINARY_DIR} + BUILD_COMMAND "" + BUILD_IN_SOURCE 1 + SOURCE_DIR ${ANTLR4_ROOT} + SOURCE_SUBDIR runtime/Cpp + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + -DWITH_STATIC_CRT:BOOL=${ANTLR4_WITH_STATIC_CRT} + -DDISABLE_WARNINGS:BOOL=ON + # -DCMAKE_CXX_STANDARD:STRING=17 # if desired, compile the runtime with a different C++ standard + # -DCMAKE_CXX_STANDARD:STRING=${CMAKE_CXX_STANDARD} # alternatively, compile the runtime with the same C++ standard as the outer project + INSTALL_COMMAND "" + EXCLUDE_FROM_ALL 1) +else() + ExternalProject_Add( + antlr4_runtime + PREFIX antlr4_runtime + GIT_REPOSITORY ${ANTLR4_GIT_REPOSITORY} + GIT_TAG ${ANTLR4_TAG} + DOWNLOAD_DIR ${CMAKE_CURRENT_BINARY_DIR} + BUILD_COMMAND "" + BUILD_IN_SOURCE 1 + SOURCE_DIR ${ANTLR4_ROOT} + SOURCE_SUBDIR runtime/Cpp + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + -DWITH_STATIC_CRT:BOOL=${ANTLR4_WITH_STATIC_CRT} + -DDISABLE_WARNINGS:BOOL=ON + # -DCMAKE_CXX_STANDARD:STRING=17 # if desired, compile the runtime with a different C++ standard + # -DCMAKE_CXX_STANDARD:STRING=${CMAKE_CXX_STANDARD} # alternatively, compile the runtime with the same C++ standard as the outer project + INSTALL_COMMAND "" + EXCLUDE_FROM_ALL 1) +endif() + +# Separate build step as rarely people want both +set(ANTLR4_BUILD_DIR ${ANTLR4_ROOT}) +if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.14.0") + # CMake 3.14 builds in above's SOURCE_SUBDIR when BUILD_IN_SOURCE is true + set(ANTLR4_BUILD_DIR ${ANTLR4_ROOT}/runtime/Cpp) +endif() + +ExternalProject_Add_Step( + antlr4_runtime + build_static + COMMAND ${ANTLR4_BUILD_COMMAND} antlr4_static + # Depend on target instead of step (a custom command) + # to avoid running dependent steps 
concurrently + DEPENDS antlr4_runtime + BYPRODUCTS ${ANTLR4_STATIC_LIBRARIES} + EXCLUDE_FROM_MAIN 1 + WORKING_DIRECTORY ${ANTLR4_BUILD_DIR}) +ExternalProject_Add_StepTargets(antlr4_runtime build_static) + +add_library(antlr4_static STATIC IMPORTED) +add_dependencies(antlr4_static antlr4_runtime-build_static) +set_target_properties(antlr4_static PROPERTIES + IMPORTED_LOCATION ${ANTLR4_STATIC_LIBRARIES}) +target_include_directories(antlr4_static + INTERFACE + ${ANTLR4_INCLUDE_DIRS} +) + +ExternalProject_Add_Step( + antlr4_runtime + build_shared + COMMAND ${ANTLR4_BUILD_COMMAND} antlr4_shared + # Depend on target instead of step (a custom command) + # to avoid running dependent steps concurrently + DEPENDS antlr4_runtime + BYPRODUCTS ${ANTLR4_SHARED_LIBRARIES} ${ANTLR4_RUNTIME_LIBRARIES} + EXCLUDE_FROM_MAIN 1 + WORKING_DIRECTORY ${ANTLR4_BUILD_DIR}) +ExternalProject_Add_StepTargets(antlr4_runtime build_shared) + +add_library(antlr4_shared SHARED IMPORTED) +add_dependencies(antlr4_shared antlr4_runtime-build_shared) +set_target_properties(antlr4_shared PROPERTIES + IMPORTED_LOCATION ${ANTLR4_RUNTIME_LIBRARIES}) +target_include_directories(antlr4_shared + INTERFACE + ${ANTLR4_INCLUDE_DIRS} +) + +if(ANTLR4_SHARED_LIBRARIES) + set_target_properties(antlr4_shared PROPERTIES + IMPORTED_IMPLIB ${ANTLR4_SHARED_LIBRARIES}) +endif() diff --git a/components/core/cmake/Modules/FindANTLR.cmake b/components/core/cmake/Modules/FindANTLR.cmake new file mode 100644 index 000000000..d191ba071 --- /dev/null +++ b/components/core/cmake/Modules/FindANTLR.cmake @@ -0,0 +1,139 @@ +# NOTE: FindANTLR.cmake taken from +# https://github.com/antlr/antlr4/blob/4.13.1/runtime/Cpp/cmake/FindANTLR.cmake + +# TODO: Clean up ANTLR cmake files +# On macOS, the way Java is installed with brew doesn't also make it the default version of Java on +# the system. So we set JAVA_HOME to the install location here. 
+if (APPLE) + set(ENV{JAVA_HOME} "/usr/local/opt/openjdk@11/") +endif () + +set(ANTLR4_TAG 4.13.1) +add_definitions(-DANTLR4CPP_STATIC) +set(ANTLR_EXECUTABLE ${PROJECT_SOURCE_DIR}/third-party/antlr/antlr-${ANTLR4_TAG}-complete.jar) +include(ExternalAntlr4Cpp) + +find_package(Java 11 REQUIRED COMPONENTS Runtime) + +if(NOT ANTLR_EXECUTABLE) + find_program(ANTLR_EXECUTABLE + NAMES antlr.jar antlr4.jar antlr-4.jar antlr-${ANTLR4_TAG}-complete.jar) +endif() + +if(ANTLR_EXECUTABLE AND Java_JAVA_EXECUTABLE) + execute_process( + COMMAND ${Java_JAVA_EXECUTABLE} -jar ${ANTLR_EXECUTABLE} + OUTPUT_VARIABLE ANTLR_COMMAND_OUTPUT + ERROR_VARIABLE ANTLR_COMMAND_ERROR + RESULT_VARIABLE ANTLR_COMMAND_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(ANTLR_COMMAND_RESULT EQUAL 0) + string(REGEX MATCH "Version [0-9]+(\\.[0-9]+)*" ANTLR_VERSION ${ANTLR_COMMAND_OUTPUT}) + string(REPLACE "Version " "" ANTLR_VERSION ${ANTLR_VERSION}) + else() + message( + SEND_ERROR + "Command '${Java_JAVA_EXECUTABLE} -jar ${ANTLR_EXECUTABLE}' " + "failed with the output '${ANTLR_COMMAND_ERROR}'") + endif() + + macro(ANTLR_TARGET Name InputFile) + set(ANTLR_OPTIONS LEXER PARSER LISTENER VISITOR) + set(ANTLR_ONE_VALUE_ARGS PACKAGE OUTPUT_DIRECTORY DEPENDS_ANTLR) + set(ANTLR_MULTI_VALUE_ARGS COMPILE_FLAGS DEPENDS) + cmake_parse_arguments(ANTLR_TARGET + "${ANTLR_OPTIONS}" + "${ANTLR_ONE_VALUE_ARGS}" + "${ANTLR_MULTI_VALUE_ARGS}" + ${ARGN}) + + set(ANTLR_${Name}_INPUT ${InputFile}) + + get_filename_component(ANTLR_INPUT ${InputFile} NAME_WE) + + if(ANTLR_TARGET_OUTPUT_DIRECTORY) + set(ANTLR_${Name}_OUTPUT_DIR ${ANTLR_TARGET_OUTPUT_DIRECTORY}) + else() + set(ANTLR_${Name}_OUTPUT_DIR + ${CMAKE_CURRENT_BINARY_DIR}/antlr4cpp_generated_src/${ANTLR_INPUT}) + endif() + + unset(ANTLR_${Name}_CXX_OUTPUTS) + + if((ANTLR_TARGET_LEXER AND NOT ANTLR_TARGET_PARSER) OR + (ANTLR_TARGET_PARSER AND NOT ANTLR_TARGET_LEXER)) + list(APPEND ANTLR_${Name}_CXX_OUTPUTS + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}.h + 
${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}.cpp) + set(ANTLR_${Name}_OUTPUTS + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}.interp + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}.tokens) + else() + list(APPEND ANTLR_${Name}_CXX_OUTPUTS + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}Lexer.h + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}Lexer.cpp + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}Parser.h + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}Parser.cpp) + list(APPEND ANTLR_${Name}_OUTPUTS + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}Lexer.interp + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}Lexer.tokens) + endif() + + if(ANTLR_TARGET_LISTENER) + list(APPEND ANTLR_${Name}_CXX_OUTPUTS + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}BaseListener.h + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}BaseListener.cpp + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}Listener.h + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}Listener.cpp) + list(APPEND ANTLR_TARGET_COMPILE_FLAGS -listener) + endif() + + if(ANTLR_TARGET_VISITOR) + list(APPEND ANTLR_${Name}_CXX_OUTPUTS + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}BaseVisitor.h + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}BaseVisitor.cpp + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}Visitor.h + ${ANTLR_${Name}_OUTPUT_DIR}/${ANTLR_INPUT}Visitor.cpp) + list(APPEND ANTLR_TARGET_COMPILE_FLAGS -visitor) + endif() + + if(ANTLR_TARGET_PACKAGE) + list(APPEND ANTLR_TARGET_COMPILE_FLAGS -package ${ANTLR_TARGET_PACKAGE}) + endif() + + list(APPEND ANTLR_${Name}_OUTPUTS ${ANTLR_${Name}_CXX_OUTPUTS}) + + if(ANTLR_TARGET_DEPENDS_ANTLR) + if(ANTLR_${ANTLR_TARGET_DEPENDS_ANTLR}_INPUT) + list(APPEND ANTLR_TARGET_DEPENDS + ${ANTLR_${ANTLR_TARGET_DEPENDS_ANTLR}_INPUT}) + list(APPEND ANTLR_TARGET_DEPENDS + ${ANTLR_${ANTLR_TARGET_DEPENDS_ANTLR}_OUTPUTS}) + else() + message(SEND_ERROR + "ANTLR target '${ANTLR_TARGET_DEPENDS_ANTLR}' not found") + endif() + endif() + + add_custom_command( + OUTPUT ${ANTLR_${Name}_OUTPUTS} + COMMAND ${Java_JAVA_EXECUTABLE} -jar ${ANTLR_EXECUTABLE} + 
${InputFile} + -o ${ANTLR_${Name}_OUTPUT_DIR} + -no-listener + -Dlanguage=Cpp + ${ANTLR_TARGET_COMPILE_FLAGS} + DEPENDS ${InputFile} + ${ANTLR_TARGET_DEPENDS} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMENT "Building ${Name} with ANTLR ${ANTLR_VERSION}") + endmacro(ANTLR_TARGET) + +endif(ANTLR_EXECUTABLE AND Java_JAVA_EXECUTABLE) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + ANTLR + REQUIRED_VARS ANTLR_EXECUTABLE Java_JAVA_EXECUTABLE + VERSION_VAR ANTLR_VERSION) diff --git a/components/core/src/clp_s/ArchiveReader.cpp b/components/core/src/clp_s/ArchiveReader.cpp new file mode 100644 index 000000000..c716969a4 --- /dev/null +++ b/components/core/src/clp_s/ArchiveReader.cpp @@ -0,0 +1,82 @@ +#include "ArchiveReader.hpp" + +#include "ReaderUtils.hpp" + +namespace clp_s { +void ArchiveReader::open(ArchiveReaderOption& option) { + // Open dictionary readers + m_archive_path = option.archive_path; + + m_var_dict = ReaderUtils::get_variable_dictionary_reader(m_archive_path); + m_log_dict = ReaderUtils::get_log_type_dictionary_reader(m_archive_path); + m_array_dict = ReaderUtils::get_array_dictionary_reader(m_archive_path); + + m_var_dict->read_new_entries(); + m_log_dict->read_new_entries(); + m_array_dict->read_new_entries(); + + std::string encoded_messages_dir = m_archive_path + "/encoded_messages"; + if (false == boost::filesystem::exists(encoded_messages_dir)) { + throw OperationFailed(ErrorCodeErrno, __FILENAME__, __LINE__); + } + + std::set schema_ids; + boost::filesystem::directory_iterator iter(encoded_messages_dir); + boost::filesystem::directory_iterator end; + + // Get all schema ids + for (; iter != end; ++iter) { + if (boost::filesystem::is_regular_file(iter->path())) { + std::string path = iter->path().rbegin()->string(); + if (false == path.empty() && std::all_of(path.begin(), path.end(), ::isdigit)) { + schema_ids.insert(std::stoi(path)); + } + } + } + + if (schema_ids.empty()) { + throw 
OperationFailed(ErrorCodeFileNotFound, __FILENAME__, __LINE__); + } + + // Open schema readers and load encoded messages + for (int32_t schema_id : schema_ids) { + auto& schema = m_id_to_schema[schema_id]; + auto schema_reader = new SchemaReader(m_schema_tree, schema_id); + schema_reader->open(encoded_messages_dir + "/" + std::to_string(schema_id)); + + ReaderUtils::append_reader_columns( + schema_reader, + schema, + m_schema_tree, + m_var_dict, + m_log_dict, + m_array_dict, + m_timestamp_dict + ); + + schema_reader->load(); + m_schema_id_to_reader[schema_id] = schema_reader; + } +} + +void ArchiveReader::store(FileWriter& writer) { + std::string message; + for (auto& i : m_schema_id_to_reader) { + while (i.second->get_next_message(message)) { + writer.write(message.c_str(), message.length()); + } + } +} + +void ArchiveReader::close() { + // Close every dictionary reader acquired in open(); the array dictionary was previously + // left open + m_var_dict->close(); + m_log_dict->close(); + m_array_dict->close(); + + // Schema readers are held through raw pointers, so close and free each one before + // clearing the map + for (auto& i : m_schema_id_to_reader) { + i.second->close(); + delete i.second; + } + + m_schema_id_to_reader.clear(); +} +} // namespace clp_s diff --git a/components/core/src/clp_s/ArchiveReader.hpp b/components/core/src/clp_s/ArchiveReader.hpp new file mode 100644 index 000000000..7df8a01ef --- /dev/null +++ b/components/core/src/clp_s/ArchiveReader.hpp @@ -0,0 +1,71 @@ +#ifndef CLP_S_ARCHIVEREADER_HPP +#define CLP_S_ARCHIVEREADER_HPP + +#include +#include +#include + +#include + +#include "DictionaryReader.hpp" +#include "SchemaReader.hpp" +#include "TimestampDictionaryReader.hpp" + +namespace clp_s { +struct ArchiveReaderOption { + std::string archive_path; + std::map> id_to_schema; +}; + +class ArchiveReader { +public: + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; + + // Constructor + ArchiveReader( + std::shared_ptr schema_tree, + std::map> id_to_schema, + std::shared_ptr timestamp_dict + ) + 
: m_schema_tree(std::move(schema_tree)), + m_id_to_schema(std::move(id_to_schema)), + m_timestamp_dict(std::move(timestamp_dict)) {} + + /** + * Opens an archive for reading. + * @param option + */ + void open(ArchiveReaderOption& option); + + /** + * Writes decoded messages to a file. + * @param writer + */ + void store(FileWriter& writer); + + /** + * Closes the archive. + */ + void close(); + +private: + std::string m_archive_path; + + std::shared_ptr m_var_dict; + std::shared_ptr m_log_dict; + std::shared_ptr m_array_dict; + + std::shared_ptr m_schema_tree; + std::map> m_id_to_schema; + std::map m_schema_id_to_reader; + + std::shared_ptr m_timestamp_dict; +}; +} // namespace clp_s + +#endif // CLP_S_ARCHIVEREADER_HPP diff --git a/components/core/src/clp_s/ArchiveWriter.cpp b/components/core/src/clp_s/ArchiveWriter.cpp new file mode 100644 index 000000000..52eee8a0d --- /dev/null +++ b/components/core/src/clp_s/ArchiveWriter.cpp @@ -0,0 +1,124 @@ +#include "ArchiveWriter.hpp" + +#include "SchemaTree.hpp" + +namespace clp_s { +void ArchiveWriter::open(ArchiveWriterOption const& option) { + m_id = option.id; + m_compression_level = option.compression_level; + auto archive_path + = boost::filesystem::path(option.archives_dir) / boost::uuids::to_string(m_id); + + boost::system::error_code boost_error_code; + bool path_exists = boost::filesystem::exists(archive_path, boost_error_code); + if (path_exists) { + SPDLOG_ERROR("Archive path already exists: {}", archive_path.c_str()); + throw OperationFailed(ErrorCodeUnsupported, __FILENAME__, __LINE__); + } + + m_archive_path = archive_path.string(); + if (false == boost::filesystem::create_directory(m_archive_path)) { + throw OperationFailed(ErrorCodeErrno, __FILENAME__, __LINE__); + } + + m_encoded_messages_dir = m_archive_path + "/encoded_messages"; + if (false == boost::filesystem::create_directory(m_encoded_messages_dir)) { + throw OperationFailed(ErrorCodeErrno, __FILENAME__, __LINE__); + } + + std::string 
var_dict_path = m_archive_path + "/var.dict"; + m_var_dict = std::make_shared(); + m_var_dict->open(var_dict_path, m_compression_level, UINT64_MAX); + + std::string log_dict_path = m_archive_path + "/log.dict"; + m_log_dict = std::make_shared(); + m_log_dict->open(log_dict_path, m_compression_level, UINT64_MAX); + + std::string array_dict_path = m_archive_path + "/array.dict"; + m_array_dict = std::make_shared(); + m_array_dict->open(array_dict_path, m_compression_level, UINT64_MAX); + + std::string timestamp_local_dict_path = m_archive_path + "/timestamp.dict"; + m_timestamp_dict->open_local(timestamp_local_dict_path, m_compression_level); +} + +void ArchiveWriter::close() { + m_var_dict->close(); + m_log_dict->close(); + m_array_dict->close(); + m_timestamp_dict->close_local(); + + for (auto& i : m_schema_id_to_writer) { + i.second->store(); + i.second->close(); + delete i.second; + } + + m_schema_id_to_writer.clear(); + m_encoded_message_size = 0UL; +} + +void ArchiveWriter::append_message( + int32_t schema_id, + std::set& schema, + ParsedMessage& message +) { + SchemaWriter* schema_writer; + auto it = m_schema_id_to_writer.find(schema_id); + if (it != m_schema_id_to_writer.end()) { + schema_writer = it->second; + } else { + schema_writer = new SchemaWriter(); + schema_writer->open( + m_encoded_messages_dir + "/" + std::to_string(schema_id), + m_compression_level + ); + initialize_schema_writer(schema_writer, schema); + m_schema_id_to_writer[schema_id] = schema_writer; + } + + m_encoded_message_size += schema_writer->append_message(message); +} + +size_t ArchiveWriter::get_data_size() { + return m_log_dict->get_data_size() + m_var_dict->get_data_size() + m_array_dict->get_data_size() + + m_encoded_message_size; +} + +void ArchiveWriter::initialize_schema_writer(SchemaWriter* writer, std::set& schema) { + for (int32_t id : schema) { + auto node = m_schema_tree->get_node(id); + std::string key_name = node->get_key_name(); + switch (node->get_type()) { + case 
NodeType::INTEGER: + writer->append_column(new Int64ColumnWriter(key_name)); + break; + case NodeType::FLOAT: + writer->append_column(new FloatColumnWriter(key_name)); + break; + case NodeType::CLPSTRING: + writer->append_column(new ClpStringColumnWriter(key_name, m_var_dict, m_log_dict)); + break; + case NodeType::VARSTRING: + writer->append_column(new VariableStringColumnWriter(key_name, m_var_dict)); + break; + case NodeType::BOOLEAN: + writer->append_column(new BooleanColumnWriter(key_name)); + break; + case NodeType::ARRAY: + writer->append_column(new ClpStringColumnWriter(key_name, m_var_dict, m_array_dict) + ); + break; + case NodeType::DATESTRING: + writer->append_column(new DateStringColumnWriter(key_name, m_timestamp_dict)); + break; + case NodeType::FLOATDATESTRING: + writer->append_column(new FloatDateStringColumnWriter(key_name, m_timestamp_dict)); + break; + case NodeType::OBJECT: + case NodeType::NULLVALUE: + break; + } + } +} +} // namespace clp_s diff --git a/components/core/src/clp_s/ArchiveWriter.hpp b/components/core/src/clp_s/ArchiveWriter.hpp new file mode 100644 index 000000000..9c3b7db37 --- /dev/null +++ b/components/core/src/clp_s/ArchiveWriter.hpp @@ -0,0 +1,94 @@ +#ifndef CLP_S_ARCHIVEWRITER_HPP +#define CLP_S_ARCHIVEWRITER_HPP + +#include +#include + +#include +#include +#include + +#include "DictionaryWriter.hpp" +#include "SchemaTree.hpp" +#include "SchemaWriter.hpp" +#include "TimestampDictionaryWriter.hpp" + +namespace clp_s { +struct ArchiveWriterOption { + boost::uuids::uuid id; + std::string archives_dir; + int compression_level; +}; + +class ArchiveWriter { +public: + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; + + // Delete default constructor + ArchiveWriter() = delete; + + // Constructor + explicit ArchiveWriter( + std::shared_ptr 
schema_tree, + std::shared_ptr timestamp_dict + ) + : m_encoded_message_size(0UL), + m_schema_tree(std::move(schema_tree)), + m_timestamp_dict(std::move(timestamp_dict)) {} + + /** + * Opens the archive writer + * @param option + */ + void open(ArchiveWriterOption const& option); + + /** + * Closes the archive writer + */ + void close(); + + /** + * Appends a message to the archive writer + * @param schema_id + * @param schema + * @param message + */ + void append_message(int32_t schema_id, std::set& schema, ParsedMessage& message); + + /** + * @return Size of the uncompressed data written to the archive + */ + size_t get_data_size(); + +private: + /** + * Initializes the schema writer + * @param writer + * @param schema + */ + void initialize_schema_writer(SchemaWriter* writer, std::set& schema); + + size_t m_encoded_message_size; + + boost::uuids::uuid m_id{}; + + std::string m_archive_path; + std::string m_encoded_messages_dir; + + std::shared_ptr m_var_dict; + std::shared_ptr m_log_dict; + std::shared_ptr m_array_dict; // log type dictionary for arrays + std::shared_ptr m_timestamp_dict; + int m_compression_level{}; + + std::shared_ptr m_schema_tree; + std::map m_schema_id_to_writer; +}; +} // namespace clp_s + +#endif // CLP_S_ARCHIVEWRITER_HPP diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt new file mode 100644 index 000000000..325af0334 --- /dev/null +++ b/components/core/src/clp_s/CMakeLists.txt @@ -0,0 +1,137 @@ +add_subdirectory(search/kql) + +set( + CLP_S_SOURCES + "${PROJECT_SOURCE_DIR}/submodules/date/include/date/date.h" + ArchiveReader.cpp + ArchiveReader.hpp + ArchiveWriter.cpp + ArchiveWriter.hpp + ColumnReader.cpp + ColumnReader.hpp + ColumnWriter.cpp + ColumnWriter.hpp + CommandLineArguments.cpp + CommandLineArguments.hpp + Compressor.hpp + Decompressor.hpp + Defs.hpp + DictionaryEntry.cpp + DictionaryEntry.hpp + DictionaryReader.hpp + DictionaryWriter.cpp + DictionaryWriter.hpp + ErrorCode.hpp + 
FileReader.cpp + FileReader.hpp + FileWriter.cpp + FileWriter.hpp + JsonConstructor.cpp + JsonConstructor.hpp + JsonFileIterator.cpp + JsonFileIterator.hpp + JsonParser.cpp + JsonParser.hpp + JsonSerializer.hpp + ParsedMessage.hpp + ReaderUtils.cpp + ReaderUtils.hpp + SchemaMap.cpp + SchemaMap.hpp + SchemaReader.cpp + SchemaReader.hpp + SchemaTree.cpp + SchemaTree.hpp + SchemaWriter.cpp + SchemaWriter.hpp + TimestampDictionaryReader.cpp + TimestampDictionaryReader.hpp + TimestampDictionaryWriter.cpp + TimestampDictionaryWriter.hpp + TimestampEntry.cpp + TimestampEntry.hpp + TimestampPattern.cpp + TimestampPattern.hpp + TraceableException.hpp + Utils.cpp + Utils.hpp + VariableDecoder.cpp + VariableDecoder.hpp + VariableEncoder.cpp + VariableEncoder.hpp + ZstdCompressor.cpp + ZstdCompressor.hpp + ZstdDecompressor.cpp + ZstdDecompressor.hpp +) + +set( + CLP_S_SEARCH_SOURCES + search/AndExpr.cpp + search/AndExpr.hpp + search/BooleanLiteral.cpp + search/BooleanLiteral.hpp + search/clp_search/EncodedVariableInterpreter.cpp + search/clp_search/EncodedVariableInterpreter.hpp + search/clp_search/Grep.cpp + search/clp_search/Grep.hpp + search/clp_search/Query.cpp + search/clp_search/Query.hpp + search/ColumnDescriptor.cpp + search/ColumnDescriptor.hpp + search/ConstantProp.cpp + search/ConstantProp.hpp + search/ConvertToExists.cpp + search/ConvertToExists.hpp + search/DateLiteral.cpp + search/DateLiteral.hpp + search/EmptyExpr.cpp + search/EmptyExpr.hpp + search/EvaluateTimestampIndex.cpp + search/EvaluateTimestampIndex.hpp + search/Expression.cpp + search/Expression.hpp + search/FilterExpr.cpp + search/FilterExpr.hpp + search/FilterOperation.hpp + search/Integral.cpp + search/Integral.hpp + search/Literal.hpp + search/NarrowTypes.cpp + search/NarrowTypes.hpp + search/NullLiteral.cpp + search/NullLiteral.hpp + search/OrExpr.cpp + search/OrExpr.hpp + search/OrOfAndForm.cpp + search/OrOfAndForm.hpp + search/Output.cpp + search/Output.hpp + search/SchemaMatch.cpp + 
search/SchemaMatch.hpp + search/SearchUtils.cpp + search/SearchUtils.hpp + search/StringLiteral.cpp + search/StringLiteral.hpp + search/Transformation.hpp + search/Value.hpp +) + +add_executable(clp-s clp-s.cpp ${CLP_S_SOURCES} ${CLP_S_SEARCH_SOURCES}) +target_compile_features(clp-s PRIVATE cxx_std_17) +target_link_libraries( + clp-s + PRIVATE + absl::flat_hash_map + Boost::filesystem Boost::iostreams Boost::program_options + kql + simdjson + spdlog::spdlog + ZStd::ZStd +) +target_include_directories(clp-s PRIVATE "${PROJECT_SOURCE_DIR}/submodules") +set_target_properties( + clp-s + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}" +) diff --git a/components/core/src/clp_s/ColumnReader.cpp b/components/core/src/clp_s/ColumnReader.cpp new file mode 100644 index 000000000..c45104422 --- /dev/null +++ b/components/core/src/clp_s/ColumnReader.cpp @@ -0,0 +1,177 @@ +#include "ColumnReader.hpp" + +#include "ColumnWriter.hpp" +#include "Utils.hpp" +#include "VariableDecoder.hpp" + +namespace clp_s { +void Int64ColumnReader::load(ZstdDecompressor& decompressor, uint64_t num_messages) { + m_values = std::make_unique(num_messages); + + // Surface a short or failed read instead of continuing with an incompletely populated + // buffer + auto error_code = decompressor.try_read_exact_length( + reinterpret_cast(m_values.get()), + num_messages * sizeof(int64_t) + ); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } +} + +std::variant Int64ColumnReader::extract_value( + uint64_t cur_message +) { + return m_values[cur_message]; +} + +void FloatColumnReader::load(ZstdDecompressor& decompressor, uint64_t num_messages) { + m_values = std::make_unique(num_messages); + + // Surface a short or failed read instead of continuing with an incompletely populated + // buffer + auto error_code = decompressor.try_read_exact_length( + reinterpret_cast(m_values.get()), + num_messages * sizeof(double) + ); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } +} + +std::variant FloatColumnReader::extract_value( + uint64_t cur_message +) { + return m_values[cur_message]; +} + +void BooleanColumnReader::load(ZstdDecompressor& decompressor, uint64_t num_messages) { + m_values = std::make_unique(num_messages); + + // Surface a short or failed read instead of continuing with an incompletely populated + // buffer + auto error_code = decompressor.try_read_exact_length( + reinterpret_cast(m_values.get()), + num_messages * sizeof(uint8_t) + ); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } +} + 
+std::variant BooleanColumnReader::extract_value( + uint64_t cur_message +) { + return m_values[cur_message]; +} + +void ClpStringColumnReader::load(ZstdDecompressor& decompressor, uint64_t num_messages) { + size_t encoded_vars_length; + + // Every read below is checked so that a truncated or failed read is reported rather than + // ignored + m_logtypes = std::make_unique(num_messages); + auto logtypes_error = decompressor.try_read_exact_length( + reinterpret_cast(m_logtypes.get()), + num_messages * sizeof(int64_t) + ); + if (ErrorCodeSuccess != logtypes_error) { + throw OperationFailed(logtypes_error, __FILENAME__, __LINE__); + } + + auto error_code = decompressor.try_read_numeric_value(encoded_vars_length); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + + m_encoded_vars = std::make_unique(encoded_vars_length); + auto encoded_vars_error = decompressor.try_read_exact_length( + reinterpret_cast(m_encoded_vars.get()), + encoded_vars_length * sizeof(int64_t) + ); + if (ErrorCodeSuccess != encoded_vars_error) { + throw OperationFailed(encoded_vars_error, __FILENAME__, __LINE__); + } +} + +std::variant ClpStringColumnReader::extract_value( + uint64_t cur_message +) { + std::string message; + + auto value = m_logtypes[cur_message]; + int64_t logtype_id = ClpStringColumnWriter::get_encoded_log_dict_id(value); + auto& entry = m_log_dict->get_entry(logtype_id); + + if (false == entry.initialized()) { + entry.decode_log_type(); + } + + int64_t encoded_vars_offset = ClpStringColumnWriter::get_encoded_offset(value); + Span encoded_vars(&m_encoded_vars[encoded_vars_offset], entry.get_num_vars()); + + VariableDecoder::decode_variables_into_message(entry, *m_var_dict, encoded_vars, message); + + return message; +} + +int64_t ClpStringColumnReader::get_encoded_id(uint64_t cur_message) { + auto value = m_logtypes[cur_message]; + return ClpStringColumnWriter::get_encoded_log_dict_id(value); +} + +Span ClpStringColumnReader::get_encoded_vars(uint64_t cur_message) { + auto value = m_logtypes[cur_message]; + int64_t logtype_id = ClpStringColumnWriter::get_encoded_log_dict_id(value); + auto& entry = m_log_dict->get_entry(logtype_id); + + // It should be initialized before because we are searching on this field + if (false == entry.initialized()) { + entry.decode_log_type(); + } + + int64_t 
encoded_vars_offset = ClpStringColumnWriter::get_encoded_offset(value); + + return {&m_encoded_vars[encoded_vars_offset], entry.get_num_vars()}; +} + +void VariableStringColumnReader::load(ZstdDecompressor& decompressor, uint64_t num_messages) { + m_variables = std::make_unique(num_messages); + // Surface a short or failed read instead of continuing with an incompletely populated + // buffer + auto error_code = decompressor.try_read_exact_length( + reinterpret_cast(m_variables.get()), + num_messages * sizeof(int64_t) + ); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } +} + +std::variant VariableStringColumnReader::extract_value( + uint64_t cur_message +) { + return m_var_dict->get_value(m_variables[cur_message]); +} + +int64_t VariableStringColumnReader::get_variable_id(uint64_t cur_message) { + return m_variables[cur_message]; +} + +void DateStringColumnReader::load(ZstdDecompressor& decompressor, uint64_t num_messages) { + m_timestamps = std::make_unique(num_messages); + m_timestamp_encodings = std::make_unique(num_messages); + + // Both arrays must be read in full; stop at the first failed read + auto timestamps_error = decompressor.try_read_exact_length( + reinterpret_cast(m_timestamps.get()), + num_messages * sizeof(int64_t) + ); + if (ErrorCodeSuccess != timestamps_error) { + throw OperationFailed(timestamps_error, __FILENAME__, __LINE__); + } + auto encodings_error = decompressor.try_read_exact_length( + reinterpret_cast(m_timestamp_encodings.get()), + num_messages * sizeof(int64_t) + ); + if (ErrorCodeSuccess != encodings_error) { + throw OperationFailed(encodings_error, __FILENAME__, __LINE__); + } +} + +std::variant DateStringColumnReader::extract_value( + uint64_t cur_message +) { + return m_timestamp_dict->get_string_encoding( + m_timestamps[cur_message], + m_timestamp_encodings[cur_message] + ); +} + +epochtime_t DateStringColumnReader::get_encoded_time(uint64_t cur_message) { + return m_timestamps[cur_message]; +} + +void FloatDateStringColumnReader::load(ZstdDecompressor& decompressor, uint64_t num_messages) { + m_timestamps = std::make_unique(num_messages); + // Surface a short or failed read instead of continuing with an incompletely populated + // buffer + auto error_code = decompressor.try_read_exact_length( + reinterpret_cast(m_timestamps.get()), + num_messages * sizeof(double) + ); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } +} + +std::variant FloatDateStringColumnReader::extract_value( + uint64_t cur_message +) { + return std::to_string(m_timestamps[cur_message]); +} + +double FloatDateStringColumnReader::get_encoded_time(uint64_t cur_message) { + return m_timestamps[cur_message]; +} +} // namespace clp_s 
diff --git a/components/core/src/clp_s/ColumnReader.hpp b/components/core/src/clp_s/ColumnReader.hpp new file mode 100644 index 000000000..0b3d86a65 --- /dev/null +++ b/components/core/src/clp_s/ColumnReader.hpp @@ -0,0 +1,265 @@ +#ifndef CLP_S_COLUMNREADER_HPP +#define CLP_S_COLUMNREADER_HPP + +#include +#include + +#include "DictionaryReader.hpp" +#include "TimestampDictionaryReader.hpp" +#include "Utils.hpp" +#include "ZstdDecompressor.hpp" + +namespace clp_s { +class BaseColumnReader { +public: + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; + + // Constructor + BaseColumnReader(std::string name, int32_t id) : m_name(std::move(name)), m_id(id) {} + + // Destructor + virtual ~BaseColumnReader() = default; + + /** + * Reads the column from the disk + * @param decompressor + * @param num_messages + */ + virtual void load(ZstdDecompressor& decompressor, uint64_t num_messages) = 0; + + std::string get_name() const { return m_name; } + + int32_t get_id() const { return m_id; } + + virtual std::string get_type() { return "base"; } + + /** + * Extracts a value of the column + * @param cur_message + * @return Value + */ + virtual std::variant extract_value(uint64_t cur_message) + = 0; + +private: + std::string m_name; + int32_t m_id; +}; + +class Int64ColumnReader : public BaseColumnReader { +public: + // Constructor + explicit Int64ColumnReader(std::string name, int32_t id) + : BaseColumnReader(std::move(name), id) {} + + // Destructor + ~Int64ColumnReader() override = default; + + // Methods inherited from BaseColumnReader + void load(ZstdDecompressor& decompressor, uint64_t num_messages) override; + + std::string get_type() override { return "int"; } + + std::variant extract_value(uint64_t cur_message + ) override; + +private: + std::unique_ptr m_values; +}; + +class FloatColumnReader 
: public BaseColumnReader { +public: + // Constructor + explicit FloatColumnReader(std::string name, int32_t id) + : BaseColumnReader(std::move(name), id) {} + + // Destructor + ~FloatColumnReader() override = default; + + // Methods inherited from BaseColumnReader + void load(ZstdDecompressor& decompressor, uint64_t num_messages) override; + + std::string get_type() override { return "float"; } + + std::variant extract_value(uint64_t cur_message + ) override; + +private: + std::unique_ptr m_values; +}; + +class BooleanColumnReader : public BaseColumnReader { +public: + // Constructor + explicit BooleanColumnReader(std::string name, int32_t id) + : BaseColumnReader(std::move(name), id) {} + + // Destructor + ~BooleanColumnReader() override = default; + + // Methods inherited from BaseColumnReader + void load(ZstdDecompressor& decompressor, uint64_t num_messages) override; + + std::string get_type() override { return "bool"; } + + std::variant extract_value(uint64_t cur_message + ) override; + +private: + std::unique_ptr m_values; +}; + +class ClpStringColumnReader : public BaseColumnReader { +public: + // Constructor + ClpStringColumnReader( + std::string const& name, + int32_t id, + std::shared_ptr var_dict, + std::shared_ptr log_dict, + bool is_array = false + ) + : BaseColumnReader(name, id), + m_var_dict(std::move(var_dict)), + m_log_dict(std::move(log_dict)), + m_is_array(is_array) /*, encoded_vars_index_(0)*/ {} + + // Destructor + ~ClpStringColumnReader() override = default; + + // Methods inherited from BaseColumnReader + void load(ZstdDecompressor& decompressor, uint64_t num_messages) override; + + std::string get_type() override { return m_is_array ? 
"array" : "string"; } + + std::variant extract_value(uint64_t cur_message + ) override; + + /** + * Gets the logtype dictionary id packed into the message's stored logtype value + * @param cur_message Index of the message within this column + * @return The logtype dictionary id + */ + int64_t get_encoded_id(uint64_t cur_message); + + /** + * Gets the encoded variables + * @param cur_message Index of the message within this column + * @return Encoded variables in a span + */ + Span get_encoded_vars(uint64_t cur_message); + +private: + std::shared_ptr m_var_dict; + std::shared_ptr m_log_dict; + + std::unique_ptr m_logtypes; + std::unique_ptr m_encoded_vars; + // size_t encoded_vars_index_; + + bool m_is_array; +}; + +class VariableStringColumnReader : public BaseColumnReader { +public: + // Constructor + VariableStringColumnReader( + std::string const& name, + int32_t id, + std::shared_ptr var_dict + ) + : BaseColumnReader(name, id), + m_var_dict(std::move(var_dict)) {} + + // Destructor + ~VariableStringColumnReader() override = default; + + // Methods inherited from BaseColumnReader + void load(ZstdDecompressor& decompressor, uint64_t num_messages) override; + + std::string get_type() override { return "string"; } + + std::variant extract_value(uint64_t cur_message + ) override; + + /** + * Gets the id stored for the message in this column (the same id that extract_value + * resolves through the variable dictionary) + * @param cur_message Index of the message within this column + * @return The variable dictionary id + */ + int64_t get_variable_id(uint64_t cur_message); + +private: + std::shared_ptr m_var_dict; + + std::unique_ptr m_variables; +}; + +class DateStringColumnReader : public BaseColumnReader { +public: + // Constructor + DateStringColumnReader( + std::string const& name, + int32_t id, + std::shared_ptr timestamp_dict + ) + : BaseColumnReader(name, id), + m_timestamp_dict(std::move(timestamp_dict)) {} + + // Destructor + ~DateStringColumnReader() override = default; + + // Methods inherited from BaseColumnReader + void load(ZstdDecompressor& decompressor, uint64_t num_messages) override; + + std::string get_type() override { return "string"; } + + std::variant extract_value(uint64_t cur_message + ) override; + 
+ /** + * @param cur_message + * @return The encoded time in epoch time + */ + epochtime_t get_encoded_time(uint64_t cur_message); + +private: + std::shared_ptr m_timestamp_dict; + + std::unique_ptr m_timestamps; + std::unique_ptr m_timestamp_encodings; +}; + +class FloatDateStringColumnReader : public BaseColumnReader { +public: + // Constructor + FloatDateStringColumnReader(std::string const& name, int32_t id) : BaseColumnReader(name, id) {} + + // Destructor + ~FloatDateStringColumnReader() override = default; + + // Methods inherited from BaseColumnReader + void load(ZstdDecompressor& decompressor, uint64_t num_messages) override; + + std::string get_type() override { return "string"; } + + std::variant extract_value(uint64_t cur_message + ) override; + + /** + * @param cur_message + * @return The encoded time in float epoch time + */ + double get_encoded_time(uint64_t cur_message); + +private: + std::unique_ptr m_timestamps; +}; +} // namespace clp_s + +#endif // CLP_S_COLUMNREADER_HPP diff --git a/components/core/src/clp_s/ColumnWriter.cpp b/components/core/src/clp_s/ColumnWriter.cpp new file mode 100644 index 000000000..6abe9c302 --- /dev/null +++ b/components/core/src/clp_s/ColumnWriter.cpp @@ -0,0 +1,142 @@ +#include "ColumnWriter.hpp" + +namespace clp_s { +void Int64ColumnWriter::add_value( + std::variant& value, + size_t& size +) { + size = sizeof(int64_t); + m_values.push_back(std::get(value)); +} + +void Int64ColumnWriter::store(ZstdCompressor& compressor) { + compressor.write( + reinterpret_cast(m_values.data()), + m_values.size() * sizeof(int64_t) + ); +} + +void FloatColumnWriter::add_value( + std::variant& value, + size_t& size +) { + size = sizeof(double); + m_values.push_back(std::get(value)); +} + +void FloatColumnWriter::store(ZstdCompressor& compressor) { + compressor.write( + reinterpret_cast(m_values.data()), + m_values.size() * sizeof(double) + ); +} + +void BooleanColumnWriter::add_value( + std::variant& value, + size_t& size +) { + size = 
sizeof(uint8_t); + m_values.push_back(std::get(value) ? 1 : 0); +} + +void BooleanColumnWriter::store(ZstdCompressor& compressor) { + compressor.write( + reinterpret_cast(m_values.data()), + m_values.size() * sizeof(uint8_t) + ); +} + +void ClpStringColumnWriter::add_value( + std::variant& value, + size_t& size +) { + size = sizeof(int64_t); + std::string string_var = std::get(value); + uint64_t id; + uint64_t offset = m_encoded_vars.size(); + VariableEncoder::encode_and_add_to_dictionary( + string_var, + m_logtype_entry, + *m_var_dict, + m_encoded_vars + ); + m_log_dict->add_entry(m_logtype_entry, id); + auto encoded_id = encode_log_dict_id(id, offset); + m_logtypes.push_back(encoded_id); + size += sizeof(int64_t) * (m_encoded_vars.size() - offset); +} + +void ClpStringColumnWriter::store(ZstdCompressor& compressor) { + compressor.write( + reinterpret_cast(m_logtypes.data()), + m_logtypes.size() * sizeof(int64_t) + ); + compressor.write_numeric_value(m_encoded_vars.size()); + compressor.write( + reinterpret_cast(m_encoded_vars.data()), + m_encoded_vars.size() * sizeof(int64_t) + ); +} + +void VariableStringColumnWriter::add_value( + std::variant& value, + size_t& size +) { + size = sizeof(int64_t); + std::string string_var = std::get(value); + uint64_t id; + m_var_dict->add_entry(string_var, id); + m_variables.push_back(id); +} + +void VariableStringColumnWriter::store(ZstdCompressor& compressor) { + compressor.write( + reinterpret_cast(m_variables.data()), + m_variables.size() * sizeof(int64_t) + ); +} + +void DateStringColumnWriter::add_value( + std::variant& value, + size_t& size +) { + size = 2 * sizeof(int64_t); + std::string string_timestamp = std::get(value); + + uint64_t encoding_id; + epochtime_t timestamp = m_timestamp_dict->ingest_entry(m_name, string_timestamp, encoding_id); + + m_timestamps.push_back(timestamp); + m_timestamp_encodings.push_back(encoding_id); +} + +void DateStringColumnWriter::store(ZstdCompressor& compressor) { + compressor.write( + 
reinterpret_cast(m_timestamps.data()), + m_timestamps.size() * sizeof(int64_t) + ); + compressor.write( + reinterpret_cast(m_timestamp_encodings.data()), + m_timestamp_encodings.size() * sizeof(int64_t) + ); +} + +void FloatDateStringColumnWriter::add_value( + std::variant& value, + size_t& size +) { + size = sizeof(double); + double timestamp = std::get(value); + + m_timestamp_dict->ingest_entry(m_name, timestamp); + + m_timestamps.push_back(timestamp); +} + +void FloatDateStringColumnWriter::store(ZstdCompressor& compressor) { + compressor.write( + reinterpret_cast(m_timestamps.data()), + m_timestamps.size() * sizeof(double) + ); +} +} // namespace clp_s diff --git a/components/core/src/clp_s/ColumnWriter.hpp b/components/core/src/clp_s/ColumnWriter.hpp new file mode 100644 index 000000000..447f0adc3 --- /dev/null +++ b/components/core/src/clp_s/ColumnWriter.hpp @@ -0,0 +1,232 @@ +#ifndef CLP_S_COLUMNWRITER_HPP +#define CLP_S_COLUMNWRITER_HPP + +#include +#include + +#include + +#include "DictionaryWriter.hpp" +#include "FileWriter.hpp" +#include "TimestampDictionaryWriter.hpp" +#include "VariableEncoder.hpp" +#include "ZstdCompressor.hpp" + +using namespace simdjson; + +namespace clp_s { +class BaseColumnWriter { +public: + // Constructor + explicit BaseColumnWriter(std::string name) : m_name(std::move(name)) {} + + // Destructor + virtual ~BaseColumnWriter() = default; + + /** + * Adds a value to the column + * @param value + * @param size + */ + virtual void add_value(std::variant& value, size_t& size) + = 0; + + /** + * Stores the column to a compressed file + * @param compressor + */ + virtual void store(ZstdCompressor& compressor) = 0; + + /** + * @return Name of the column + */ + std::string get_name() { return m_name; } + +protected: + std::string m_name; +}; + +class Int64ColumnWriter : public BaseColumnWriter { +public: + // Constructor + explicit Int64ColumnWriter(std::string name) : BaseColumnWriter(std::move(name)) {} + + // Destructor + 
~Int64ColumnWriter() override = default; + + // Methods inherited from BaseColumnWriter + void add_value(std::variant& value, size_t& size) override; + + void store(ZstdCompressor& compressor) override; + +private: + std::vector m_values; +}; + +class FloatColumnWriter : public BaseColumnWriter { +public: + // Constructor + explicit FloatColumnWriter(std::string name) : BaseColumnWriter(std::move(name)) {} + + // Destructor + ~FloatColumnWriter() override = default; + + // Methods inherited from BaseColumnWriter + void add_value(std::variant& value, size_t& size) override; + + void store(ZstdCompressor& compressor) override; + +private: + std::vector m_values; +}; + +class BooleanColumnWriter : public BaseColumnWriter { +public: + // Constructor + explicit BooleanColumnWriter(std::string name) : BaseColumnWriter(std::move(name)) {} + + // Destructor + ~BooleanColumnWriter() override = default; + + // Methods inherited from BaseColumnWriter + void add_value(std::variant& value, size_t& size) override; + + void store(ZstdCompressor& compressor) override; + +private: + std::vector m_values; +}; + +class ClpStringColumnWriter : public BaseColumnWriter { +public: + // Constructor + ClpStringColumnWriter( + std::string const& name, + std::shared_ptr var_dict, + std::shared_ptr log_dict + ) + : BaseColumnWriter(name), + m_var_dict(std::move(var_dict)), + m_log_dict(std::move(log_dict)) {} + + // Destructor + ~ClpStringColumnWriter() override = default; + + // Methods inherited from BaseColumnWriter + void add_value(std::variant& value, size_t& size) override; + + void store(ZstdCompressor& compressor) override; + + /** + * @param encoded_id + * @return the encoded log dict id + */ + static int64_t get_encoded_log_dict_id(uint64_t encoded_id) { + return (int64_t)encoded_id & cLogDictIdMask; + } + + /** + * @param encoded_id + * @return The encoded offset + */ + static int64_t get_encoded_offset(uint64_t encoded_id) { + return ((int64_t)encoded_id & cOffsetMask) >> 
cOffsetBitPosition; + } + +private: + /** + * Encodes a log dict id + * @param id + * @param offset + * @return The encoded log dict id + */ + static int64_t encode_log_dict_id(uint64_t id, uint64_t offset) { + return ((int64_t)id) | ((int64_t)offset) << cOffsetBitPosition; + } + + static constexpr int cOffsetBitPosition = 24; + static constexpr int64_t cLogDictIdMask = ~(-1ULL << cOffsetBitPosition); + static constexpr int64_t cOffsetMask = ~cLogDictIdMask; + + std::shared_ptr m_var_dict; + std::shared_ptr m_log_dict; + LogTypeDictionaryEntry m_logtype_entry; + + std::vector m_logtypes; + std::vector m_encoded_vars; +}; + +class VariableStringColumnWriter : public BaseColumnWriter { +public: + // Constructor + VariableStringColumnWriter( + std::string const& name, + std::shared_ptr var_dict + ) + : BaseColumnWriter(name), + m_var_dict(std::move(var_dict)) {} + + // Destructor + ~VariableStringColumnWriter() override = default; + + // Methods inherited from BaseColumnWriter + void add_value(std::variant& value, size_t& size) override; + + void store(ZstdCompressor& compressor) override; + +private: + std::shared_ptr m_var_dict; + std::vector m_variables; +}; + +class DateStringColumnWriter : public BaseColumnWriter { +public: + // Constructor + DateStringColumnWriter( + std::string const& name, + std::shared_ptr timestamp_dict + ) + : BaseColumnWriter(name), + m_timestamp_dict(std::move(timestamp_dict)) {} + + // Destructor + ~DateStringColumnWriter() override = default; + + // Methods inherited from BaseColumnWriter + void add_value(std::variant& value, size_t& size) override; + + void store(ZstdCompressor& compressor) override; + +private: + std::shared_ptr m_timestamp_dict; + + std::vector m_timestamps; + std::vector m_timestamp_encodings; +}; + +class FloatDateStringColumnWriter : public BaseColumnWriter { +public: + // Constructor + FloatDateStringColumnWriter( + std::string const& name, + std::shared_ptr timestamp_dict + ) + : BaseColumnWriter(name), + 
m_timestamp_dict(std::move(timestamp_dict)) {} + + // Destructor + ~FloatDateStringColumnWriter() override = default; + + // Methods inherited from BaseColumnWriter + void add_value(std::variant& value, size_t& size) override; + + void store(ZstdCompressor& compressor) override; + +private: + std::shared_ptr m_timestamp_dict; + + std::vector m_timestamps; +}; +} // namespace clp_s + +#endif // CLP_S_COLUMNWRITER_HPP diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp new file mode 100644 index 000000000..cecf51f91 --- /dev/null +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -0,0 +1,298 @@ +#include "CommandLineArguments.hpp" + +#include + +#include +#include + +namespace po = boost::program_options; + +namespace clp_s { +CommandLineArguments::ParsingResult +CommandLineArguments::parse_arguments(int argc, char const** argv) { + if (1 == argc) { + print_basic_usage(); + return ParsingResult::Failure; + } + + po::options_description general_options("General options"); + general_options.add_options()("help,h", "Print help"); + + char command_input; + po::options_description general_positional_options("General positional options"); + // clang-format off + general_positional_options.add_options()( + "command", po::value(&command_input) + )( + "command-args", po::value>() + ); + // clang-format on + + po::positional_options_description general_positional_options_description; + general_positional_options_description.add("command", 1); + general_positional_options_description.add("command-args", -1); + + po::options_description all_descriptions; + all_descriptions.add(general_options); + all_descriptions.add(general_positional_options); + + try { + po::variables_map parsed_command_line_options; + po::parsed_options parsed = po::command_line_parser(argc, argv) + .options(all_descriptions) + .positional(general_positional_options_description) + .allow_unregistered() + .run(); + po::store(parsed, 
parsed_command_line_options); + po::notify(parsed_command_line_options); + + if (parsed_command_line_options.count("command") == 0) { + if (parsed_command_line_options.count("help") != 0) { + if (argc > 2) { + SPDLOG_WARN("Ignoring all options besides --help."); + } + + print_basic_usage(); + std::cerr << "COMMAND is one of:" << std::endl; + std::cerr << " c - compress" << std::endl; + std::cerr << " x - decompress" << std::endl; + std::cerr << " s - search" << std::endl; + std::cerr << std::endl; + std::cerr << "Try " + << " c --help OR" + << " x --help OR" + << " s --help for command-specific details." << std::endl; + + po::options_description visible_options; + visible_options.add(general_options); + std::cerr << visible_options << '\n'; + return ParsingResult::InfoCommand; + } + + throw std::invalid_argument("Command unspecified"); + } + + switch (command_input) { + case (char)Command::Compress: + case (char)Command::Extract: + case (char)Command::Search: + m_command = (Command)command_input; + break; + default: + throw std::invalid_argument(std::string("Unknown action '") + command_input + "'"); + } + + if (Command::Compress == m_command) { + po::options_description compression_positional_options; + // clang-format off + compression_positional_options.add_options()( + "archives-dir", + po::value(&m_archives_dir)->value_name("DIR"), + "output directory" + )( + "input-paths", + po::value>(&m_file_paths)->value_name("PATHS"), + "input paths" + ); + // clang-format on + + po::options_description compression_options("Compression options"); + // clang-format off + compression_options.add_options()( + "compression-level", + po::value(&m_compression_level)->value_name("LEVEL")->default_value(3), + "1 (fast/low compression) to 9 (slow/high compression)." 
+ )( + "target-encoded-size", + po::value(&m_target_encoded_size)->value_name("TARGET_ENCODED_SIZE")-> + default_value(8UL * 1024 * 1024 * 1024), // 8 GiB + "Target size (B) for the dictionaries and encoded messages before a new " + "archive is created." + )( + "timestamp-key", + po::value(&m_timestamp_key)->value_name("TIMESTAMP_COLUMN_KEY")-> + default_value(""), + "Path (e.g. x.y) for the field containing the log event's timestamp." + ); + // clang-format on + + po::positional_options_description positional_options; + positional_options.add("archives-dir", 1); + positional_options.add("input-paths", -1); + + po::options_description all_compression_options; + all_compression_options.add(compression_options); + all_compression_options.add(compression_positional_options); + + std::vector unrecognized_options + = po::collect_unrecognized(parsed.options, po::include_positional); + unrecognized_options.erase(unrecognized_options.begin()); + po::store( + po::command_line_parser(unrecognized_options) + .options(all_compression_options) + .positional(positional_options) + .run(), + parsed_command_line_options + ); + po::notify(parsed_command_line_options); + + if (parsed_command_line_options.count("help")) { + print_compression_usage(); + + std::cerr << "Examples:" << std::endl; + std::cerr << " # Compress file1.json and dir1 into archives-dir" << std::endl; + std::cerr << " " << m_program_name << " c archives-dir file1.json dir1" + << std::endl; + + po::options_description visible_options; + visible_options.add(general_options); + visible_options.add(compression_options); + std::cerr << visible_options << '\n'; + return ParsingResult::InfoCommand; + } + + if (m_file_paths.empty()) { + throw std::invalid_argument("No input paths specified."); + } + + if (m_archives_dir.empty()) { + throw std::invalid_argument("No archives directory specified."); + } + } else if ((char)Command::Extract == command_input) { + po::options_description extraction_options; + // clang-format off 
+ extraction_options.add_options()( + "archives-dir", + po::value(&m_archives_dir), + "The directory containing the archives" + )( + "output-dir", + po::value(&m_output_dir), + "The output directory for the decompressed file" + ); + // clang-format on + + po::positional_options_description positional_options; + positional_options.add("archives-dir", 1); + positional_options.add("output-dir", 1); + + std::vector unrecognized_options + = po::collect_unrecognized(parsed.options, po::include_positional); + unrecognized_options.erase(unrecognized_options.begin()); + po::store( + po::command_line_parser(unrecognized_options) + .options(extraction_options) + .positional(positional_options) + .run(), + parsed_command_line_options + ); + + po::notify(parsed_command_line_options); + + if (parsed_command_line_options.count("help")) { + print_decompression_usage(); + + std::cerr << "Examples:" << std::endl; + std::cerr << " # Decompress all files from archives-dir into output-dir" + << std::endl; + std::cerr << " " << m_program_name << " x archives-dir output-dir" << std::endl; + std::cerr << std::endl; + + po::options_description visible_options; + visible_options.add(general_options); + std::cerr << visible_options << std::endl; + return ParsingResult::InfoCommand; + } + + if (m_archives_dir.empty()) { + throw std::invalid_argument("No archives directory specified"); + } + + if (m_output_dir.empty()) { + throw std::invalid_argument("No output directory specified"); + } + } else if ((char)Command::Search == command_input) { + std::string archives_dir; + std::string query; + + po::options_description search_options; + // clang-format off + search_options.add_options()( + "archives-dir", + po::value(&m_archives_dir), + "The directory containing the archives" + )( + "query,q", + po::value(&m_query), + "Query to perform" + ); + // clang-format on + + po::positional_options_description positional_options; + positional_options.add("archives-dir", 1); + 
positional_options.add("query", 1); + + std::vector unrecognized_options + = po::collect_unrecognized(parsed.options, po::include_positional); + unrecognized_options.erase(unrecognized_options.begin()); + po::store( + po::command_line_parser(unrecognized_options) + .options(search_options) + .positional(positional_options) + .run(), + parsed_command_line_options + ); + + po::notify(parsed_command_line_options); + + if (parsed_command_line_options.count("help")) { + print_search_usage(); + + std::cerr << "Examples:" << std::endl; + std::cerr << " # Search archives-dir for logs matching a KQL query" << std::endl; + std::cerr << " " << m_program_name << " s archives-dir kql-query" << std::endl; + std::cerr << std::endl; + + po::options_description visible_options; + visible_options.add(general_options); + std::cerr << visible_options << '\n'; + return ParsingResult::InfoCommand; + } + if (m_archives_dir.empty()) { + throw std::invalid_argument("No archives directory specified"); + } + + if (m_query.empty()) { + throw std::invalid_argument("No query specified"); + } + } + + } catch (std::exception& e) { + SPDLOG_ERROR("{}", e.what()); + print_basic_usage(); + std::cerr << "Try " << get_program_name() << " --help for detailed usage instructions" + << std::endl; + return ParsingResult::Failure; + } + + return ParsingResult::Success; +} + +void CommandLineArguments::print_basic_usage() const { + std::cerr << "Usage: " << m_program_name << " [OPTIONS] COMMAND [COMMAND ARGUMENTS]" + << std::endl; +} + +void CommandLineArguments::print_compression_usage() const { + std::cerr << "Usage: " << m_program_name << " c [OPTIONS] ARCHIVES_DIR [FILE/DIR ...]" + << std::endl; +} + +void CommandLineArguments::print_decompression_usage() const { + std::cerr << "Usage: " << m_program_name << " x [OPTIONS] ARCHIVES_DIR OUTPUT_DIR" << std::endl; +} + +void CommandLineArguments::print_search_usage() const { + std::cerr << "Usage: " << m_program_name << " s [OPTIONS] ARCHIVES_DIR KQL_QUERY" 
/**
 * Holds the program's command-line configuration and parses it from argc/argv.
 *
 * Every member has a defined value from construction onward, so the getters are safe to call even
 * when parse_arguments has not run or did not return ParsingResult::Success.
 */
class CommandLineArguments {
public:
    // Types
    enum class ParsingResult {
        Success = 0,
        InfoCommand,
        Failure
    };

    // The enumerator values are the characters accepted as COMMAND on the command line
    enum class Command : char {
        Compress = 'c',
        Extract = 'x',
        Search = 's'
    };

    // Constructors
    explicit CommandLineArguments(std::string const& program_name) : m_program_name(program_name) {}

    // Methods
    /**
     * Parses the given command-line arguments into this object
     * @param argc
     * @param argv
     * @return The outcome of parsing
     */
    ParsingResult parse_arguments(int argc, char const* argv[]);

    std::string const& get_program_name() const { return m_program_name; }

    Command get_command() const { return m_command; }

    std::vector<std::string> const& get_file_paths() const { return m_file_paths; }

    std::string const& get_archives_dir() const { return m_archives_dir; }

    std::string const& get_output_dir() const { return m_output_dir; }

    std::string const& get_timestamp_key() const { return m_timestamp_key; }

    int get_compression_level() const { return m_compression_level; }

    size_t get_target_encoded_size() const { return m_target_encoded_size; }

    std::string const& get_query() const { return m_query; }

private:
    // Methods
    void print_basic_usage() const;

    void print_compression_usage() const;

    void print_decompression_usage() const;

    void print_search_usage() const;

    // Variables
    std::string m_program_name;
    Command m_command{Command::Compress};

    // Compression and decompression variables
    std::vector<std::string> m_file_paths;
    std::string m_archives_dir;
    std::string m_output_dir;
    std::string m_timestamp_key;
    // Same defaults as the --compression-level and --target-encoded-size options
    int m_compression_level{3};
    size_t m_target_encoded_size{static_cast<size_t>(8) * 1024 * 1024 * 1024};  // 8 GiB

    // Search variables
    std::string m_query;
};
+ +#endif // CLP_S_COMMANDLINEARGUMENTS_HPP diff --git a/components/core/src/clp_s/Compressor.hpp b/components/core/src/clp_s/Compressor.hpp new file mode 100644 index 000000000..ba4edae0c --- /dev/null +++ b/components/core/src/clp_s/Compressor.hpp @@ -0,0 +1,51 @@ +// Code from CLP + +#ifndef CLP_S_COMPRESSOR_HPP +#define CLP_S_COMPRESSOR_HPP + +#include +#include + +#include + +#include "TraceableException.hpp" + +namespace clp_s { +class Compressor { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; + + enum class CompressorType : uint8_t { + ZSTD = 0x10, + Passthrough = 0xFF, + }; + + // Constructor + explicit Compressor(CompressorType type) : m_type(type) {} + + // Destructor + virtual ~Compressor() = default; + + // Explicitly disable copy and move constructor/assignment + Compressor(Compressor const&) = delete; + + Compressor& operator=(Compressor const&) = delete; + + // Methods + /** + * Closes the compression stream + */ + virtual void close() = 0; + +protected: + CompressorType m_type; +}; +} // namespace clp_s + +#endif // CLP_S_COMPRESSOR_HPP diff --git a/components/core/src/clp_s/Decompressor.hpp b/components/core/src/clp_s/Decompressor.hpp new file mode 100644 index 000000000..4aebec945 --- /dev/null +++ b/components/core/src/clp_s/Decompressor.hpp @@ -0,0 +1,64 @@ +// Code from CLP + +#ifndef CLP_S_DECOMPRESSOR_HPP +#define CLP_S_DECOMPRESSOR_HPP + +#include + +#include "FileReader.hpp" +#include "TraceableException.hpp" + +namespace clp_s { +class Decompressor { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; + + enum class CompressorType : 
uint8_t { + ZSTD = 0x10, + Passthrough = 0xFF, + }; + + // Constructor + explicit Decompressor(CompressorType type) : m_type(type) {} + + // Destructor + ~Decompressor() = default; + + // Explicitly disable copy and move constructor/assignment + Decompressor(Decompressor const&) = delete; + + Decompressor& operator=(Decompressor const&) = delete; + + // Methods + /** + * Initializes streaming decompressor to decompress from the specified compressed data buffer + * @param compressed_data_buffer + * @param compressed_data_buffer_size + */ + virtual void open(char const* compressed_data_buffer, size_t compressed_data_buffer_size) = 0; + + /** + * Initializes the decompressor to decompress from an open file + * @param file_reader + * @param file_read_buffer_capacity The maximum amount of data to read from a file at a time + */ + virtual void open(FileReader& file_reader, size_t file_read_buffer_capacity) = 0; + + /** + * Closes decompression stream + */ + virtual void close() = 0; + +protected: + // Variables + CompressorType m_type; +}; +} // namespace clp_s + +#endif // CLP_S_DECOMPRESSOR_HPP diff --git a/components/core/src/clp_s/Defs.hpp b/components/core/src/clp_s/Defs.hpp new file mode 100644 index 000000000..090d8c0ed --- /dev/null +++ b/components/core/src/clp_s/Defs.hpp @@ -0,0 +1,44 @@ +// Code from CLP + +#ifndef CLP_S_DEFS_HPP +#define CLP_S_DEFS_HPP + +// C++ libraries +#include +#include +#include + +namespace clp_s { +// Types +typedef int64_t epochtime_t; +static epochtime_t const cEpochTimeMin = INT64_MIN; +static epochtime_t const cEpochTimeMax = INT64_MAX; +static double const cDoubleEpochTimeMin = std::numeric_limits::lowest(); +static double const cDoubleEpochTimeMax = std::numeric_limits::max(); +#define SECONDS_TO_EPOCHTIME(x) x * 1000 +#define MICROSECONDS_TO_EPOCHTIME(x) 0 + +typedef uint64_t variable_dictionary_id_t; +static variable_dictionary_id_t const cVariableDictionaryIdMax = UINT64_MAX; +typedef int64_t logtype_dictionary_id_t; +static 
// Macros
// Relative version of __FILE__
#define __FILENAME__ ((__FILE__) + SOURCE_PATH_SIZE)
// Rounds up VALUE to be a multiple of MULTIPLE.
// Every argument and the whole expansion are parenthesized so the macro stays correct when it is
// given compound expressions (e.g. `a + b`) or is used inside a larger expression (e.g. `x / M`).
// NOTE: MULTIPLE is expanded three times, so it must not have side effects.
#define ROUND_UP_TO_MULTIPLE(VALUE, MULTIPLE) \
    ((((VALUE) + (MULTIPLE) - 1) / (MULTIPLE)) * (MULTIPLE))
case VarDelim::Double: + return 2; + case VarDelim::Length: + default: + throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__); + } +} + +size_t LogTypeDictionaryEntry::get_data_size() const { + // NOTE: sizeof(vector[0]) is executed at compile time so there's no risk of an exception at + // runtime + return sizeof(m_id) + m_value.length() + m_var_positions.size() * sizeof(m_var_positions[0]); +} + +void LogTypeDictionaryEntry::add_constant( + string const& value_containing_constant, + size_t begin_pos, + size_t length +) { + m_value.append(value_containing_constant, begin_pos, length); +} + +void LogTypeDictionaryEntry::add_non_double_var() { + m_var_positions.push_back(m_value.length()); + add_non_double_var(m_value); +} + +void LogTypeDictionaryEntry::add_double_var() { + m_var_positions.push_back(m_value.length()); + add_double_var(m_value); +} + +bool LogTypeDictionaryEntry::parse_next_var( + string const& msg, + size_t& var_begin_pos, + size_t& var_end_pos, + string& var +) { + auto last_var_end_pos = var_end_pos; + if (StringUtils::get_bounds_of_next_var(msg, var_begin_pos, var_end_pos)) { + // Append to log type: from end of last variable to start of current variable + add_constant(msg, last_var_end_pos, var_begin_pos - last_var_end_pos); + + var.assign(msg, var_begin_pos, var_end_pos - var_begin_pos); + return true; + } + if (last_var_end_pos < msg.length()) { + // Append to log type: from end of last variable to end + add_constant(msg, last_var_end_pos, msg.length() - last_var_end_pos); + } + + return false; +} + +void LogTypeDictionaryEntry::clear() { + m_value.clear(); + m_var_positions.clear(); +} + +void LogTypeDictionaryEntry::write_to_file(ZstdCompressor& compressor) const { + string escaped_value; + get_value_with_unfounded_variables_escaped(escaped_value); + compressor.write_numeric_value(escaped_value.length()); + compressor.write_string(escaped_value); +} + +ErrorCode +LogTypeDictionaryEntry::try_read_from_file(ZstdDecompressor& 
decompressor, uint64_t id, bool lazy) { + clear(); + + m_id = id; + ErrorCode error_code; + uint64_t escaped_value_length; + error_code = decompressor.try_read_numeric_value(escaped_value_length); + if (ErrorCodeSuccess != error_code) { + return error_code; + } + + string escaped_value; + error_code = decompressor.try_read_string(escaped_value_length, escaped_value); + if (ErrorCodeSuccess != error_code) { + return error_code; + } + + if (lazy) { + m_value = std::move(escaped_value); + } else { + decode_log_type(escaped_value); + } + + return error_code; +} + +void LogTypeDictionaryEntry::read_from_file( + ZstdDecompressor& decompressor, + uint64_t id, + bool lazy +) { + auto error_code = try_read_from_file(decompressor, id, lazy); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } +} + +void LogTypeDictionaryEntry::decode_log_type(string& escaped_value) { + bool is_escaped = false; + string constant; + for (char c : escaped_value) { + if (is_escaped) { + constant += c; + is_escaped = false; + } else if (cEscapeChar == c) { + is_escaped = true; + } else { + if ((char)LogTypeDictionaryEntry::VarDelim::NonDouble == c) { + add_constant(constant, 0, constant.length()); + constant.clear(); + + add_non_double_var(); + } else if ((char)LogTypeDictionaryEntry::VarDelim::Double == c) { + add_constant(constant, 0, constant.length()); + constant.clear(); + + add_double_var(); + } else { + constant += c; + } + } + } + + if (false == constant.empty()) { + add_constant(constant, 0, constant.length()); + } + + m_init = true; +} + +void LogTypeDictionaryEntry::decode_log_type() { + string escaped_value = std::move(m_value); + m_value.clear(); + decode_log_type(escaped_value); +} + +void LogTypeDictionaryEntry::get_value_with_unfounded_variables_escaped( + string& escaped_logtype_value +) const { + size_t begin_ix = 0; + // Reset escaped value and reserve enough space to at least contain the whole value + 
escaped_logtype_value.clear(); + escaped_logtype_value.reserve(m_value.length()); + for (auto var_position : m_var_positions) { + size_t end_ix = var_position; + + escape_variable_delimiters(m_value, begin_ix, end_ix, escaped_logtype_value); + + // Add variable delimiter + escaped_logtype_value += m_value[end_ix]; + + // Move begin to start of next portion of logtype between variables + begin_ix = end_ix + 1; + } + // Escape any variable delimiters in remainder of value + escape_variable_delimiters(m_value, begin_ix, m_value.length(), escaped_logtype_value); +} + +void LogTypeDictionaryEntry::escape_variable_delimiters( + string const& value, + size_t begin_ix, + size_t end_ix, + string& escaped_value +) { + for (size_t i = begin_ix; i < end_ix; ++i) { + auto c = value[i]; + + // Add escape character if necessary + if ((char)LogTypeDictionaryEntry::VarDelim::NonDouble == c + || (char)LogTypeDictionaryEntry::VarDelim::Double == c || cEscapeChar == c) + { + escaped_value += cEscapeChar; + } + + // Add character + escaped_value += value[i]; + } +} + +size_t VariableDictionaryEntry::get_data_size() const { + return sizeof(m_id) + m_value.length(); +} + +void VariableDictionaryEntry::write_to_file(ZstdCompressor& compressor) const { + compressor.write_numeric_value(m_value.length()); + compressor.write_string(m_value); +} + +ErrorCode VariableDictionaryEntry::try_read_from_file(ZstdDecompressor& decompressor, uint64_t id) { + m_id = id; + + ErrorCode error_code; + uint64_t value_length; + error_code = decompressor.try_read_numeric_value(value_length); + if (ErrorCodeSuccess != error_code) { + return error_code; + } + error_code = decompressor.try_read_string(value_length, m_value); + if (ErrorCodeSuccess != error_code) { + return error_code; + } + + return error_code; +} + +void VariableDictionaryEntry::read_from_file( + ZstdDecompressor& decompressor, + uint64_t id, + bool lazy +) { + auto error_code = try_read_from_file(decompressor, id); + if (ErrorCodeSuccess != 
/**
 * Template class representing a dictionary entry
 * @tparam DictionaryIdType
 */
template <typename DictionaryIdType>
class DictionaryEntry {
public:
    // Constructors
    /**
     * Constructs an entry with an empty value and a value-initialized (zero) ID
     */
    DictionaryEntry() = default;

    /**
     * @param value The entry's value
     * @param id The entry's ID
     */
    DictionaryEntry(std::string value, DictionaryIdType id) : m_id(id), m_value(std::move(value)) {}

    // Methods
    DictionaryIdType get_id() const { return m_id; }

    std::string const& get_value() const { return m_value; }

protected:
    // Variables
    // Value-initialized so that get_id() on a default-constructed entry is well-defined
    DictionaryIdType m_id{};
    std::string m_value;
};
/** + * Adds a non-double variable delimiter to the given logtype + * @param logtype + */ + static void add_non_double_var(std::string& logtype) { logtype += (char)VarDelim::NonDouble; } + + /** + * Adds a double variable delimiter to the given logtype + * @param logtype + */ + static void add_double_var(std::string& logtype) { logtype += (char)VarDelim::Double; } + + /** + * @return The number of variables in the logtype + */ + size_t get_num_vars() const { return m_var_positions.size(); } + + /** + * Gets all info about a variable in the logtype + * @param var_ix The index of the variable to get the info for + * @param var_delim + * @return The variable's position in the logtype, or SIZE_MAX if var_ix is out of bounds + */ + size_t get_var_info(size_t var_ix, VarDelim& var_delim) const; + + /** + * Gets the variable delimiter at the given index + * @param var_ix The index of the variable delimiter to get + * @return The variable delimiter, or LogTypeDictionaryEntry::VarDelim::Length if var_ix is out + * of bounds + */ + VarDelim get_var_delim(size_t var_ix) const; + + /** + * Gets the length of the specified variable's representation in the logtype + * @param var_ix The index of the variable + * @return The length + */ + size_t get_var_length_in_logtype(size_t var_ix) const; + + /** + * Gets the size (in-memory) of the data contained in this entry + * @return Size of the data contained in this entry + */ + size_t get_data_size() const; + + /** + * Adds a constant to the logtype + * @param value_containing_constant + * @param begin_pos Start of the constant in value_containing_constant + * @param length + */ + void + add_constant(std::string const& value_containing_constant, size_t begin_pos, size_t length); + + /** + * Adds a non-double variable delimiter + */ + void add_non_double_var(); + + /** + * Adds a double variable delimiter + */ + void add_double_var(); + + /** + * Parses next variable from a message, constructing the constant part of the message's 
logtype + * as well + * @param msg + * @param var_begin_pos Beginning position of last variable. Changes to beginning position of + * current variable. + * @param var_end_pos End position of last variable (exclusive). Changes to end position of + * current variable. + * @param var + * @return true if another variable was found, false otherwise + */ + bool parse_next_var( + std::string const& msg, + size_t& var_begin_pos, + size_t& var_end_pos, + std::string& var + ); + + /** + * Reserves space for a constant of the given length + * @param length + */ + void reserve_constant_length(size_t length) { m_value.reserve(length); } + + void set_id(uint64_t id) { m_id = id; } + + /** + * Clears the entry + */ + void clear(); + + /** + * Writes an entry to a compressed file + * @param compressor + */ + void write_to_file(ZstdCompressor& compressor) const; + + /** + * Tries to read an entry from the given decompressor + * @param decompressor + * @return Same as streaming_compression::Decompressor::try_read_numeric_value + * @return Same as streaming_compression::Decompressor::try_read_string + */ + ErrorCode try_read_from_file(ZstdDecompressor& decompressor, uint64_t id, bool lazy); + + /** + * Reads an entry from the given decompressor + * @param decompressor + * @param lazy apply lazy decoding + */ + void read_from_file(ZstdDecompressor& decompressor, uint64_t id, bool lazy); + + /** + * Decodes the log type + * @param escaped_value + */ + void decode_log_type(std::string& escaped_value); + + /** + * Decodes the log type + */ + void decode_log_type(); + + /** + * Checks if the entry has been initialized + * @return true if the entry has been initialized, false otherwise + */ + bool initialized() const { return m_init; } + +private: + // Methods + /** + * Escapes any variable delimiters that don't correspond to the positions of variables in the + * logtype entry's value + * @param escaped_logtype_value + */ + void get_value_with_unfounded_variables_escaped(std::string& 
escaped_logtype_value) const; + + /** + * Escapes any variable delimiters in the identified portion of the given value + * @param value + * @param begin_ix + * @param end_ix + * @param escaped_value + */ + static void escape_variable_delimiters( + std::string const& value, + size_t begin_ix, + size_t end_ix, + std::string& escaped_value + ); + + // Variables + std::vector m_var_positions; + bool m_init; +}; + +class VariableDictionaryEntry : public DictionaryEntry { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; + + // Constructors + VariableDictionaryEntry() = default; + + VariableDictionaryEntry(std::string value, uint64_t id) + : DictionaryEntry(std::move(value), id) {} + + // Use default copy constructor + VariableDictionaryEntry(VariableDictionaryEntry const&) = default; + + // Assignment operators + // Use default + VariableDictionaryEntry& operator=(VariableDictionaryEntry const&) = default; + + // Methods + /** + * Gets the size (in-memory) of the data contained in this entry + * @return Size of the data contained in this entry + */ + size_t get_data_size() const; + + /** + * Clears the entry + */ + void clear() { m_value.clear(); } + + /** + * Writes an entry to a compressed file + * @param compressor + */ + void write_to_file(ZstdCompressor& compressor) const; + + /** + * Tries to read an entry from the given decompressor + * @param decompressor + * @return Same as streaming_compression::Decompressor::try_read_numeric_value + * @return Same as streaming_compression::Decompressor::try_read_string + */ + ErrorCode try_read_from_file(ZstdDecompressor& decompressor, uint64_t id); + + /** + * Reads an entry from the given decompressor + * @param decompressor + */ + void read_from_file(ZstdDecompressor& decompressor, uint64_t id, bool lazy); +}; +} // 
namespace clp_s + +#endif // CLP_S_DICTIONARYENTRY_HPP diff --git a/components/core/src/clp_s/DictionaryReader.hpp b/components/core/src/clp_s/DictionaryReader.hpp new file mode 100644 index 000000000..175214d88 --- /dev/null +++ b/components/core/src/clp_s/DictionaryReader.hpp @@ -0,0 +1,210 @@ +// Code from CLP + +#ifndef CLP_S_DICTIONARYREADER_HPP +#define CLP_S_DICTIONARYREADER_HPP + +#include + +#include + +#include "DictionaryEntry.hpp" +#include "Utils.hpp" + +namespace clp_s { +template +class DictionaryReader { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; + + // Constructors + DictionaryReader() : m_is_open(false) {} + + // Methods + /** + * Opens dictionary for reading + * @param dictionary_path + */ + void open(std::string const& dictionary_path); + + /** + * Closes the dictionary + */ + void close(); + + /** + * Reads any new entries from disk + */ + void read_new_entries(bool lazy = false); + + /** + * @return All dictionary entries + */ + std::vector const& get_entries() const { return m_entries; } + + /** + * @param id + * @return The entry with the given ID + */ + EntryType& get_entry(DictionaryIdType id); + + /** + * @param id + * @return Value of the entry with the specified ID + */ + std::string const& get_value(DictionaryIdType id) const; + + /** + * Gets the entry exactly matching the given search string + * @param search_string + * @param ignore_case + * @return nullptr if an exact match is not found, the entry otherwise + */ + EntryType const* + get_entry_matching_value(std::string const& search_string, bool ignore_case) const; + + /** + * Gets the entries that match a given wildcard string + * @param wildcard_string + * @param ignore_case + * @param entries Set in which to store found entries + */ + void 
get_entries_matching_wildcard_string( + std::string const& wildcard_string, + bool ignore_case, + std::unordered_set& entries + ) const; + +protected: + bool m_is_open; + FileReader m_dictionary_file_reader; + ZstdDecompressor m_dictionary_decompressor; + std::vector m_entries; +}; + +class VariableDictionaryReader : public DictionaryReader {}; + +class LogTypeDictionaryReader : public DictionaryReader {}; + +template +void DictionaryReader::open(std::string const& dictionary_path) { + if (m_is_open) { + throw OperationFailed(ErrorCodeNotReady, __FILENAME__, __LINE__); + } + + constexpr size_t cDecompressorFileReadBufferCapacity = 64 * 1024; // 64 KB + + m_dictionary_file_reader.open(dictionary_path); + // Skip header + m_dictionary_file_reader.seek_from_begin(sizeof(uint64_t)); + // Open decompressor + m_dictionary_decompressor.open(m_dictionary_file_reader, cDecompressorFileReadBufferCapacity); + + m_is_open = true; +} + +template +void DictionaryReader::close() { + if (false == m_is_open) { + throw OperationFailed(ErrorCodeNotReady, __FILENAME__, __LINE__); + } + + m_dictionary_decompressor.close(); + m_dictionary_file_reader.close(); + + m_is_open = false; +} + +template +void DictionaryReader::read_new_entries(bool lazy) { + if (false == m_is_open) { + throw OperationFailed(ErrorCodeNotInit, __FILENAME__, __LINE__); + } + + auto dictionary_file_reader_pos = m_dictionary_file_reader.get_pos(); + m_dictionary_file_reader.seek_from_begin(0); + uint64_t num_dictionary_entries; + m_dictionary_file_reader.read_numeric_value(num_dictionary_entries, false); + m_dictionary_file_reader.seek_from_begin(dictionary_file_reader_pos); + + // Validate dictionary header + if (num_dictionary_entries < m_entries.size()) { + throw OperationFailed(ErrorCodeCorrupt, __FILENAME__, __LINE__); + } + + // Read new dictionary entries + if (num_dictionary_entries > m_entries.size()) { + auto prev_num_dictionary_entries = m_entries.size(); + m_entries.resize(num_dictionary_entries); + + 
for (size_t i = prev_num_dictionary_entries; i < num_dictionary_entries; ++i) { + auto& entry = m_entries[i]; + entry.read_from_file(m_dictionary_decompressor, i, lazy); + } + } +} + +template +EntryType& DictionaryReader::get_entry(DictionaryIdType id) { + if (false == m_is_open) { + throw OperationFailed(ErrorCodeNotInit, __FILENAME__, __LINE__); + } + if (id >= m_entries.size()) { + throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__); + } + + return m_entries[id]; +} + +template +std::string const& DictionaryReader::get_value(DictionaryIdType id +) const { + if (id >= m_entries.size()) { + throw OperationFailed(ErrorCodeCorrupt, __FILENAME__, __LINE__); + } + return m_entries[id].get_value(); +} + +template +EntryType const* DictionaryReader::get_entry_matching_value( + std::string const& search_string, + bool ignore_case +) const { + if (false == ignore_case) { + for (auto const& entry : m_entries) { + if (entry.get_value() == search_string) { + return &entry; + } + } + } else { + auto const& search_string_uppercase = boost::algorithm::to_upper_copy(search_string); + for (auto const& entry : m_entries) { + if (boost::algorithm::to_upper_copy(entry.get_value()) == search_string_uppercase) { + return &entry; + } + } + } + + return nullptr; +} + +template +void DictionaryReader::get_entries_matching_wildcard_string( + std::string const& wildcard_string, + bool ignore_case, + std::unordered_set& entries +) const { + for (auto const& entry : m_entries) { + if (StringUtils::wildcard_match_unsafe(entry.get_value(), wildcard_string, !ignore_case)) { + entries.insert(&entry); + } + } +} +} // namespace clp_s + +#endif // CLP_S_DICTIONARYREADER_HPP diff --git a/components/core/src/clp_s/DictionaryWriter.cpp b/components/core/src/clp_s/DictionaryWriter.cpp new file mode 100644 index 000000000..31a4ec430 --- /dev/null +++ b/components/core/src/clp_s/DictionaryWriter.cpp @@ -0,0 +1,67 @@ +// Code from CLP + +#include "DictionaryWriter.hpp" + +namespace clp_s { 
+// Adds `value` to the variable dictionary if it isn't already present and reports its ID through
+// `id`. Returns true only when a new entry was created (and written to the compressor).
+bool VariableDictionaryWriter::add_entry(std::string const& value, uint64_t& id) {
+    bool new_entry = false;
+
+    auto const ix = m_value_to_id.find(value);
+    if (m_value_to_id.end() != ix) {
+        // Entry exists so get its ID
+        id = ix->second;
+    } else {
+        // Entry doesn't exist so create it
+
+        if (m_next_id > m_max_id) {
+            SPDLOG_ERROR("VariableDictionaryWriter ran out of IDs.");
+            throw OperationFailed(ErrorCodeOutOfBounds, __FILENAME__, __LINE__);
+        }
+
+        // Assign ID
+        id = m_next_id;
+        ++m_next_id;
+
+        // Record the newly assigned ID in the value -> ID map
+        auto entry = VariableDictionaryEntry(value, id);
+        m_value_to_id[value] = id;
+
+        new_entry = true;
+
+        // TODO: This doesn't account for the segment index that's constantly updated
+        m_data_size += entry.get_data_size();
+
+        entry.write_to_file(m_dictionary_compressor);
+    }
+    return new_entry;
+}
+
+// Adds `logtype_entry` to the logtype dictionary if no entry with the same value exists and
+// reports the matching ID through `logtype_id`. A newly added entry has its ID set via set_id()
+// before it is written. Returns true only when a new entry was created.
+bool LogTypeDictionaryWriter::add_entry(
+        LogTypeDictionaryEntry& logtype_entry,
+        uint64_t& logtype_id
+) {
+    bool is_new_entry = false;
+
+    std::string const& value = logtype_entry.get_value();
+    auto const ix = m_value_to_id.find(value);
+    if (m_value_to_id.end() != ix) {
+        // Entry exists so get its ID
+        logtype_id = ix->second;
+    } else {
+        // Entry doesn't exist so create it. Refuse to hand out IDs beyond the maximum given to
+        // open(), exactly as VariableDictionaryWriter::add_entry does.
+        if (m_next_id > m_max_id) {
+            SPDLOG_ERROR("LogTypeDictionaryWriter ran out of IDs.");
+            throw OperationFailed(ErrorCodeOutOfBounds, __FILENAME__, __LINE__);
+        }
+
+        // Assign ID
+        logtype_id = m_next_id;
+        ++m_next_id;
+        logtype_entry.set_id(logtype_id);
+
+        // Insert new entry into dictionary
+        m_value_to_id[value] = logtype_id;
+
+        is_new_entry = true;
+
+        // TODO: This doesn't account for the segment index that's constantly updated
+        m_data_size += logtype_entry.get_data_size();
+
+        logtype_entry.write_to_file(m_dictionary_compressor);
+    }
+    return is_new_entry;
+}
+}  // namespace clp_s
diff --git a/components/core/src/clp_s/DictionaryWriter.hpp b/components/core/src/clp_s/DictionaryWriter.hpp
new file mode 100644
index 000000000..3fb9ec4d1
--- /dev/null
+++ b/components/core/src/clp_s/DictionaryWriter.hpp
@@ -0,0 +1,158 @@
+// Code from CLP
+
+#ifndef CLP_S_DICTIONARYWRITER_HPP
+#define CLP_S_DICTIONARYWRITER_HPP
+
+#include "DictionaryEntry.hpp"
+
+namespace clp_s { +template +class DictionaryWriter { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; + + // Constructors + DictionaryWriter() : m_is_open(false) {} + + ~DictionaryWriter() = default; + + // Methods + /** + * Opens dictionary for writing + * @param dictionary_path + * @param compression_level + * @param max_id + */ + void open(std::string const& dictionary_path, int compression_level, DictionaryIdType max_id); + + /** + * Closes the dictionary + */ + void close(); + + /** + * Writes the dictionary's header and flushes unwritten content to disk + */ + + void write_header_and_flush_to_disk(); + + /** + * @return The size (in-memory) of the data contained in the dictionary + */ + size_t get_data_size() const { return m_data_size; } + +protected: + // Types + typedef std::unordered_map value_to_id_t; + + // Variables + bool m_is_open; + + // Variables related to on-disk storage + FileWriter m_dictionary_file_writer; + ZstdCompressor m_dictionary_compressor; + + value_to_id_t m_value_to_id; + uint64_t m_next_id{}; + uint64_t m_max_id{}; + + // Size (in-memory) of the data contained in the dictionary + size_t m_data_size{}; +}; + +class VariableDictionaryWriter : public DictionaryWriter { +public: + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; + + /** + * Adds the given variable to the dictionary if it doesn't exist. 
+ * @param value + * @param id ID of the variable matching the given entry + */ + bool add_entry(std::string const& value, uint64_t& id); +}; + +class LogTypeDictionaryWriter : public DictionaryWriter { +public: + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; + + /** + * Adds the given entry to the dictionary if it doesn't exist + * @param logtype_entry + * @param logtype_id ID of the logtype matching the given entry + */ + bool add_entry(LogTypeDictionaryEntry& logtype_entry, uint64_t& logtype_id); +}; + +template +void DictionaryWriter::open( + std::string const& dictionary_path, + int compression_level, + DictionaryIdType max_id +) { + if (m_is_open) { + throw OperationFailed(ErrorCodeNotReady, __FILENAME__, __LINE__); + } + + m_dictionary_file_writer.open(dictionary_path, FileWriter::OpenMode::CreateForWriting); + // Write header + m_dictionary_file_writer.write_numeric_value(0); + // Open compressor + m_dictionary_compressor.open(m_dictionary_file_writer, compression_level); + + m_next_id = 0; + m_max_id = max_id; + + m_data_size = 0; + m_is_open = true; +} + +template +void DictionaryWriter::close() { + if (false == m_is_open) { + throw OperationFailed(ErrorCodeNotInit, __FILENAME__, __LINE__); + } + + write_header_and_flush_to_disk(); + m_dictionary_compressor.close(); + m_dictionary_file_writer.close(); + + m_value_to_id.clear(); + + m_is_open = false; +} + +template +void DictionaryWriter::write_header_and_flush_to_disk() { + if (false == m_is_open) { + throw OperationFailed(ErrorCodeNotInit, __FILENAME__, __LINE__); + } + + // Update header + auto dictionary_file_writer_pos = m_dictionary_file_writer.get_pos(); + m_dictionary_file_writer.seek_from_begin(0); + m_dictionary_file_writer.write_numeric_value(m_value_to_id.size()); + 
m_dictionary_file_writer.seek_from_begin(dictionary_file_writer_pos); + + m_dictionary_compressor.flush(); + m_dictionary_file_writer.flush(); +} +} // namespace clp_s + +#endif // CLP_S_DICTIONARYWRITER_HPP diff --git a/components/core/src/clp_s/ErrorCode.hpp b/components/core/src/clp_s/ErrorCode.hpp new file mode 100644 index 000000000..be2c78e73 --- /dev/null +++ b/components/core/src/clp_s/ErrorCode.hpp @@ -0,0 +1,31 @@ +// Code from CLP + +#ifndef CLP_S_ERRORCODE_HPP +#define CLP_S_ERRORCODE_HPP + +namespace clp_s { +typedef enum { + ErrorCodeSuccess = 0, + ErrorCodeBadParam, + ErrorCodeBadParamDbUri, + ErrorCodeCorrupt, + ErrorCodeErrno, + ErrorCodeEndOfFile, + ErrorCodeFileExists, + ErrorCodeFileNotFound, + ErrorCodeNoMem, + ErrorCodeNotInit, + ErrorCodeNotReady, + ErrorCodeOutOfBounds, + ErrorCodeTooLong, + ErrorCodeTruncated, + ErrorCodeUnsupported, + ErrorCodeNoAccess, + ErrorCodeFailure, + ErrorCodeFailureMetadataCorrupted, + ErrorCodeMetadataCorrupted, + ErrorCodeFailureDbBulkWrite +} ErrorCode; +} // namespace clp_s + +#endif // CLP_S_ERRORCODE_HPP diff --git a/components/core/src/clp_s/FileReader.cpp b/components/core/src/clp_s/FileReader.cpp new file mode 100644 index 000000000..91bafed0a --- /dev/null +++ b/components/core/src/clp_s/FileReader.cpp @@ -0,0 +1,150 @@ +// Code from CLP + +#include "FileReader.hpp" + +#include +#include + +#include +#include + +using std::string; + +namespace clp_s { +FileReader::~FileReader() { + close(); + free(m_getdelim_buf); +} + +ErrorCode FileReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { + if (nullptr == m_file) { + return ErrorCodeNotInit; + } + if (nullptr == buf) { + return ErrorCodeBadParam; + } + + num_bytes_read = fread(buf, sizeof(*buf), num_bytes_to_read, m_file); + if (num_bytes_read < num_bytes_to_read) { + if (ferror(m_file)) { + return ErrorCodeErrno; + } else if (feof(m_file)) { + if (0 == num_bytes_read) { + return ErrorCodeEndOfFile; + } + } + } + + return 
ErrorCodeSuccess;
+}
+
+// Moves the read head to the absolute offset `pos`.
+ErrorCode FileReader::try_seek_from_begin(size_t pos) {
+    if (nullptr == m_file) {
+        return ErrorCodeNotInit;
+    }
+
+    int retval = fseeko(m_file, pos, SEEK_SET);
+    if (0 != retval) {
+        return ErrorCodeErrno;
+    }
+
+    return ErrorCodeSuccess;
+}
+
+// Reports the current offset of the read head through `pos`. `pos` is only written on success.
+ErrorCode FileReader::try_get_pos(size_t& pos) {
+    if (nullptr == m_file) {
+        return ErrorCodeNotInit;
+    }
+
+    // Keep the result in its native signed type so the failure value (-1) is tested before it is
+    // converted to the unsigned output parameter
+    off_t const offset = ftello(m_file);
+    if (offset < 0) {
+        return ErrorCodeErrno;
+    }
+    pos = static_cast<size_t>(offset);
+
+    return ErrorCodeSuccess;
+}
+
+// Opens `path` for binary reading, closing any file this reader already holds.
+ErrorCode FileReader::try_open(string const& path) {
+    // Cleanup in case caller forgot to call close before calling this function
+    close();
+
+    m_file = fopen(path.c_str(), "rb");
+    if (nullptr == m_file) {
+        if (ENOENT == errno) {
+            return ErrorCodeFileNotFound;
+        }
+        return ErrorCodeErrno;
+    }
+
+    return ErrorCodeSuccess;
+}
+
+// Throwing wrapper around try_open().
+void FileReader::open(string const& path) {
+    ErrorCode error_code = try_open(path);
+    if (ErrorCodeSuccess != error_code) {
+        throw OperationFailed(error_code, __FILENAME__, __LINE__);
+    }
+}
+
+// Closes the file if one is open; safe to call repeatedly.
+void FileReader::close() {
+    if (m_file != nullptr) {
+        // NOTE: We don't check errors for fclose since it seems the only reason it could fail is if
+        // it was interrupted by a signal
+        fclose(m_file);
+        m_file = nullptr;
+    }
+}
+
+// Reads up to and including the next `delim` (or to EOF) and stores or appends the bytes to
+// `str`, dropping the trailing delimiter when `keep_delimiter` is false.
+ErrorCode
+FileReader::try_read_to_delimiter(char delim, bool keep_delimiter, bool append, string& str) {
+    assert(nullptr != m_file);
+
+    if (false == append) {
+        str.clear();
+    }
+    ssize_t num_bytes_read = getdelim(&m_getdelim_buf, &m_getdelim_buf_len, delim, m_file);
+    if (num_bytes_read < 1) {
+        // Nothing was read. Report EOF only when that is the sole condition on the stream; every
+        // other failure (including one that sets neither stream flag) is reported as an errno
+        // error instead of falling through and indexing the buffer with a negative length.
+        if (0 == ferror(m_file) && feof(m_file)) {
+            return ErrorCodeEndOfFile;
+        }
+        return ErrorCodeErrno;
+    }
+    if (false == keep_delimiter && delim == m_getdelim_buf[num_bytes_read - 1]) {
+        --num_bytes_read;
+    }
+    str.append(m_getdelim_buf, num_bytes_read);
+
+    return ErrorCodeSuccess;
+}
+
+ErrorCode FileReader::try_read_exact_length(char* buf, size_t num_bytes) {
+    size_t num_bytes_read;
+    auto error_code = try_read(buf,
num_bytes, num_bytes_read); + if (ErrorCodeSuccess != error_code) { + return error_code; + } + if (num_bytes_read < num_bytes) { + return ErrorCodeTruncated; + } + + return ErrorCodeSuccess; +} + +size_t FileReader::get_pos() { + size_t pos; + ErrorCode error_code = try_get_pos(pos); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + + return pos; +} + +void FileReader::seek_from_begin(size_t pos) { + ErrorCode error_code = try_seek_from_begin(pos); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } +} +} // namespace clp_s diff --git a/components/core/src/clp_s/FileReader.hpp b/components/core/src/clp_s/FileReader.hpp new file mode 100644 index 000000000..59e88eaec --- /dev/null +++ b/components/core/src/clp_s/FileReader.hpp @@ -0,0 +1,166 @@ +// Code from CLP + +#ifndef CLP_S_FILEREADER_HPP +#define CLP_S_FILEREADER_HPP + +#include +#include + +#include "ErrorCode.hpp" +#include "TraceableException.hpp" + +namespace clp_s { +class FileReader { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; + + // Constructor + FileReader() : m_file(nullptr), m_getdelim_buf_len(0), m_getdelim_buf(nullptr) {} + + // Destructor + ~FileReader(); + + // Methods implementing the ReaderInterface + /** + * Tries to get the current position of the read head in the file + * @param pos Position of the read head in the file + * @return ErrorCodeNotInit if the file is not open + * @return ErrorCodeErrno on error + * @return ErrorCodeSuccess on success + */ + ErrorCode try_get_pos(size_t& pos); + + /** + * Tries to seek from the beginning of the file to the given position + * @param pos The position to seek to + * @return ErrorCodeNotInit if the file is not open + * @return 
ErrorCodeErrno on error + * @return ErrorCodeSuccess on success + */ + ErrorCode try_seek_from_begin(size_t pos); + + /** + * Tries to read up to a given number of bytes from the file + * @param buf The buffer to read into + * @param num_bytes_to_read The number of bytes to try and read + * @param num_bytes_read The actual number of bytes read + * @return ErrorCodeNotInit if the file is not open + * @return ErrorCodeBadParam if buf is invalid + * @return ErrorCodeErrno on error + * @return ErrorCodeEndOfFile on EOF + * @return ErrorCodeSuccess on success + */ + ErrorCode try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read); + + /** + * Tries to read a string from the file until it reaches the specified delimiter + * @param delim The delimiter to stop at + * @param keep_delimiter Whether to include the delimiter in the output string or not + * @param append Whether to append to the given string or replace its contents + * @param str The string read + * @return ErrorCodeSuccess on success + * @return ErrorCodeEndOfFile on EOF + * @return ErrorCodeErrno otherwise + */ + ErrorCode try_read_to_delimiter(char delim, bool keep_delimiter, bool append, std::string& str); + + /** + * Tries to read a number of bytes + * @param buf The buffer to read into + * @param num_bytes Number of bytes to read + * @return Same as the underlying medium's try_read method + * @return ErrorCodeTruncated if 0 < # bytes read < num_bytes + */ + ErrorCode try_read_exact_length(char* buf, size_t num_bytes); + + /** + * Tries to read a numeric value + * @tparam ValueType The type of the value to read + * @param value The value read + * @return Same as the underlying medium's try_read_exact_length method + */ + template + ErrorCode try_read_numeric_value(ValueType& value) { + ErrorCode error_code + = try_read_exact_length(reinterpret_cast(&value), sizeof(value)); + if (ErrorCodeSuccess != error_code) { + return error_code; + } + return ErrorCodeSuccess; + } + + /** + * Reads a 
numeric value + * @tparam ValueType The type of the value to read + * @param value The value read + * @param eof_possible Whether EOF is possible or not + * @return true on success + * @return false on EOF if eof_possible is true + * @throw FileReader::OperationFailed on failure + */ + template + bool read_numeric_value(ValueType& value, bool eof_possible) { + ErrorCode error_code = try_read_numeric_value(value); + if (ErrorCodeEndOfFile == error_code && eof_possible) { + return false; + } + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + return true; + } + + // Methods + /** + * Checks if the file is open + * @return true if the file is open, false otherwise + */ + bool is_open() const { return m_file != nullptr; } + + /** + * Tries to open a file + * @param path + * @return ErrorCodeSuccess on success + * @return ErrorCodeFileNotFound if the file was not found + * @return ErrorCodeErrno otherwise + */ + ErrorCode try_open(std::string const& path); + + /** + * Opens a file + * @param path + * @throw FileReader::OperationFailed on failure + */ + void open(std::string const& path); + + /** + * Closes the file if it's open + */ + void close(); + + /** + * Gets the current position of the read head + * @return Position of the read head + */ + size_t get_pos(); + + /** + * Seeks from the beginning to the given position + * @param pos + */ + void seek_from_begin(size_t pos); + +private: + FILE* m_file; + size_t m_getdelim_buf_len; + char* m_getdelim_buf; +}; +} // namespace clp_s + +#endif // CLP_S_FILEREADER_HPP diff --git a/components/core/src/clp_s/FileWriter.cpp b/components/core/src/clp_s/FileWriter.cpp new file mode 100644 index 000000000..49540881b --- /dev/null +++ b/components/core/src/clp_s/FileWriter.cpp @@ -0,0 +1,165 @@ +// Code from CLP + +#include "FileWriter.hpp" + +#include +#include + +#include + +#include + +using std::string; + +namespace clp_s { +FileWriter::~FileWriter() { + if (nullptr != 
m_file) {
+        SPDLOG_ERROR("FileWriter not closed before being destroyed - may cause data loss");
+    }
+}
+
+// Writes `data_length` bytes from `data`; throws OperationFailed if the file isn't open, `data`
+// is null, or fewer bytes than requested were written.
+void FileWriter::write(char const* data, size_t data_length) {
+    ErrorCode error_code = ErrorCodeSuccess;
+    if (nullptr == m_file) {
+        error_code = ErrorCodeNotInit;
+    } else if (nullptr == data) {
+        error_code = ErrorCodeBadParam;
+    } else {
+        size_t num_bytes_written = fwrite(data, sizeof(*data), data_length, m_file);
+        if (num_bytes_written < data_length) {
+            error_code = ErrorCodeErrno;
+        }
+    }
+    if (ErrorCodeSuccess != error_code) {
+        throw OperationFailed(error_code, __FILENAME__, __LINE__);
+    }
+}
+
+// Flushes buffered data to disk. This is a no-op unless FLUSH_TO_DISK_ENABLED is defined to a
+// non-zero value.
+void FileWriter::flush() {
+#if FLUSH_TO_DISK_ENABLED
+    // fflush(nullptr) would flush every open stream, so reject a closed writer explicitly
+    if (nullptr == m_file) {
+        throw OperationFailed(ErrorCodeNotInit, __FILENAME__, __LINE__);
+    }
+
+    // Flush userspace buffers to page cache
+    if (0 != fflush(m_file)) {
+        SPDLOG_ERROR("fflush failed, errno={}", errno);
+        throw OperationFailed(ErrorCodeErrno, __FILENAME__, __LINE__);
+    }
+
+    // Flush page cache pages to disk
+    if (0 != fsync(m_fd)) {
+        SPDLOG_ERROR("fsync failed, errno={}", errno);
+        throw OperationFailed(ErrorCodeErrno, __FILENAME__, __LINE__);
+    }
+#endif
+}
+
+// Throwing wrapper around try_get_pos().
+size_t FileWriter::get_pos() {
+    size_t pos;
+    ErrorCode error_code = try_get_pos(pos);
+    if (ErrorCodeSuccess != error_code) {
+        throw OperationFailed(error_code, __FILENAME__, __LINE__);
+    }
+
+    return pos;
+}
+
+// Reports the current offset of the write head through `pos`. `pos` is only written on success.
+ErrorCode FileWriter::try_get_pos(size_t& pos) const {
+    if (nullptr == m_file) {
+        return ErrorCodeNotInit;
+    }
+
+    // Test the signed result for failure before converting it to the unsigned output parameter
+    off_t const offset = ftello(m_file);
+    if (offset < 0) {
+        return ErrorCodeErrno;
+    }
+    pos = static_cast<size_t>(offset);
+
+    return ErrorCodeSuccess;
+}
+
+// Throwing wrapper around try_seek_from_begin().
+void FileWriter::seek_from_begin(size_t pos) {
+    auto error_code = try_seek_from_begin(pos);
+    if (ErrorCodeSuccess != error_code) {
+        throw OperationFailed(error_code, __FILENAME__, __LINE__);
+    }
+}
+
+// Moves the write head to the absolute offset `pos`.
+ErrorCode FileWriter::try_seek_from_begin(size_t pos) {
+    if (nullptr == m_file) {
+        return ErrorCodeNotInit;
+    }
+
+    int retval = fseeko(m_file, pos, SEEK_SET);
+    if (0 != retval) {
+        return ErrorCodeErrno;
+    }
+
+    return ErrorCodeSuccess;
+}
+
+// Moves the write head by `offset` bytes relative to its current position.
+ErrorCode FileWriter::try_seek_from_current(off_t offset) {
+    if (nullptr == m_file) {
+        return ErrorCodeNotInit;
+    }
+
+    int retval = fseeko(m_file, offset, SEEK_CUR);
+    if (0 != retval) {
+        return ErrorCodeErrno;
+    }
+
+    return ErrorCodeSuccess;
+}
+
+// Opens `path` according to `open_mode`. Throws OperationFailed if this writer already holds a
+// file or if any step of opening fails; on failure the writer is left closed.
+void FileWriter::open(string const& path, OpenMode open_mode) {
+    if (nullptr != m_file) {
+        throw OperationFailed(ErrorCodeNotInit, __FILENAME__, __LINE__);
+    }
+
+    switch (open_mode) {
+        case OpenMode::CreateForWriting:
+            m_file = fopen(path.c_str(), "wb");
+            break;
+        case OpenMode::CreateIfNonexistentForAppending:
+            m_file = fopen(path.c_str(), "ab");
+            break;
+        case OpenMode::CreateIfNonexistentForSeekableWriting: {
+            struct stat stat_buf = {};
+            if (0 == stat(path.c_str(), &stat_buf)) {
+                // File exists, so open it for seekable writing
+                m_file = fopen(path.c_str(), "r+b");
+            } else {
+                if (ENOENT != errno) {
+                    throw OperationFailed(ErrorCodeErrno, __FILENAME__, __LINE__);
+                }
+                // File doesn't exist, so create and open it for seekable writing
+                // NOTE: We can't use the "w+" mode if the file exists since that will truncate the
+                // file
+                m_file = fopen(path.c_str(), "w+b");
+            }
+
+            // Only seek when the open succeeded; a failed open is reported by the null check
+            // after the switch
+            if (nullptr != m_file && 0 != fseek(m_file, 0, SEEK_END)) {
+                // Don't leave a half-initialized stream behind; keep errno from the failed seek
+                int const saved_errno = errno;
+                fclose(m_file);
+                m_file = nullptr;
+                errno = saved_errno;
+                throw OperationFailed(ErrorCodeErrno, __FILENAME__, __LINE__);
+            }
+            break;
+        }
+    }
+    if (nullptr == m_file) {
+        throw OperationFailed(ErrorCodeErrno, __FILENAME__, __LINE__);
+    }
+
+    m_fd = fileno(m_file);
+    if (-1 == m_fd) {
+        // Release the stream and forget the pointer so it can't be used after being closed
+        int const saved_errno = errno;
+        fclose(m_file);
+        m_file = nullptr;
+        errno = saved_errno;
+        throw OperationFailed(ErrorCodeErrno, __FILENAME__, __LINE__);
+    }
+}
+
+// Closes the file if one is open. Throws OperationFailed if fclose() reports an error; the
+// writer is considered closed either way.
+void FileWriter::close() {
+    if (nullptr != m_file) {
+        // The stream must not be touched again after fclose(), even when fclose() fails, so drop
+        // our handles before checking the result
+        FILE* const file = m_file;
+        m_file = nullptr;
+        m_fd = -1;
+        if (0 != fclose(file)) {
+            throw OperationFailed(ErrorCodeErrno, __FILENAME__, __LINE__);
+        }
+    }
+}
+}  // namespace clp_s
diff --git a/components/core/src/clp_s/FileWriter.hpp b/components/core/src/clp_s/FileWriter.hpp
new file mode 100644
index 000000000..edcd3a2c4
--- /dev/null
+++ b/components/core/src/clp_s/FileWriter.hpp
@@ -0,0 +1,122 @@
+// Code from CLP
+
+#ifndef
CLP_S_FILEWRITER_HPP +#define CLP_S_FILEWRITER_HPP + +#include +#include + +#include "ErrorCode.hpp" +#include "TraceableException.hpp" + +namespace clp_s { +class FileWriter { +public: + // Types + enum class OpenMode { + CreateForWriting, + CreateIfNonexistentForAppending, + CreateIfNonexistentForSeekableWriting, + }; + + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; + + // Constructors + FileWriter() : m_file(nullptr), m_fd(-1) {} + + // Destructor + ~FileWriter(); + + // Methods implementing the WriterInterface + /** + * Writes a buffer to the file + * @param data + * @param data_length Length of the buffer + * @throw FileWriter::OperationFailed on failure + */ + void write(char const* data, size_t data_length); + + /** + * Writes a numeric value to the file + * @param val + * @tparam ValueType + */ + template + void write_numeric_value(ValueType val) { + write(reinterpret_cast(&val), sizeof(val)); + } + + /** + * Flushes the file + * @throw FileWriter::OperationFailed on failure + */ + void flush(); + + /** + * Gets the current position of the write head in the file + * @return Position of the write head in the file + * @throw FileWriter::OperationFailed on failure + */ + size_t get_pos(); + + /** + * Tries to get the current position of the write head in the file + * @param pos Position of the write head in the file + * @return ErrorCodeNotInit if the file is not open + * @return ErrorCodeErrno on error + * @return ErrorCodeSuccess on success + */ + ErrorCode try_get_pos(size_t& pos) const; + + /** + * Seeks from the beginning of the file to the given position + * @param pos The position to seek to + * @throw FileWriter::OperationFailed on failure + */ + void seek_from_begin(size_t pos); + + /** + * Tries to seek from the beginning of the file to the given position + * @param 
pos + * @return ErrorCodeNotInit if the file is not open + * @return ErrorCodeErrno on error + * @return ErrorCodeSuccess on success + */ + ErrorCode try_seek_from_begin(size_t pos); + + /** + * Tries to offset from the current position by the given amount + * @param pos + * @return ErrorCodeNotInit if the file is not open + * @return ErrorCodeErrno on error + * @return ErrorCodeSuccess on success + */ + ErrorCode try_seek_from_current(off_t offset); + + // Methods + /** + * Opens a file for writing + * @param path + * @param open_mode The mode to open the file with + * @throw FileWriter::OperationFailed on failure + */ + void open(std::string const& path, OpenMode open_mode); + + /** + * Closes the file + * @throw FileWriter::OperationFailed on failure + */ + void close(); + +private: + FILE* m_file; + int m_fd; +}; +} // namespace clp_s + +#endif // CLP_S_FILEWRITER_HPP diff --git a/components/core/src/clp_s/JsonConstructor.cpp b/components/core/src/clp_s/JsonConstructor.cpp new file mode 100644 index 000000000..433ed76c6 --- /dev/null +++ b/components/core/src/clp_s/JsonConstructor.cpp @@ -0,0 +1,72 @@ +#include "JsonConstructor.hpp" + +#include + +#include "ReaderUtils.hpp" +#include "SchemaTree.hpp" + +namespace clp_s { +JsonConstructor::JsonConstructor(JsonConstructorOption const& option) + : m_output_dir(option.output_dir), + m_archives_dir(option.archives_dir), + m_current_archive_index(0), + m_max_archive_index(0) { + if (false == boost::filesystem::create_directory(m_output_dir)) { + SPDLOG_ERROR("Can not create directory '{}'", m_output_dir); + exit(1); + } + + if (false == boost::filesystem::is_directory(m_archives_dir)) { + SPDLOG_ERROR("'{}' is not a directory", m_archives_dir); + exit(1); + } + + boost::filesystem::directory_iterator iter(m_archives_dir); + boost::filesystem::directory_iterator end; + + for (; iter != end; ++iter) { + if (boost::filesystem::is_directory(iter->path())) { + m_archive_paths.push_back(iter->path().string()); + } + } + + 
if (m_archive_paths.empty()) { + SPDLOG_ERROR("No archives in '{}'", m_archives_dir); + exit(1); + } + + m_max_archive_index = m_archive_paths.size() - 1; +} + +void JsonConstructor::construct() { + constexpr size_t cDecompressorFileReadBufferCapacity = 64 * 1024; // 64 KB + + m_schema_tree = ReaderUtils::read_schema_tree(m_archives_dir); + auto id_to_schema = ReaderUtils::read_schemas(m_archives_dir); + + auto timestamp_dict = ReaderUtils::read_timestamp_dictionary(m_archives_dir); + + m_archive_reader + = std::make_unique(m_schema_tree, *id_to_schema, timestamp_dict); +} + +void JsonConstructor::store() { + FileWriter writer; + writer.open(m_output_dir + "/original", FileWriter::OpenMode::CreateForWriting); + + while (m_current_archive_index <= m_max_archive_index) { + ArchiveReaderOption option; + option.archive_path = m_archive_paths[m_current_archive_index]; + m_archive_reader->open(option); + m_archive_reader->store(writer); + m_archive_reader->close(); + m_current_archive_index++; + } + + writer.close(); +} + +void JsonConstructor::close() { + // archive_reader_->Close(); +} +} // namespace clp_s diff --git a/components/core/src/clp_s/JsonConstructor.hpp b/components/core/src/clp_s/JsonConstructor.hpp new file mode 100644 index 000000000..12a30d9fd --- /dev/null +++ b/components/core/src/clp_s/JsonConstructor.hpp @@ -0,0 +1,59 @@ +#ifndef CLP_S_JSONCONSTRUCTOR_HPP +#define CLP_S_JSONCONSTRUCTOR_HPP + +#include +#include + +#include "ArchiveReader.hpp" +#include "ColumnReader.hpp" +#include "DictionaryReader.hpp" +#include "SchemaReader.hpp" +#include "SchemaTree.hpp" + +namespace clp_s { +struct JsonConstructorOption { + std::string archives_dir; + std::string output_dir; +}; + +class JsonConstructor { +public: + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; + + // 
Constructors
+    explicit JsonConstructor(JsonConstructorOption const& option);
+
+    /**
+     * Reads the schema information
+     */
+    void construct();
+
+    /**
+     * Decompresses each archive and stores the decompressed files in the output directory
+     */
+    void store();
+
+    /**
+     * Closes the JsonConstructor
+     */
+    void close();
+
+private:
+    std::string m_archives_dir;
+    std::string m_output_dir;
+
+    std::unique_ptr m_archive_reader;
+    std::vector m_archive_paths;
+    size_t m_current_archive_index;
+    size_t m_max_archive_index;
+
+    std::shared_ptr m_schema_tree;
+};
+}  // namespace clp_s
+
+#endif  // CLP_S_JSONCONSTRUCTOR_HPP
diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp
new file mode 100644
index 000000000..0b6499502
--- /dev/null
+++ b/components/core/src/clp_s/JsonFileIterator.cpp
@@ -0,0 +1,129 @@
+#include "JsonFileIterator.hpp"
+
+#include
+
+namespace clp_s {
+// Opens `file_name` and parses the first buffer of JSON. If the file can't be opened the
+// iterator is left in a defined, empty state: no buffer, nothing read, and EOF flagged.
+JsonFileIterator::JsonFileIterator(std::string const& file_name, size_t buf_size) {
+    // Give every member a defined value up front so a failed open doesn't leave the object
+    // partially uninitialized
+    m_buf = nullptr;
+    m_buf_size = buf_size;
+    m_buf_occupied = 0;
+    m_bytes_read = 0;
+    m_first_read = true;
+    m_eof = true;
+
+    try {
+        m_reader.open(file_name);
+    } catch (FileReader::OperationFailed const& e) {
+        return;
+    }
+
+    m_eof = false;
+    m_buf = new char[buf_size + simdjson::SIMDJSON_PADDING];
+
+    read_new_json(/*truncated_bytes=*/0);
+}
+
+JsonFileIterator::~JsonFileIterator() {
+    delete[] m_buf;
+    // Close the reader only if it is actually open (the original condition was inverted)
+    if (m_reader.is_open()) {
+        m_reader.close();
+    }
+}
+
+// Refills the buffer from the file and restarts the document stream on it. `truncated_bytes` is
+// the number of bytes at the end of the current buffer that belong to an incomplete document:
+// they are kept at the front of the buffer, and the buffer is doubled when they fill it entirely.
+void JsonFileIterator::read_new_json(size_t truncated_bytes) {
+    if (truncated_bytes == m_buf_size) {
+        // double buffer size to attempt to capture long json object
+        size_t new_buf_size = m_buf_size * 2;
+        char* new_buf = new char[new_buf_size + simdjson::SIMDJSON_PADDING];
+        memcpy(new_buf, m_buf, m_buf_size);
+        delete[] m_buf;
+        m_buf = new_buf;
+        m_buf_size = new_buf_size;
+    } else if (truncated_bytes > 0) {
+        // move bytes to start of buffer
+        memmove(m_buf, m_buf + (m_buf_occupied - truncated_bytes), truncated_bytes);
+        m_buf_occupied = truncated_bytes;
+    } else {
+        m_buf_occupied = 0;
+    }
+
+    size_t size_read = 0;
+    auto error = m_reader.try_read(m_buf + m_buf_occupied, m_buf_size - m_buf_occupied, size_read);
+    m_buf_occupied += size_read;
+    m_bytes_read += size_read;
+
+    // Any read failure (including end of file) stops further refills
+    if (error != ErrorCodeSuccess) {
+        m_eof = true;
+    }
+
+    m_parser.iterate_many(
+                    m_buf,
+                    /* length of data */ m_buf_occupied,
+                    /* batch size of data to parse*/ m_buf_occupied
+    )
+            .get(m_stream);
+
+    m_doc_it = m_stream.begin();
+    // only implements != so this is equivalent to
+    // if no json available from buffer and we haven't hit eof
+    // then retry reading the json with a larger buffer up to eof
+    if (false == (m_doc_it != m_stream.end()) && false == m_eof) {
+        read_new_json(m_stream.truncated_bytes());
+    }
+}
+
+// Advances to the next JSON document and hands it back through `it`. Returns false when no
+// further document could be produced.
+bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& it) {
+    if (false == m_first_read) {
+        ++m_doc_it;
+    } else {
+        m_first_read = false;
+    }
+
+    size_t patch_truncated_bytes = 0;
+    if (m_doc_it != m_stream.end()) {
+        if (m_doc_it.error() == simdjson::error_code::SUCCESS) {
+            it = m_doc_it;
+            return true;
+        } else if (m_doc_it.error() == simdjson::error_code::UTF8_ERROR) {
+            patch_truncated_bytes
+                    = reverse_search_newline_truncated_bytes(m_doc_it.current_index());
+        }
+    } else if (m_eof) {
+        return false;
+    }
+
+    // there is a bug in simdjson where when invalid utf8 is encountered
+    // at the end of the stream truncated bytes isn't set correctly.
+    // Work around this limitation by manually searching for the start
+    // of the erroring document and setting truncated bytes appropriately
+    if (patch_truncated_bytes == 0) {
+        read_new_json(m_stream.truncated_bytes());
+    } else {
+        read_new_json(patch_truncated_bytes);
+    }
+
+    if (m_doc_it != m_stream.end()) {
+        if (m_doc_it.error() == simdjson::error_code::SUCCESS) {
+            it = m_doc_it;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+// Searches backwards from `start` for the newline preceding the document at that position and
+// returns the number of buffered bytes that follow it, i.e. the bytes to carry over into the
+// next read. If no newline precedes `start`, the whole buffer is reported.
+size_t JsonFileIterator::reverse_search_newline_truncated_bytes(size_t start) {
+    if (m_buf_occupied == 0) {
+        return 0;
+    }
+
+    // Clamp to the last occupied byte. `>=` matters: with start == m_buf_occupied the search
+    // would read a byte past the occupied data, and a match there would underflow the result.
+    if (start >= m_buf_occupied) {
+        start = m_buf_occupied - 1;
+    }
+
+    while (start > 0 && m_buf[start] != '\n') {
+        --start;
+    }
+
+    if (m_buf[start] != '\n') {
+        // No newline before the document, so it begins at the start of the buffer; keep every
+        // byte rather than dropping the first one
+        return m_buf_occupied;
+    }
+
+    return m_buf_occupied - start - 1;
+}
+}  // namespace clp_s
diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp
new file mode 100644
index 000000000..2677875b4
--- /dev/null
+++ b/components/core/src/clp_s/JsonFileIterator.hpp
@@ -0,0 +1,75 @@
+#ifndef CLP_S_JSONFILEITERATOR_HPP
+#define CLP_S_JSONFILEITERATOR_HPP
+
+#include
+
+#include "FileReader.hpp"
+
+namespace clp_s {
+class JsonFileIterator {
+public:
+    /**
+     * An iterator over a file containing json objects. JSON is parsed
+     * using simdjson::parse_many. This allows simdjson to efficiently find
+     * delimiters between JSON objects, and if enabled parse JSON ahead of time
+     * in another thread while the JSON is being iterated over.
+     *
+     * The buffer grows automatically if there are JSON objects larger than the buffer size.
+     * The buffer is padded to be SIMDJSON_PADDING bytes larger than the specified size.
+ + * @param file_name the file containing JSON + * @param buf_size the initial buffer size + */ + explicit JsonFileIterator( + std::string const& file_name, + size_t buf_size = 1024 * 1024 /*1MB default*/ + ); + ~JsonFileIterator(); + + /** + * Reads the next JSON document and returns it in the it argument + * @param it an iterator to the JSON object that gets returned + * @return true if the iterator is valid, false otherwise + */ + bool get_json(simdjson::ondemand::document_stream::iterator& it); + + /** + * Checks if the file is open + * @return true if the file opened successfully + */ + bool is_open() { return m_reader.is_open(); } + + /** + * @return number of truncated bytes after json documents + */ + size_t truncated_bytes() { + if (m_stream.size_in_bytes() != 0) { + return m_stream.truncated_bytes(); + } + return 0; + } + +private: + /** + * Reads new JSON into the buffer and initializes iterators into the data. + * If the buffer is not large enough to contain the JSON its size is doubled. 
+ * @param truncated_bytes length of incomplete JSON at end of buffer in bytes + */ + void read_new_json(size_t truncated_bytes); + + size_t reverse_search_newline_truncated_bytes(size_t start); + + size_t m_bytes_read; + size_t m_buf_size; + size_t m_buf_occupied; + char* m_buf; + FileReader m_reader; + simdjson::ondemand::parser m_parser; + simdjson::ondemand::document_stream m_stream; + bool m_eof; + bool m_first_read; + simdjson::ondemand::document_stream::iterator m_doc_it; +}; +} // namespace clp_s + +#endif // CLP_S_JSONFILEITERATOR_HPP diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp new file mode 100644 index 000000000..d13bbc729 --- /dev/null +++ b/components/core/src/clp_s/JsonParser.cpp @@ -0,0 +1,298 @@ +#include "JsonParser.hpp" + +#include +#include + +#include "JsonFileIterator.hpp" + +namespace clp_s { +JsonParser::JsonParser(JsonParserOption const& option) + : m_archives_dir(option.archives_dir), + m_num_messages(0), + m_compression_level(option.compression_level), + m_target_encoded_size(option.target_encoded_size), + m_timestamp_column(option.timestamp_column) { + if (false == boost::filesystem::create_directory(m_archives_dir)) { + SPDLOG_ERROR("The output directory '{}' already exists", m_archives_dir); + exit(1); + } + + if (false == FileUtils::validate_path(option.file_paths)) { + exit(1); + } + + for (auto& file_path : option.file_paths) { + FileUtils::find_all_files(file_path, m_file_paths); + } + + m_schema_tree = std::make_shared(); + m_schema_tree_path = m_archives_dir + "/schema_tree"; + + m_schema_map = std::make_shared(m_archives_dir, m_compression_level); + + m_timestamp_dictionary = std::make_shared(); + m_timestamp_dictionary->open(m_archives_dir + "/timestamp.dict", option.compression_level); + + ArchiveWriterOption archive_writer_option; + archive_writer_option.archives_dir = m_archives_dir; + archive_writer_option.id = m_generator(); + archive_writer_option.compression_level = 
option.compression_level; + + m_archive_writer = std::make_unique(m_schema_tree, m_timestamp_dictionary); + m_archive_writer->open(archive_writer_option); +} + +void JsonParser::parse_line(ondemand::value line, int32_t parent_node_id, std::string const& key) { + int32_t node_id; + std::stack object_stack; + std::stack node_id_stack; + std::stack object_it_stack; + + ondemand::field cur_field; + + std::string cur_key = key; + node_id_stack.push(parent_node_id); + + bool can_match_timestamp = !m_timestamp_column.empty(); + bool may_match_timestamp = can_match_timestamp; + int longest_matching_timestamp_prefix = 0; + bool matches_timestamp = false; + + do { + if (false == object_stack.empty()) { + cur_field = *object_it_stack.top(); + cur_key = std::string(std::string_view(cur_field.unescaped_key(true))); + line = cur_field.value(); + if (may_match_timestamp) { + if (object_stack.size() <= m_timestamp_column.size() + && cur_key == m_timestamp_column[object_stack.size() - 1]) + { + if (object_stack.size() == m_timestamp_column.size()) { + // FIXME: technically need to handle the case where this + // isn't a string or number column by resetting matches_timestamp + // to false + matches_timestamp = true; + } + } else { + longest_matching_timestamp_prefix = object_stack.size() - 1; + may_match_timestamp = false; + } + } + } + + switch (line.type()) { + case ondemand::json_type::object: { + node_id = m_schema_tree->add_node(node_id_stack.top(), NodeType::OBJECT, cur_key); + object_stack.push(std::move(line.get_object())); + auto objref = object_stack.top(); + auto it = ondemand::object_iterator(objref.begin()); + if (it == objref.end()) { + m_current_schema.insert(node_id); + object_stack.pop(); + break; + } else { + object_it_stack.push(it); + node_id_stack.push(node_id); + continue; + } + } + case ondemand::json_type::array: { + std::string value = std::string(std::string_view(simdjson::to_json_string(line))); + node_id = m_schema_tree->add_node(node_id_stack.top(), 
NodeType::ARRAY, cur_key); + m_current_parsed_message.add_value(node_id, value); + m_current_schema.insert(node_id); + break; + } + case ondemand::json_type::number: { + NodeType type; + ondemand::number number_value = line.get_number(); + if (false == number_value.is_double()) { + // FIXME: should have separate integer and unsigned + // integer types to handle values greater than max int64 + type = NodeType::INTEGER; + } else { + type = NodeType::FLOAT; + } + node_id = m_schema_tree->add_node(node_id_stack.top(), type, cur_key); + + if (type == NodeType::INTEGER) { + int64_t i64_value; + if (number_value.is_uint64()) { + i64_value = static_cast(number_value.get_uint64()); + } else { + i64_value = line.get_int64(); + } + + m_current_parsed_message.add_value(node_id, i64_value); + if (matches_timestamp) { + m_timestamp_dictionary->ingest_entry(cur_key, i64_value); + matches_timestamp = may_match_timestamp = can_match_timestamp = false; + } + } else { + double double_value = line.get_double(); + m_current_parsed_message.add_value(node_id, double_value); + if (matches_timestamp) { + m_timestamp_dictionary->ingest_entry(cur_key, double_value); + matches_timestamp = may_match_timestamp = can_match_timestamp = false; + } + } + m_current_schema.insert(node_id); + break; + } + case ondemand::json_type::string: { + // TODO (Rui): Take a look + std::string value = std::string( + line.raw_json_token().substr(1, line.raw_json_token().size() - 2) + ); + if (matches_timestamp) { + double ret_double; + if (StringUtils::convert_string_to_double(value, ret_double)) { + node_id = m_schema_tree->add_node( + node_id_stack.top(), + NodeType::FLOATDATESTRING, + cur_key + ); + m_current_parsed_message.add_value(node_id, ret_double); + } else { + node_id = m_schema_tree->add_node( + node_id_stack.top(), + NodeType::DATESTRING, + cur_key + ); + m_current_parsed_message.add_value(node_id, value); + } + matches_timestamp = may_match_timestamp = can_match_timestamp = false; + } else if 
(value.find(' ') != std::string::npos) { + node_id = m_schema_tree + ->add_node(node_id_stack.top(), NodeType::CLPSTRING, cur_key); + m_current_parsed_message.add_value(node_id, value); + } else { + node_id = m_schema_tree + ->add_node(node_id_stack.top(), NodeType::VARSTRING, cur_key); + m_current_parsed_message.add_value(node_id, value); + } + + m_current_schema.insert(node_id); + break; + } + case ondemand::json_type::boolean: { + bool value = line.get_bool(); + node_id = m_schema_tree->add_node(node_id_stack.top(), NodeType::BOOLEAN, cur_key); + + m_current_parsed_message.add_value(node_id, value); + m_current_schema.insert(node_id); + break; + } + case ondemand::json_type::null: { + node_id = m_schema_tree + ->add_node(node_id_stack.top(), NodeType::NULLVALUE, cur_key); + m_current_schema.insert(node_id); + break; + } + } + + if (object_stack.empty()) { + break; + } + + bool hit_end; + do { + hit_end = false; + ++object_it_stack.top(); + if (object_it_stack.top() == object_stack.top().end()) { + object_it_stack.pop(); + object_stack.pop(); + node_id_stack.pop(); + hit_end = true; + } + if (can_match_timestamp + && (object_it_stack.size() - 1) <= longest_matching_timestamp_prefix) + { + may_match_timestamp = true; + } + } while (!object_it_stack.empty() && hit_end); + } + + while (!object_stack.empty()); +} + +void JsonParser::parse() { + for (auto& file_path : m_file_paths) { + JsonFileIterator json_file_iterator(file_path); + if (false == json_file_iterator.is_open()) { + return; + } + + simdjson::ondemand::document_stream::iterator json_it; + + m_num_messages = 0; + + while (json_file_iterator.get_json(json_it)) { + m_current_schema.clear(); + + parse_line((*json_it).value(), -1, "root"); + m_num_messages++; + + int32_t current_schema_id = m_schema_map->add_schema(m_current_schema); + m_current_parsed_message.set_id(current_schema_id); + + if (m_archive_writer->get_data_size() >= m_target_encoded_size) { + split_archive(); + } + + m_archive_writer + 
->append_message(current_schema_id, m_current_schema, m_current_parsed_message); + m_current_parsed_message.clear(); + } + + if (json_file_iterator.truncated_bytes() > 0) { + SPDLOG_ERROR( + "Truncated JSON ({} bytes) at end of file {}", + json_file_iterator.truncated_bytes(), + file_path.c_str() + ); + } + } +} + +void JsonParser::store() { + FileWriter schema_tree_writer; + ZstdCompressor schema_tree_compressor; + + schema_tree_writer.open(m_schema_tree_path, FileWriter::OpenMode::CreateForWriting); + schema_tree_compressor.open(schema_tree_writer, m_compression_level); + + auto nodes = m_schema_tree->get_nodes(); + schema_tree_compressor.write_numeric_value(nodes.size()); + for (auto const& node : nodes) { + schema_tree_compressor.write_numeric_value(node->get_parent_id()); + + std::string const& key = node->get_key_name(); + schema_tree_compressor.write_numeric_value(key.size()); + schema_tree_compressor.write_string(key); + schema_tree_compressor.write_numeric_value(node->get_type()); + } + + schema_tree_compressor.close(); + schema_tree_writer.close(); + + m_schema_map->store(); + + m_timestamp_dictionary->close(); +} + +void JsonParser::split_archive() { + m_archive_writer->close(); + + ArchiveWriterOption archive_writer_option; + archive_writer_option.archives_dir = m_archives_dir; + archive_writer_option.id = m_generator(); + archive_writer_option.compression_level = m_compression_level; + + m_archive_writer->open(archive_writer_option); +} + +void JsonParser::close() { + m_archive_writer->close(); +} +} // namespace clp_s diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp new file mode 100644 index 000000000..96250129e --- /dev/null +++ b/components/core/src/clp_s/JsonParser.hpp @@ -0,0 +1,101 @@ +#ifndef CLP_S_JSONPARSER_HPP +#define CLP_S_JSONPARSER_HPP + +#include +#include +#include +#include +#include + +#include +#include + +#include "ArchiveWriter.hpp" +#include "DictionaryWriter.hpp" +#include 
"FileReader.hpp" +#include "FileWriter.hpp" +#include "ParsedMessage.hpp" +#include "SchemaMap.hpp" +#include "SchemaTree.hpp" +#include "SchemaWriter.hpp" +#include "TimestampDictionaryWriter.hpp" +#include "Utils.hpp" +#include "ZstdCompressor.hpp" + +using namespace simdjson; + +namespace clp_s { +struct JsonParserOption { + std::vector file_paths; + std::vector timestamp_column; + std::string archives_dir; + size_t target_encoded_size; + int compression_level; +}; + +class JsonParser { +public: + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; + + // Constructor + explicit JsonParser(JsonParserOption const& option); + + // Destructor + ~JsonParser() = default; + + /** + * Parses the JSON log messages and store the parsed data in the archive. + */ + void parse(); + + /** + * Writes the metadata and archive data to disk. + */ + void store(); + + /** + * Closes the archive and clean up. 
+ */ + void close(); + +private: + /** + * Parses a JSON line + * @param line the JSON line + * @param parent_node_id the parent node id + * @param key the key of the node + */ + void parse_line(ondemand::value line, int32_t parent_node_id, std::string const& key); + + /** + * Splits the archive if the size of the archive exceeds the maximum size + */ + void split_archive(); + + int m_num_messages; + int m_compression_level; + std::vector m_file_paths; + std::string m_archives_dir; + std::string m_schema_tree_path; + + std::set m_current_schema; + std::shared_ptr m_schema_map; + + std::shared_ptr m_schema_tree; + ParsedMessage m_current_parsed_message; + std::shared_ptr m_timestamp_dictionary; + + std::vector m_timestamp_column; + + boost::uuids::random_generator m_generator; + std::unique_ptr m_archive_writer; + size_t m_target_encoded_size; +}; +} // namespace clp_s + +#endif // CLP_S_JSONPARSER_HPP diff --git a/components/core/src/clp_s/JsonSerializer.hpp b/components/core/src/clp_s/JsonSerializer.hpp new file mode 100644 index 000000000..9c45e6b27 --- /dev/null +++ b/components/core/src/clp_s/JsonSerializer.hpp @@ -0,0 +1,83 @@ +#ifndef CLP_S_JSONSERIALIZER_HPP +#define CLP_S_JSONSERIALIZER_HPP + +#include +#include + +class JsonSerializer { +public: + enum Op : uint8_t { + BeginObject, + EndObject, + AddIntField, + AddFloatField, + AddBoolField, + AddStringField, + AddArrayField, + AddNullField, + }; + + static int64_t const cReservedLength = 4096; + + explicit JsonSerializer(int64_t reserved_length = cReservedLength) : m_op_list_index(0), m_special_keys_index(0) { + m_json_string.reserve(reserved_length > 0 ? reserved_length : 0); // honor the caller's size hint; ignore non-positive values + } + + std::string& get_serialized_string() { return m_json_string; } + + void reset() { + m_json_string.clear(); + m_op_list_index = 0; + m_special_keys_index = 0; + } + + void add_op(Op op) { m_op_list.push_back(op); } + + std::vector& get_op_list() { return m_op_list; } + + bool get_next_op(Op& op) { + if (m_op_list_index < m_op_list.size()) { + op = 
m_op_list[m_op_list_index++]; + return true; + } + return false; + } + + void add_special_key(std::string const& key) { m_special_keys.push_back(key); } + + void begin_object() { + append_key(); + m_json_string += "{"; + } + + void begin_document() { m_json_string += "{"; } + + void end_document() { if (false == m_json_string.empty() && ',' == m_json_string.back()) { m_json_string.back() = '}'; } else { m_json_string += '}'; } } // turn the trailing ',' into '}', or emit "{}" for an empty document + + void end_object() { + if (m_op_list[m_op_list_index - 2] != BeginObject) { + m_json_string.pop_back(); + } + m_json_string += "},"; + } + + void append_key() { append_key(m_special_keys[m_special_keys_index++]); } + + void append_key(std::string const& key) { m_json_string += "\"" + key + "\":"; } + + void append_value(std::string const& value) { m_json_string += value + ","; } + + void append_value_with_quotes(std::string const& value) { + m_json_string += "\"" + value + "\","; + } + +private: + std::string m_json_string; + std::vector m_op_list; + std::vector m_special_keys; + + size_t m_op_list_index; + size_t m_special_keys_index; +}; + +#endif // CLP_S_JSONSERIALIZER_HPP diff --git a/components/core/src/clp_s/ParsedMessage.hpp b/components/core/src/clp_s/ParsedMessage.hpp new file mode 100644 index 000000000..769440778 --- /dev/null +++ b/components/core/src/clp_s/ParsedMessage.hpp @@ -0,0 +1,54 @@ +#ifndef CLP_S_PARSEDMESSAGE_HPP +#define CLP_S_PARSEDMESSAGE_HPP + +#include +#include +#include +#include + +namespace clp_s { +class ParsedMessage { +public: + // Constructor + ParsedMessage() : m_schema_id(-1) {} + + // Destructor + ~ParsedMessage() = default; + + void set_id(int32_t schema_id) { m_schema_id = schema_id; } + + /** + * Adds a value with different types to the message + * @param node_id + * @param value + */ + inline void add_value(int32_t node_id, int64_t value) { m_message[node_id] = value; } + + inline void add_value(int32_t node_id, double value) { m_message[node_id] = value; } + + inline void add_value(int32_t node_id, std::string const& value) { m_message[node_id] = value; } + + inline void 
add_value(int32_t node_id, bool value) { m_message[node_id] = value; } + + /** + * Clears the message + */ + void clear() { + m_schema_id = -1; + m_message.clear(); + } + + /** + * @return The content of the message + */ + std::map>& get_content() { + return m_message; + } + +private: + int32_t m_schema_id; + std::map> m_message; +}; +} // namespace clp_s + +#endif // CLP_S_PARSEDMESSAGE_HPP diff --git a/components/core/src/clp_s/ReaderUtils.cpp b/components/core/src/clp_s/ReaderUtils.cpp new file mode 100644 index 000000000..2b0d94d27 --- /dev/null +++ b/components/core/src/clp_s/ReaderUtils.cpp @@ -0,0 +1,231 @@ +#include "ReaderUtils.hpp" + +namespace clp_s { +std::shared_ptr ReaderUtils::read_schema_tree(std::string const& archives_dir) { + FileReader schema_tree_reader; + ZstdDecompressor schema_tree_decompressor; + + std::shared_ptr tree = std::make_shared(); + + schema_tree_reader.open(archives_dir + "/schema_tree"); + schema_tree_decompressor.open(schema_tree_reader, cDecompressorFileReadBufferCapacity); + + size_t num_nodes; + auto error_code = schema_tree_decompressor.try_read_numeric_value(num_nodes); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + + for (size_t i = 0; i < num_nodes; i++) { + int32_t parent_id; + size_t key_length; + std::string key; + uint8_t node_type; + + error_code = schema_tree_decompressor.try_read_numeric_value(parent_id); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + + error_code = schema_tree_decompressor.try_read_numeric_value(key_length); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + + error_code = schema_tree_decompressor.try_read_string(key_length, key); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + + error_code = schema_tree_decompressor.try_read_numeric_value(node_type); + if 
(ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + + tree->add_node(parent_id, (NodeType)node_type, key); + } + + schema_tree_decompressor.close(); + schema_tree_reader.close(); + + return tree; +} + +std::shared_ptr ReaderUtils::get_variable_dictionary_reader( + std::string const& archive_path +) { + auto reader = std::make_shared(); + reader->open(archive_path + "/var.dict"); + return reader; +} + +std::shared_ptr ReaderUtils::get_log_type_dictionary_reader( + std::string const& archive_path +) { + auto reader = std::make_shared(); + reader->open(archive_path + "/log.dict"); + return reader; +} + +std::shared_ptr ReaderUtils::get_array_dictionary_reader( + std::string const& archive_path +) { + auto reader = std::make_shared(); + reader->open(archive_path + "/array.dict"); + return reader; +} + +std::shared_ptr ReaderUtils::read_schemas(std::string const& archives_dir) { + auto schemas_pointer = std::make_shared(); + SchemaMap& schemas = *schemas_pointer; + FileReader schema_id_reader; + ZstdDecompressor schema_id_decompressor; + + schema_id_reader.open(archives_dir + "/schema_ids"); + schema_id_decompressor.open(schema_id_reader, cDecompressorFileReadBufferCapacity); + + size_t schema_size; + auto error_code = schema_id_decompressor.try_read_numeric_value(schema_size); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + + for (size_t i = 0; i < schema_size; i++) { + int32_t schema_id; + error_code = schema_id_decompressor.try_read_numeric_value(schema_id); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + + size_t schema_node_size; + error_code = schema_id_decompressor.try_read_numeric_value(schema_node_size); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + + std::set& schema = schemas[schema_id]; + for (size_t j = 0; j < schema_node_size; j++) 
{ + int32_t node_id; + error_code = schema_id_decompressor.try_read_numeric_value(node_id); + if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + + schema.insert(node_id); + } + } + + schema_id_decompressor.close(); + schema_id_reader.close(); + + return schemas_pointer; +} + +std::shared_ptr ReaderUtils::read_timestamp_dictionary( + std::string const& archives_dir +) { + auto reader = std::make_shared(); + reader->open(archives_dir + "/timestamp.dict"); + reader->read_new_entries(); + reader->close(); + + return reader; +} + +std::shared_ptr ReaderUtils::read_local_timestamp_dictionary( + std::string const& archive_path +) { + auto reader = std::make_shared(); + reader->open(archive_path + "/timestamp.dict"); + reader->read_local_entries(); + reader->close(); + + return reader; +} + +std::vector ReaderUtils::get_archives(std::string const& archives_dir) { + std::vector archive_paths; + + if (false == boost::filesystem::is_directory(archives_dir)) { + throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__); + } + + boost::filesystem::directory_iterator iter(archives_dir); + boost::filesystem::directory_iterator end; + for (; iter != end; ++iter) { + if (boost::filesystem::is_directory(iter->path())) { + archive_paths.push_back(iter->path().string()); + } + } + + return archive_paths; +} + +std::vector ReaderUtils::get_schemas(std::string const& archive_path) { + std::vector schemas; + std::string encoded_messages_dir = archive_path + "/encoded_messages"; + + boost::filesystem::directory_iterator iter(encoded_messages_dir); + boost::filesystem::directory_iterator end; + + for (; iter != end; ++iter) { + if (boost::filesystem::is_regular_file(iter->path())) { + std::string schema = iter->path().rbegin()->string(); + if (false == schema.empty() && std::all_of(schema.begin(), schema.end(), ::isdigit)) { + schemas.push_back(std::stoi(schema)); + } + } + } + + return schemas; +} + +void 
ReaderUtils::append_reader_columns( + SchemaReader* reader, + std::set& columns, + std::shared_ptr const& schema_tree, + std::shared_ptr const& var_dict, + std::shared_ptr const& log_dict, + std::shared_ptr const& array_dict, + std::shared_ptr const& timestamp_dict +) { + for (int32_t column : columns) { + auto node = schema_tree->get_node(column); + std::string key_name = node->get_key_name(); + switch (node->get_type()) { + case NodeType::INTEGER: + reader->append_column(new Int64ColumnReader(key_name, column)); + break; + case NodeType::FLOAT: + reader->append_column(new FloatColumnReader(key_name, column)); + break; + case NodeType::CLPSTRING: + reader->append_column( + new ClpStringColumnReader(key_name, column, var_dict, log_dict) + ); + break; + case NodeType::VARSTRING: + reader->append_column(new VariableStringColumnReader(key_name, column, var_dict)); + break; + case NodeType::BOOLEAN: + reader->append_column(new BooleanColumnReader(key_name, column)); + break; + case NodeType::ARRAY: + reader->append_column( + new ClpStringColumnReader(key_name, column, var_dict, array_dict, true) + ); + break; + case NodeType::DATESTRING: + reader->append_column(new DateStringColumnReader(key_name, column, timestamp_dict)); + break; + case NodeType::FLOATDATESTRING: + reader->append_column(new FloatDateStringColumnReader(key_name, column)); + break; + case NodeType::OBJECT: + case NodeType::NULLVALUE: + reader->append_column(column); + break; + } + } +} +} // namespace clp_s diff --git a/components/core/src/clp_s/ReaderUtils.hpp b/components/core/src/clp_s/ReaderUtils.hpp new file mode 100644 index 000000000..074dc056e --- /dev/null +++ b/components/core/src/clp_s/ReaderUtils.hpp @@ -0,0 +1,118 @@ +#ifndef CLP_S_READERUTILS_HPP +#define CLP_S_READERUTILS_HPP + +#include "DictionaryReader.hpp" +#include "SchemaReader.hpp" +#include "SchemaTree.hpp" +#include "TimestampDictionaryReader.hpp" +#include "TraceableException.hpp" + +namespace clp_s { +class ReaderUtils { 
+public: + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; + + typedef std::map> SchemaMap; + static constexpr size_t cDecompressorFileReadBufferCapacity = 64 * 1024; // 64 KB + + /** + * Reads the schema tree from the given archive directory + * @param archives_dir + * @return The schema tree + */ + static std::shared_ptr read_schema_tree(std::string const& archives_dir); + + /** + * Opens and gets the variable dictionary reader for the given archive path + * @param archive_path + * @return the variable dictionary reader + */ + static std::shared_ptr get_variable_dictionary_reader( + std::string const& archive_path + ); + + /** + * Opens and gets the log type dictionary reader for the given archive path + * @param archive_path + * @return the log type dictionary reader + */ + static std::shared_ptr get_log_type_dictionary_reader( + std::string const& archive_path + ); + + /** + * Opens and gets the array dictionary reader for the given archive path + * @param archive_path + * @return the array dictionary reader + */ + static std::shared_ptr get_array_dictionary_reader( + std::string const& archive_path + ); + + /** + * Reads the schema map from the given archive directory + * @param archives_dir + * @return the schema map + */ + static std::shared_ptr read_schemas(std::string const& archives_dir); + + /** + * Opens the timestamp dictionary in the given archives directory, reads its entries and closes it + * @param archives_dir + * @return the timestamp dictionary reader + */ + static std::shared_ptr read_timestamp_dictionary( + std::string const& archives_dir + ); + + /** + * Opens and gets the local timestamp dictionary reader for the given archive path + * @param archive_path + * @return the timestamp dictionary reader + */ + static std::shared_ptr read_local_timestamp_dictionary( + std::string const& 
archive_path
+    );
+
+    /**
+     * Gets the list of archives in the given archive directory
+     * @param archives_dir
+     * @return the list of archives
+     */
+    static std::vector get_archives(std::string const& archives_dir);
+
+    /**
+     * Gets the list of schemas in the given archive
+     * @param archive_path
+     * @return the list of schemas
+     */
+    static std::vector get_schemas(std::string const& archive_path);
+
+    /**
+     * Append a set of columns to the given schema reader
+     * @param reader
+     * @param columns
+     * @param schema_tree
+     * @param var_dict
+     * @param log_dict
+     * @param array_dict
+     * @param timestamp_dict
+     */
+    static void append_reader_columns(
+            SchemaReader* reader,
+            std::set& columns,
+            std::shared_ptr const& schema_tree,
+            std::shared_ptr const& var_dict,
+            std::shared_ptr const& log_dict,
+            std::shared_ptr const& array_dict,
+            std::shared_ptr const& timestamp_dict
+    );
+};
+}  // namespace clp_s
+
+#endif  // CLP_S_READERUTILS_HPP
diff --git a/components/core/src/clp_s/SchemaMap.cpp b/components/core/src/clp_s/SchemaMap.cpp
new file mode 100644
index 000000000..65af81feb
--- /dev/null
+++ b/components/core/src/clp_s/SchemaMap.cpp
@@ -0,0 +1,37 @@
+#include "SchemaMap.hpp"
+
+#include "FileWriter.hpp"
+#include "ZstdCompressor.hpp"
+
+namespace clp_s {
+int32_t SchemaMap::add_schema(std::set& schema) {
+    auto schema_it = m_schema_map.find(schema);
+    if (schema_it != m_schema_map.end()) {
+        return schema_it->second;
+    } else {
+        m_schema_map[schema] = m_current_schema_id;
+        return m_current_schema_id++;
+    }
+}
+
+void SchemaMap::store() {
+    FileWriter schema_map_writer;
+    ZstdCompressor schema_map_compressor;
+
+    // TODO: rename schema_ids -> schema_map, and use int32_t for schema size
+    schema_map_writer.open(m_archives_dir + "/schema_ids", FileWriter::OpenMode::CreateForWriting);
+    schema_map_compressor.open(schema_map_writer, m_compression_level);
+    schema_map_compressor.write_numeric_value(m_schema_map.size());
+    for (auto const& schema_mapping : m_schema_map) {
+        auto const& schema = schema_mapping.first;
+        schema_map_compressor.write_numeric_value(schema_mapping.second);
+        schema_map_compressor.write_numeric_value(schema.size());
+        for (int32_t mst_node_id : schema) {
+            schema_map_compressor.write_numeric_value(mst_node_id);
+        }
+    }
+
+    schema_map_compressor.close();
+    schema_map_writer.close();
+}
+}  // namespace clp_s
diff --git a/components/core/src/clp_s/SchemaMap.hpp b/components/core/src/clp_s/SchemaMap.hpp
new file mode 100644
index 000000000..f1cb9a087
--- /dev/null
+++ b/components/core/src/clp_s/SchemaMap.hpp
@@ -0,0 +1,48 @@
+#ifndef CLP_S_SCHEMAMAP_HPP
+#define CLP_S_SCHEMAMAP_HPP
+
+#include
+#include
+#include
+
+namespace clp_s {
+class SchemaMap {
+public:
+    typedef std::map, int32_t> schema_map_t;
+
+    // Constructor
+    explicit SchemaMap(std::string const& archives_dir, int compression_level)
+            : m_archives_dir(archives_dir),
+              m_compression_level(compression_level),
+              m_current_schema_id(0) {}
+
+    /**
+     * Return a schema's Id and add the schema to the
+     * schema map if it does not already exist.
+     * @param schema
+     * @return the Id of the schema
+     */
+    int32_t add_schema(std::set& schema);
+
+    /**
+     * Write the contents of the SchemaMap to archives_dir/schema_ids
+     */
+    void store();
+
+    /**
+     * Get const iterators into the schema map
+     * @return const it to the schema map
+     */
+    schema_map_t::const_iterator schema_map_begin() const { return m_schema_map.cbegin(); }
+
+    schema_map_t::const_iterator schema_map_end() const { return m_schema_map.cend(); }
+
+private:
+    std::string m_archives_dir;
+    int m_compression_level;
+    int32_t m_current_schema_id;
+    schema_map_t m_schema_map;
+};
+}  // namespace clp_s
+
+#endif  // CLP_S_SCHEMAMAP_HPP
diff --git a/components/core/src/clp_s/SchemaReader.cpp b/components/core/src/clp_s/SchemaReader.cpp
new file mode 100644
index 000000000..b55f6feae
--- /dev/null
+++ b/components/core/src/clp_s/SchemaReader.cpp
@@ -0,0 +1,293 @@
+#include "SchemaReader.hpp"
+
+namespace clp_s {
+void SchemaReader::open(std::string path) {
+    m_path = std::move(path);
+    m_local_schema_tree = std::make_shared();
+}
+
+void SchemaReader::close() {
+    for (auto& i : m_columns) {
+        delete i;
+    }
+    // Drop the now-dangling pointers so that a repeated close() cannot delete them twice
+    m_columns.clear();
+    m_reordered_columns.clear();
+
+    m_column_map.clear();
+    m_global_id_to_local_id.clear();
+    m_local_id_to_global_id.clear();
+}
+
+void SchemaReader::append_column(BaseColumnReader* column_reader) {
+    m_column_map[column_reader->get_id()] = column_reader;
+    m_columns.push_back(column_reader);
+    generate_local_tree(column_reader->get_id());
+}
+
+void SchemaReader::append_column(int32_t id) {
+    generate_local_tree(id);
+}
+
+void SchemaReader::load() {
+    constexpr size_t cDecompressorFileReadBufferCapacity = 64 * 1024;  // 64 KB
+
+    m_file_reader.open(m_path);
+    m_decompressor.open(m_file_reader, cDecompressorFileReadBufferCapacity);
+
+    // The message count is read straight from the file reader, not through the decompressor
+    m_file_reader.seek_from_begin(0);
+    m_file_reader.read_numeric_value(m_num_messages, false);
+
+    for (auto& reader : m_columns) {
+        reader->load(m_decompressor, m_num_messages);
+    }
+
+    m_decompressor.close();
+    m_file_reader.close();
+
+    generate_json_template(0);
+}
+
+bool SchemaReader::get_next_message(std::string& message) {
+    if (m_cur_message >= m_num_messages) {
+        return false;
+    }
+
+    m_json_serializer->reset();
+    m_json_serializer->begin_document();
+    size_t column_id_index = 0;
+    BaseColumnReader* column;
+    JsonSerializer::Op op;
+    while (m_json_serializer->get_next_op(op)) {
+        switch (op) {
+            case JsonSerializer::Op::BeginObject: {
+                m_json_serializer->begin_object();
+                break;
+            }
+            case JsonSerializer::Op::EndObject: {
+                m_json_serializer->end_object();
+                break;
+            }
+            case JsonSerializer::Op::AddIntField: {
+                column = m_reordered_columns[column_id_index++];
+                m_json_serializer->append_key(column->get_name());
+                m_json_serializer->append_value(
+                        std::to_string(std::get(column->extract_value(m_cur_message)))
+                );
+                break;
+            }
+            case JsonSerializer::Op::AddFloatField: {
+                column = m_reordered_columns[column_id_index++];
+                m_json_serializer->append_key(column->get_name());
+                m_json_serializer->append_value(
+                        std::to_string(std::get(column->extract_value(m_cur_message)))
+                );
+                break;
+            }
+            case JsonSerializer::Op::AddBoolField: {
+                column = m_reordered_columns[column_id_index++];
+                m_json_serializer->append_key(column->get_name());
+                m_json_serializer->append_value(
+                        std::get(column->extract_value(m_cur_message)) != 0 ? "true"
+                                                                             : "false"
+                );
+                break;
+            }
+            case JsonSerializer::Op::AddStringField: {
+                column = m_reordered_columns[column_id_index++];
+                m_json_serializer->append_key(column->get_name());
+                m_json_serializer->append_value_with_quotes(
+                        std::get(column->extract_value(m_cur_message))
+                );
+                break;
+            }
+            case JsonSerializer::Op::AddArrayField: {
+                column = m_reordered_columns[column_id_index++];
+                m_json_serializer->append_key(column->get_name());
+                m_json_serializer->append_value(
+                        std::get(column->extract_value(m_cur_message))
+                );
+                break;
+            }
+            case JsonSerializer::Op::AddNullField: {
+                m_json_serializer->append_key();
+                m_json_serializer->append_value("null");
+                break;
+            }
+        }
+    }
+
+    m_json_serializer->end_document();
+
+    message = m_json_serializer->get_serialized_string();
+
+    if (message.empty() || message.back() != '\n') {
+        message += '\n';
+    }
+
+    m_cur_message++;
+    return true;
+}
+
+bool SchemaReader::get_next_message(std::string& message, FilterClass* filter) {
+    while (m_cur_message < m_num_messages) {
+        if (false == filter->filter(m_cur_message, m_extracted_values)) {
+            m_cur_message++;
+            continue;
+        }
+
+        m_json_serializer->reset();
+        m_json_serializer->begin_document();
+        size_t column_id_index = 0;
+        BaseColumnReader* column;
+        JsonSerializer::Op op;
+        while (m_json_serializer->get_next_op(op)) {
+            switch (op) {
+                case JsonSerializer::Op::BeginObject: {
+                    m_json_serializer->begin_object();
+                    break;
+                }
+                case JsonSerializer::Op::EndObject: {
+                    m_json_serializer->end_object();
+                    break;
+                }
+                case JsonSerializer::Op::AddIntField: {
+                    column = m_reordered_columns[column_id_index++];
+                    m_json_serializer->append_key(column->get_name());
+                    m_json_serializer->append_value(
+                            std::to_string(std::get(m_extracted_values[column->get_id()]))
+                    );
+                    break;
+                }
+                case JsonSerializer::Op::AddFloatField: {
+                    column = m_reordered_columns[column_id_index++];
+                    m_json_serializer->append_key(column->get_name());
+                    m_json_serializer->append_value(
+                            std::to_string(std::get(m_extracted_values[column->get_id()]))
+                    );
+                    break;
+                }
+                case JsonSerializer::Op::AddBoolField: {
+                    column = m_reordered_columns[column_id_index++];
+                    m_json_serializer->append_key(column->get_name());
+                    m_json_serializer->append_value(
+                            std::get(m_extracted_values[column->get_id()]) != 0 ? "true"
+                                                                                 : "false"
+                    );
+                    break;
+                }
+                case JsonSerializer::Op::AddStringField: {
+                    column = m_reordered_columns[column_id_index++];
+                    m_json_serializer->append_key(column->get_name());
+                    m_json_serializer->append_value_with_quotes(
+                            std::get(m_extracted_values[column->get_id()])
+                    );
+                    break;
+                }
+                case JsonSerializer::Op::AddArrayField: {
+                    column = m_reordered_columns[column_id_index++];
+                    m_json_serializer->append_key(column->get_name());
+                    m_json_serializer->append_value(
+                            std::get(m_extracted_values[column->get_id()])
+                    );
+                    break;
+                }
+                case JsonSerializer::Op::AddNullField: {
+                    m_json_serializer->append_key();
+                    m_json_serializer->append_value("null");
+                    break;
+                }
+            }
+        }
+
+        m_json_serializer->end_document();
+
+        message = m_json_serializer->get_serialized_string();
+
+        if (message.empty() || message.back() != '\n') {
+            message += '\n';
+        }
+
+        m_cur_message++;
+        return true;
+    }
+
+    return false;
+}
+
+void SchemaReader::initialize_filter(FilterClass* filter) {
+    filter->init(this, m_schema_id, m_column_map);
+}
+
+void SchemaReader::generate_local_tree(int32_t global_id) {
+    auto node = m_global_schema_tree->get_node(global_id);
+    int32_t parent_id = node->get_parent_id();
+
+    if (parent_id != -1 && m_global_id_to_local_id.find(parent_id) == m_global_id_to_local_id.end())
+    {
+        generate_local_tree(parent_id);
+    }
+
+    int32_t local_id = m_local_schema_tree->add_node(
+            parent_id == -1 ? -1 : m_global_id_to_local_id[parent_id],
+            node->get_type(),
+            node->get_key_name()
+    );
+    m_global_id_to_local_id[global_id] = local_id;
+    m_local_id_to_global_id[local_id] = global_id;
+}
+
+void SchemaReader::generate_json_template(int32_t id) {
+    auto node = m_local_schema_tree->get_node(id);
+    auto children_ids = node->get_children_ids();
+
+    for (int32_t child_id : children_ids) {
+        int32_t child_global_id = m_local_id_to_global_id[child_id];
+        auto child_node = m_local_schema_tree->get_node(child_id);
+        std::string const& key = child_node->get_key_name();
+        switch (child_node->get_type()) {
+            case NodeType::OBJECT: {
+                m_json_serializer->add_op(JsonSerializer::Op::BeginObject);
+                m_json_serializer->add_special_key(key);
+                generate_json_template(child_id);
+                m_json_serializer->add_op(JsonSerializer::Op::EndObject);
+                break;
+            }
+            case NodeType::ARRAY: {
+                m_json_serializer->add_op(JsonSerializer::Op::AddArrayField);
+                m_reordered_columns.push_back(m_column_map[child_global_id]);
+                break;
+            }
+            case NodeType::INTEGER: {
+                m_json_serializer->add_op(JsonSerializer::Op::AddIntField);
+                m_reordered_columns.push_back(m_column_map[child_global_id]);
+                break;
+            }
+            case NodeType::FLOAT: {
+                m_json_serializer->add_op(JsonSerializer::Op::AddFloatField);
+                m_reordered_columns.push_back(m_column_map[child_global_id]);
+                break;
+            }
+            case NodeType::BOOLEAN: {
+                m_json_serializer->add_op(JsonSerializer::Op::AddBoolField);
+                m_reordered_columns.push_back(m_column_map[child_global_id]);
+                break;
+            }
+            case NodeType::CLPSTRING:
+            case NodeType::VARSTRING:
+            case NodeType::DATESTRING:
+            case NodeType::FLOATDATESTRING: {
+                m_json_serializer->add_op(JsonSerializer::Op::AddStringField);
+                m_reordered_columns.push_back(m_column_map[child_global_id]);
+                break;
+            }
+            case NodeType::NULLVALUE: {
+                m_json_serializer->add_op(JsonSerializer::Op::AddNullField);
+                m_json_serializer->add_special_key(key);
+                break;
+            }
+        }
+    }
+}
+}  // namespace clp_s
diff --git
a/components/core/src/clp_s/SchemaReader.hpp b/components/core/src/clp_s/SchemaReader.hpp
new file mode 100644
index 000000000..87ac549ec
--- /dev/null
+++ b/components/core/src/clp_s/SchemaReader.hpp
@@ -0,0 +1,154 @@
+#ifndef CLP_S_SCHEMAREADER_HPP
+#define CLP_S_SCHEMAREADER_HPP
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include "ColumnReader.hpp"
+#include "FileReader.hpp"
+#include "JsonSerializer.hpp"
+#include "SchemaTree.hpp"
+#include "ZstdDecompressor.hpp"
+
+namespace clp_s {
+class SchemaReader;
+
+class FilterClass {
+public:
+    // Polymorphic base: a virtual destructor makes deletion through a FilterClass* well-defined
+    virtual ~FilterClass() = default;
+
+    /**
+     * Initializes the filter
+     * @param reader
+     * @param schema_id
+     * @param columns
+     */
+    virtual void init(
+            SchemaReader* reader,
+            int32_t schema_id,
+            std::unordered_map& columns
+    ) = 0;
+
+    /**
+     * Filters the message
+     * @param cur_message
+     * @param extracted_values
+     * @return true if the message is accepted
+     */
+    virtual bool filter(
+            uint64_t cur_message,
+            std::map>& extracted_values
+    ) = 0;
+};
+
+class SchemaReader {
+public:
+    // Constructor (initializers are listed in member declaration order)
+    explicit SchemaReader(std::shared_ptr schema_tree, int32_t schema_id)
+            : m_schema_id(schema_id),
+              m_num_messages(0),
+              m_cur_message(0),
+              m_global_schema_tree(std::move(schema_tree)),
+              m_json_serializer(std::make_shared()) {}
+
+    // Destructor (does not free the appended column readers; close() does)
+    ~SchemaReader() = default;
+
+    /**
+     * Opens the schema file
+     * @param path
+     */
+    void open(std::string path);
+
+    /**
+     * Closes the schema file
+     */
+    void close();
+
+    /**
+     * Appends a column to the schema reader
+     * @param column_reader
+     */
+    void append_column(BaseColumnReader* column_reader);
+
+    /**
+     * Appends a column to the schema reader
+     * @param id
+     */
+    void append_column(int32_t id);
+
+    /**
+     * Loads the encoded messages
+     */
+    void load();
+
+    /**
+     * Gets next message
+     * @param message
+     * @return true if there is a next message
+     */
+    bool get_next_message(std::string& message);
+
+    /**
+     * Gets next message with a filter
+     * @param message
+     * @param filter
+     * @return true if there is a next message
+     */
+    bool get_next_message(std::string& message, FilterClass* filter);
+
+    /**
+     * Initializes the filter
+     * @param filter
+     */
+    void initialize_filter(FilterClass* filter);
+
+private:
+    /**
+     * Generates a local schema tree
+     * @param global_id
+     */
+    void generate_local_tree(int32_t global_id);
+
+    /**
+     * Generates a json template
+     * @param id Id of the node in the local schema tree whose children are added to the template
+     */
+    void generate_json_template(int32_t id);
+
+    /**
+     * Gets a json pointer string
+     * @param s
+     * @return
+     */
+    static std::string get_json_pointer_string(std::string const& s);
+
+    int32_t m_schema_id;
+    std::string m_path;
+    uint64_t m_num_messages;
+    uint64_t m_cur_message;
+
+    FileReader m_file_reader;
+    ZstdDecompressor m_decompressor;
+
+    std::unordered_map m_column_map;
+    std::vector m_columns;
+    std::vector m_reordered_columns;
+
+    std::shared_ptr m_global_schema_tree;
+    std::shared_ptr m_local_schema_tree;
+    std::unordered_map m_global_id_to_local_id;
+    std::unordered_map m_local_id_to_global_id;
+
+    std::shared_ptr m_json_serializer;
+
+    std::map> m_extracted_values;
+};
+}  // namespace clp_s
+
+#endif  // CLP_S_SCHEMAREADER_HPP
diff --git a/components/core/src/clp_s/SchemaTree.cpp b/components/core/src/clp_s/SchemaTree.cpp
new file mode 100644
index 000000000..36527f335
--- /dev/null
+++ b/components/core/src/clp_s/SchemaTree.cpp
@@ -0,0 +1,25 @@
+#include "SchemaTree.hpp"
+
+namespace clp_s {
+int32_t SchemaTree::add_node(int32_t parent_node_id, NodeType type, std::string const& key) {
+    std::tuple node_key = {parent_node_id, key, type};
+    auto node_it = m_node_map.find(node_key);
+    if (node_it != m_node_map.end()) {
+        auto node_id = node_it->second;
+        m_nodes[node_id]->increase_count();
+        return node_id;
+    }
+
+    auto node = std::make_shared(parent_node_id, m_nodes.size(), key, type);
+    node->increase_count();
+    m_nodes.push_back(node);
+    int32_t node_id = node->get_id();
+    if (parent_node_id >= 0) {
+        auto parent_node = m_nodes[parent_node_id];
+        parent_node->add_child(node_id);
+    }
+    m_node_map[node_key] = node_id;
+
+    return node_id;
+}
+}  // namespace clp_s
diff --git a/components/core/src/clp_s/SchemaTree.hpp b/components/core/src/clp_s/SchemaTree.hpp
new file mode 100644
index 000000000..178f80f7c
--- /dev/null
+++ b/components/core/src/clp_s/SchemaTree.hpp
@@ -0,0 +1,107 @@
+#ifndef CLP_S_SCHEMATREE_HPP
+#define CLP_S_SCHEMATREE_HPP
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace clp_s {
+enum class NodeType : uint8_t {
+    INTEGER,
+    FLOAT,
+    CLPSTRING,
+    VARSTRING,
+    BOOLEAN,
+    OBJECT,
+    ARRAY,
+    NULLVALUE,
+    DATESTRING,
+    FLOATDATESTRING
+};
+
+class SchemaNode {
+public:
+    // Constructor (initializers are listed in member declaration order)
+    SchemaNode() : m_id(-1), m_parent_id(-1), m_type(NodeType::INTEGER), m_count(0) {}
+
+    SchemaNode(int32_t parent_id, int32_t id, std::string key_name, NodeType type)
+            : m_id(id),
+              m_parent_id(parent_id),
+              m_key_name(std::move(key_name)),
+              m_type(type),
+              m_count(0) {}
+
+    /**
+     * Getters
+     */
+    int32_t get_id() const { return m_id; }
+
+    int32_t get_parent_id() const { return m_parent_id; }
+
+    std::vector& get_children_ids() { return m_children_ids; }
+
+    NodeType get_type() const { return m_type; }
+
+    std::string const& get_key_name() const { return m_key_name; }
+
+    int32_t get_count() const { return m_count; }
+
+    /**
+     * Increases the count of this node by 1
+     */
+    void increase_count() { m_count++; }
+
+    /**
+     * Adds a child node to this node
+     * @param child_id
+     */
+    void add_child(int32_t child_id) { m_children_ids.push_back(child_id); }
+
+private:
+    int32_t m_id;
+    int32_t m_parent_id;
+    std::vector m_children_ids;
+    std::string m_key_name;
+    NodeType m_type;
+    int32_t m_count;
+};
+
+class SchemaTree {
+public:
+    SchemaTree() = default;
+
+    /**
+     * Adds a node to the tree, or increases the count of the matching node if one with the same
+     * parent, key, and type already exists
+     * @param parent_node_id
+     * @param type
+     * @param key
+     * @return the Id of the new or existing node
+     */
+    int32_t add_node(int32_t parent_node_id, NodeType type, std::string const& key);
+
+    bool has_node(int32_t id) { return id >= 0 && static_cast<size_t>(id) < m_nodes.size(); }
+
+    std::shared_ptr get_node(int32_t id) {
+        if (id < 0 || static_cast<size_t>(id) >= m_nodes.size()) {
+            throw std::invalid_argument("invalid access of id " + std::to_string(id));
+        }
+
+        return m_nodes[id];
+    }
+
+    int32_t get_root_node_id() { return m_nodes[0]->get_id(); }
+
+    std::vector> get_nodes() { return m_nodes; }
+
+private:
+    std::vector> m_nodes;
+    absl::flat_hash_map, int32_t> m_node_map;
+};
+}  // namespace clp_s
+
+#endif  // CLP_S_SCHEMATREE_HPP
diff --git a/components/core/src/clp_s/SchemaWriter.cpp b/components/core/src/clp_s/SchemaWriter.cpp
new file mode 100644
index 000000000..56357874f
--- /dev/null
+++ b/components/core/src/clp_s/SchemaWriter.cpp
@@ -0,0 +1,55 @@
+#include "SchemaWriter.hpp"
+
+#include
+
+namespace clp_s {
+void SchemaWriter::open(std::string path, int compression_level) {
+    m_path = std::move(path);
+    m_compression_level = compression_level;
+}
+
+void SchemaWriter::close() {
+    m_compressor.close();
+    m_file_writer.close();
+
+    for (auto i : m_columns) {
+        delete i;
+    }
+
+    m_columns.clear();
+}
+
+void SchemaWriter::append_column(BaseColumnWriter* column_writer) {
+    m_columns.push_back(column_writer);
+}
+
+size_t SchemaWriter::append_message(ParsedMessage& message) {
+    size_t column_idx = 0;
+    size_t size = 0;
+    size_t total_size = 0;
+    for (auto& i : message.get_content()) {
+        m_columns[column_idx]->add_value(i.second, size);
+        total_size += size;
+        ++column_idx;
+    }
+
+    m_num_messages++;
+    return total_size;
+}
+
+void SchemaWriter::store() {
+    m_file_writer.open(m_path, FileWriter::OpenMode::CreateForWriting);
+    m_file_writer.write_numeric_value(m_num_messages);
+    m_compressor.open(m_file_writer, m_compression_level);
+
+    for (auto& writer : m_columns) {
+        writer->store(m_compressor);
+    }
+}
+
+SchemaWriter::~SchemaWriter() {
+    for (auto i : m_columns) {
+        delete i;
+    }
+}
+}  // namespace clp_s
diff --git a/components/core/src/clp_s/SchemaWriter.hpp b/components/core/src/clp_s/SchemaWriter.hpp
new file mode 100644
index 000000000..edf3320d8
--- /dev/null
+++ b/components/core/src/clp_s/SchemaWriter.hpp
@@ -0,0 +1,61 @@
+#ifndef CLP_S_SCHEMAWRITER_HPP
+#define CLP_S_SCHEMAWRITER_HPP
+
+#include
+
+#include "ColumnWriter.hpp"
+#include "FileWriter.hpp"
+#include "ParsedMessage.hpp"
+#include "ZstdCompressor.hpp"
+
+namespace clp_s {
+class SchemaWriter {
+public:
+    // Constructor
+    SchemaWriter() : m_num_messages(0) {}
+
+    // Destructor
+    ~SchemaWriter();
+
+    /**
+     * Opens the schema writer.
+     * @param path
+     * @param compression_level
+     */
+    void open(std::string path, int compression_level);
+
+    /**
+     * Appends a column to the schema writer.
+     * @param column_writer Column writer to append; deleted by close() or the destructor
+     */
+    void append_column(BaseColumnWriter* column_writer);
+
+    /**
+     * Appends a message to the schema writer.
+     * @param message
+     * @return The size of the message in bytes.
+     */
+    size_t append_message(ParsedMessage& message);
+
+    /**
+     * Stores the schema to disk.
+     */
+    void store();
+
+    /**
+     * Closes the schema writer.
+     */
+    void close();
+
+private:
+    FileWriter m_file_writer;
+    ZstdCompressor m_compressor;
+    std::string m_path;
+    int m_compression_level{};
+    uint64_t m_num_messages;
+
+    std::vector m_columns;
+};
+}  // namespace clp_s
+
+#endif  // CLP_S_SCHEMAWRITER_HPP
diff --git a/components/core/src/clp_s/TimestampDictionaryReader.cpp b/components/core/src/clp_s/TimestampDictionaryReader.cpp
new file mode 100644
index 000000000..09dfe65fd
--- /dev/null
+++ b/components/core/src/clp_s/TimestampDictionaryReader.cpp
@@ -0,0 +1,97 @@
+#include "TimestampDictionaryReader.hpp"
+
+#include "Utils.hpp"
+
+namespace clp_s {
+void TimestampDictionaryReader::open(std::string const& dictionary_path) {
+    if (m_is_open) {
+        throw OperationFailed(ErrorCodeNotReady, __FILENAME__, __LINE__);
+    }
+
+    constexpr size_t cDecompressorFileReadBufferCapacity = 16 * 1024;  // 16 KB
+
+    m_dictionary_file_reader.open(dictionary_path);
+    m_dictionary_decompressor.open(m_dictionary_file_reader, cDecompressorFileReadBufferCapacity);
+
+    m_is_open = true;
+}
+
+void TimestampDictionaryReader::close() {
+    if (false == m_is_open) {
+        throw OperationFailed(ErrorCodeNotInit, __FILENAME__, __LINE__);
+    }
+
+    m_dictionary_decompressor.close();
+    m_dictionary_file_reader.close();
+
+    // Reset the flag so the reader can be reopened and a second close() is rejected
+    m_is_open = false;
+}
+
+void TimestampDictionaryReader::read_local_entries() {
+    read_new_entries(/*local=*/true);
+}
+
+void TimestampDictionaryReader::read_new_entries(bool local) {
+    if (false == m_is_open) {
+        throw OperationFailed(ErrorCodeNotInit, __FILENAME__, __LINE__);
+    }
+
+    ErrorCode error;
+
+    uint64_t range_index_size;
+    error = m_dictionary_decompressor.try_read_numeric_value(range_index_size);
+    if (ErrorCodeSuccess != error) {
+        throw OperationFailed(error, __FILENAME__, __LINE__);
+    }
+
+    for (uint64_t i = 0; i < range_index_size; ++i) {
+        std::string col;
+        TimestampEntry entry;
+        error = entry.try_read_from_file(m_dictionary_decompressor, col);
+        if (ErrorCodeSuccess != error) {
+            throw OperationFailed(error, __FILENAME__, __LINE__);
+        }
+        TimestampEntry& e = m_column_to_range[col] = entry;
+        std::vector tokens;
+        StringUtils::tokenize_column_descriptor(col, tokens);
+        m_tokenized_column_to_range.emplace_back(std::move(tokens), &e);
+    }
+
+    // Local timestamp dictionaries only contain range indices, and
+    // not patterns. Exit early here.
+    if (local) {
+        return;
+    }
+
+    uint64_t num_patterns;
+    error = m_dictionary_decompressor.try_read_numeric_value(num_patterns);
+    if (ErrorCodeSuccess != error) {
+        throw OperationFailed(error, __FILENAME__, __LINE__);
+    }
+    for (uint64_t i = 0; i < num_patterns; ++i) {
+        uint64_t id, pattern_len;
+        std::string pattern;
+        error = m_dictionary_decompressor.try_read_numeric_value(id);
+        if (ErrorCodeSuccess != error) {
+            throw OperationFailed(error, __FILENAME__, __LINE__);
+        }
+        error = m_dictionary_decompressor.try_read_numeric_value(pattern_len);
+        if (ErrorCodeSuccess != error) {
+            throw OperationFailed(error, __FILENAME__, __LINE__);
+        }
+        error = m_dictionary_decompressor.try_read_string(pattern_len, pattern);
+        if (ErrorCodeSuccess != error) {
+            throw OperationFailed(error, __FILENAME__, __LINE__);
+        }
+        m_patterns[id] = TimestampPattern(0, pattern);
+    }
+}
+
+std::string TimestampDictionaryReader::get_string_encoding(epochtime_t epoch, uint64_t format_id) {
+    std::string ret;
+    m_patterns[format_id].insert_formatted_timestamp(epoch, ret);
+
+    return ret;
+}
+}  // namespace clp_s
diff --git a/components/core/src/clp_s/TimestampDictionaryReader.hpp b/components/core/src/clp_s/TimestampDictionaryReader.hpp
new file mode 100644
index 000000000..d6f38743c
--- /dev/null
+++ b/components/core/src/clp_s/TimestampDictionaryReader.hpp
@@ -0,0 +1,101 @@
+#ifndef CLP_S_TIMESTAMPDICTIONARYREADER_HPP
+#define CLP_S_TIMESTAMPDICTIONARYREADER_HPP
+
+#include
+
+#include "FileReader.hpp"
+#include "search/FilterOperation.hpp"
+#include "TimestampEntry.hpp"
+#include "TimestampPattern.hpp"
+#include "ZstdDecompressor.hpp"
+
+namespace clp_s {
+class TimestampDictionaryReader {
+public:
+    // Types
+    class OperationFailed : public TraceableException {
+    public:
+        // Constructors
+        OperationFailed(ErrorCode error_code, char const* const filename, int line_number)
+                : TraceableException(error_code, filename, line_number) {}
+    };
+
+    // Constructors
+    TimestampDictionaryReader() : m_is_open(false) {}
+
+    // Methods
+    /**
+     * Opens dictionary for reading
+     * @param dictionary_path
+     */
+    void open(std::string const& dictionary_path);
+
+    /**
+     * Closes the dictionary
+     */
+    void close();
+
+    /**
+     * Reads any new entries from disk
+     * @param local Whether to stop after the range indices and skip the pattern mappings
+     */
+    void read_new_entries(bool local = false);
+
+    /**
+     * Reads new entries from a *local* timestamp dictionary
+     *
+     * Local timestamp dictionaries contain only range indices,
+     * and have no timestamp pattern mappings
+     */
+    void read_local_entries();
+
+    /**
+     * Gets the string encoding for a given epoch and format ID
+     * @param epoch
+     * @param format_id
+     * @return the epoch formatted with the pattern stored under format_id
+     */
+    std::string get_string_encoding(epochtime_t epoch, uint64_t format_id);
+
+    typedef std::map::iterator id_to_pattern_iterator_t;
+    typedef std::vector, TimestampEntry*>>::iterator
+            tokenized_column_to_range_it_t;
+
+    /**
+     * Gets iterators for the timestamp patterns
+     * @return begin and end iterators for the timestamp patterns
+     */
+    id_to_pattern_iterator_t pattern_begin() { return m_patterns.begin(); }
+
+    id_to_pattern_iterator_t pattern_end() { return m_patterns.end(); }
+
+    /**
+     * Gets iterators for the column to range mappings
+     * @return begin and end iterators for the column to range mappings
+     */
+    tokenized_column_to_range_it_t tokenized_column_to_range_begin() {
+        return m_tokenized_column_to_range.begin();
+    }
+
+    tokenized_column_to_range_it_t tokenized_column_to_range_end() {
+        return m_tokenized_column_to_range.end();
+    }
+
+private:
+    typedef std::map id_to_pattern_t;
+    typedef std::map column_to_range_t;
+    typedef std::vector, TimestampEntry*>>
+            tokenized_column_to_range_t;
+
+    // Variables
+    bool m_is_open;
+    FileReader m_dictionary_file_reader;
+    ZstdDecompressor m_dictionary_decompressor;
+
+    id_to_pattern_t m_patterns;
+    column_to_range_t m_column_to_range;
+    tokenized_column_to_range_t m_tokenized_column_to_range;
+};
+}  // namespace clp_s
+
+#endif  // CLP_S_TIMESTAMPDICTIONARYREADER_HPP
diff --git
a/components/core/src/clp_s/TimestampDictionaryWriter.cpp b/components/core/src/clp_s/TimestampDictionaryWriter.cpp
new file mode 100644
index 000000000..32365049f
--- /dev/null
+++ b/components/core/src/clp_s/TimestampDictionaryWriter.cpp
@@ -0,0 +1,147 @@
+#include "TimestampDictionaryWriter.hpp"
+
+#include "Utils.hpp"
+
+namespace clp_s {
+void TimestampDictionaryWriter::write_timestamp_entries(
+        std::map const& ranges,
+        ZstdCompressor& compressor
+) {
+    compressor.write_numeric_value(ranges.size());
+
+    for (auto const& range : ranges) {
+        range.second.write_to_file(compressor, range.first);
+    }
+}
+
+void TimestampDictionaryWriter::write_and_flush_to_disk() {
+    write_timestamp_entries(m_global_column_to_range, m_dictionary_compressor);
+
+    m_dictionary_compressor.write_numeric_value(m_pattern_to_id.size());
+    for (auto& it : m_pattern_to_id) {
+        // write pattern ID
+        m_dictionary_compressor.write_numeric_value(it.second);
+
+        std::string const& pattern = it.first->get_format();
+        m_dictionary_compressor.write_numeric_value(pattern.length());
+        m_dictionary_compressor.write_string(pattern);
+    }
+
+    m_dictionary_compressor.flush();
+    m_dictionary_file_writer.flush();
+}
+
+void TimestampDictionaryWriter::write_local_and_flush_to_disk() {
+    write_timestamp_entries(m_local_column_to_range, m_dictionary_compressor_local);
+
+    m_dictionary_compressor_local.flush();
+    m_dictionary_file_writer_local.flush();
+}
+
+void TimestampDictionaryWriter::open(std::string const& dictionary_path, int compression_level) {
+    if (m_is_open) {
+        throw OperationFailed(ErrorCodeNotReady, __FILENAME__, __LINE__);
+    }
+
+    m_dictionary_file_writer.open(dictionary_path, FileWriter::OpenMode::CreateForWriting);
+    m_dictionary_compressor.open(m_dictionary_file_writer, compression_level);
+
+    m_next_id = 0;
+    m_is_open = true;
+}
+
+void TimestampDictionaryWriter::open_local(
+        std::string const& dictionary_path,
+        int compression_level
+) {
+    if (m_is_open_local) {
+        throw OperationFailed(ErrorCodeNotReady, __FILENAME__, __LINE__);
+    }
+
+    m_dictionary_file_writer_local.open(dictionary_path, FileWriter::OpenMode::CreateForWriting);
+    m_dictionary_compressor_local.open(m_dictionary_file_writer_local, compression_level);
+
+    m_is_open_local = true;
+}
+
+void TimestampDictionaryWriter::close() {
+    if (false == m_is_open) {
+        throw OperationFailed(ErrorCodeNotInit, __FILENAME__, __LINE__);
+    }
+
+    // merge before writing overall archive because this
+    // happens before the last sub-archive is written
+    merge_local_range();
+    write_and_flush_to_disk();
+    m_dictionary_compressor.close();
+    m_dictionary_file_writer.close();
+
+    m_is_open = false;
+}
+
+void TimestampDictionaryWriter::close_local() {
+    if (false == m_is_open_local) {
+        throw OperationFailed(ErrorCodeNotInit, __FILENAME__, __LINE__);
+    }
+
+    write_local_and_flush_to_disk();
+    m_dictionary_compressor_local.close();
+    m_dictionary_file_writer_local.close();
+
+    m_is_open_local = false;
+
+    // merge after every sub-archive
+    merge_local_range();
+    m_local_column_to_range.clear();
+}
+
+uint64_t TimestampDictionaryWriter::get_pattern_id(TimestampPattern const* pattern) {
+    if (0 == m_pattern_to_id.count(pattern)) {
+        uint64_t id = m_next_id++;
+        m_pattern_to_id[pattern] = id;
+
+        return id;
+    }
+
+    return m_pattern_to_id.at(pattern);
+}
+
+epochtime_t TimestampDictionaryWriter::ingest_entry(
+        std::string const& key,
+        std::string const& timestamp,
+        uint64_t& id
+) {
+    epochtime_t ret{};
+    size_t timestamp_begin_pos = 0, timestamp_end_pos = 0;
+    TimestampPattern const* pattern = TimestampPattern::search_known_ts_patterns(
+            timestamp,
+            ret,
+            timestamp_begin_pos,
+            timestamp_end_pos
+    );
+    // Reject unparseable timestamps before touching the range index so that a failed parse
+    // cannot widen the column's range with a meaningless epoch
+    if (pattern == nullptr) {
+        throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__);
+    }
+
+    m_local_column_to_range[key].ingest_timestamp(ret);
+    id = get_pattern_id(pattern);
+
+    return ret;
+}
+
+void TimestampDictionaryWriter::ingest_entry(std::string const& key, double timestamp) {
+    m_local_column_to_range[key].ingest_timestamp(timestamp);
+}
+
+void TimestampDictionaryWriter::ingest_entry(std::string const& key, int64_t timestamp) {
+    m_local_column_to_range[key].ingest_timestamp(timestamp);
+}
+
+void TimestampDictionaryWriter::merge_local_range() {
+    for (auto const& it : m_local_column_to_range) {
+        m_global_column_to_range[it.first].merge_range(it.second);
+    }
+}
+}  // namespace clp_s
diff --git a/components/core/src/clp_s/TimestampDictionaryWriter.hpp b/components/core/src/clp_s/TimestampDictionaryWriter.hpp
new file mode 100644
index 000000000..c810e675e
--- /dev/null
+++ b/components/core/src/clp_s/TimestampDictionaryWriter.hpp
@@ -0,0 +1,118 @@
+#ifndef CLP_S_TIMESTAMPDICTIONARYWRITER_HPP
+#define CLP_S_TIMESTAMPDICTIONARYWRITER_HPP
+
+#include
+#include
+#include
+
+#include "FileWriter.hpp"
+#include "TimestampEntry.hpp"
+#include "TimestampPattern.hpp"
+#include "ZstdCompressor.hpp"
+
+namespace clp_s {
+class TimestampDictionaryWriter {
+public:
+    // Types
+    class OperationFailed : public TraceableException {
+    public:
+        // Constructors
+        OperationFailed(ErrorCode error_code, char const* const filename, int line_number)
+                : TraceableException(error_code, filename, line_number) {}
+    };
+
+    // Constructors
+    TimestampDictionaryWriter() : m_is_open(false), m_is_open_local(false) {}
+
+    /**
+     * Opens the global timestamp dictionary for writing
+     * @param dictionary_path
+     * @param compression_level
+     */
+    void open(std::string const& dictionary_path, int compression_level);
+
+    /**
+     * Opens a local timestamp dictionary for writing
+     * @param dictionary_path
+     * @param compression_level
+     */
+    void open_local(std::string const& dictionary_path, int compression_level);
+
+    /**
+     * Closes the global timestamp dictionary
+     */
+    void close();
+
+    /**
+     * Closes the local timestamp dictionary
+     */
+    void close_local();
+
+    /**
+     * Writes the global timestamp dictionary to disk
+     */
+    void write_and_flush_to_disk();
+
+    /**
+     * Writes the local timestamp dictionary to disk
+     */
+    void write_local_and_flush_to_disk();
+
+    /**
+     * Gets the Id of a timestamp pattern, assigning the next free Id if the pattern is new
+     * @param pattern
+     * @return the Id of the pattern
+     */
+    uint64_t get_pattern_id(TimestampPattern const* pattern);
+
+    /**
+     * Parses a timestamp string and ingests it into the local range of a column
+     * @param key
+     * @param timestamp
+     * @param id Set to the Id of the pattern that matched the timestamp
+     * @return the parsed epoch time
+     * @throws OperationFailed if no known timestamp pattern matches
+     */
+    epochtime_t ingest_entry(std::string const& key, std::string const& timestamp, uint64_t& id);
+
+    /**
+     * Ingests a floating-point timestamp into the local range of a column
+     * @param key
+     * @param timestamp
+     */
+    void ingest_entry(std::string const& key, double timestamp);
+
+    /**
+     * Ingests an integer timestamp into the local range of a column
+     * @param key
+     * @param timestamp
+     */
+    void ingest_entry(std::string const& key, int64_t timestamp);
+
+private:
+    void merge_local_range();
+    static void write_timestamp_entries(
+            std::map const& ranges,
+            ZstdCompressor& compressor
+    );
+
+    typedef std::unordered_map pattern_to_id_t;
+
+    // Variables
+    bool m_is_open;
+    bool m_is_open_local;
+
+    // Variables related to on-disk storage
+    FileWriter m_dictionary_file_writer;
+    ZstdCompressor m_dictionary_compressor;
+    FileWriter m_dictionary_file_writer_local;
+    ZstdCompressor m_dictionary_compressor_local;
+
+    pattern_to_id_t m_pattern_to_id;
+    uint64_t m_next_id{};
+    std::map m_global_column_to_range;
+    std::map m_local_column_to_range;
+};
+}  // namespace clp_s
+
+#endif  // CLP_S_TIMESTAMPDICTIONARYWRITER_HPP
diff --git a/components/core/src/clp_s/TimestampEntry.cpp b/components/core/src/clp_s/TimestampEntry.cpp
new file mode 100644
index 000000000..cbc6515d6
--- /dev/null
+++ b/components/core/src/clp_s/TimestampEntry.cpp
@@ -0,0 +1,344 @@
+#include "TimestampEntry.hpp"
+
+#include
+
+namespace clp_s {
+void TimestampEntry::ingest_timestamp(epochtime_t timestamp) {
+    if (m_encoding == DoubleEpoch) {
+        if (timestamp < std::ceil(m_epoch_start_double)) {
+            m_epoch_start_double = timestamp;
+        }
+        if (timestamp > std::floor(m_epoch_end_double)) {
+            m_epoch_end_double = timestamp;
+        }
+
+        return;
+    }
+
+    if (m_encoding == UnkownTimestampEncoding) {
+        m_encoding = Epoch;
+    }
+
+    if (timestamp < m_epoch_start) {
+        m_epoch_start = timestamp;
+    }
+    if (timestamp > m_epoch_end) {
+        m_epoch_end = timestamp;
+    }
+}
+
+void TimestampEntry::ingest_timestamp(double timestamp) {
+    if (m_encoding == UnkownTimestampEncoding) {
+        m_encoding = DoubleEpoch;
+    } else if (m_encoding == Epoch) {
+        m_encoding = DoubleEpoch;
+        m_epoch_start_double = m_epoch_start;
+        m_epoch_end_double = m_epoch_end;
+    }
+
+    if (timestamp < m_epoch_start_double) {
+        m_epoch_start_double = timestamp;
+    }
+    if (timestamp > m_epoch_end_double) {
+        m_epoch_end_double = timestamp;
+    }
+}
+
+void TimestampEntry::merge_range(TimestampEntry const& entry) {
+    if (entry.m_encoding == Epoch) {
+        ingest_timestamp(entry.m_epoch_start);
+        ingest_timestamp(entry.m_epoch_end);
+    } else if (entry.m_encoding == DoubleEpoch) {
+        ingest_timestamp(entry.m_epoch_start_double);
+        ingest_timestamp(entry.m_epoch_end_double);
+    }
+}
+
+void TimestampEntry::write_to_file(ZstdCompressor& compressor, std::string const& column) const {
+    compressor.write_numeric_value(column.length());
+    compressor.write_string(column);
+
+    compressor.write_numeric_value(m_encoding);
+
+    if (m_encoding == Epoch) {
+        compressor.write_numeric_value(m_epoch_start);
+        compressor.write_numeric_value(m_epoch_end);
+    } else if (m_encoding == DoubleEpoch) {
+        compressor.write_numeric_value(m_epoch_start_double);
+        compressor.write_numeric_value(m_epoch_end_double);
+    }
+}
+
+ErrorCode TimestampEntry::try_read_from_file(ZstdDecompressor& decompressor, std::string& column) {
+    ErrorCode error_code;
+
+    uint64_t column_len;
+    error_code = decompressor.try_read_numeric_value(column_len);
+    if (ErrorCodeSuccess != error_code) {
+        return error_code;
+    }
+    error_code = decompressor.try_read_string(column_len, column);
+    if (ErrorCodeSuccess != error_code) {
+        return error_code;
+    }
+
+    error_code = decompressor.try_read_numeric_value(m_encoding);
+    if (ErrorCodeSuccess != error_code) {
+        return error_code;
+    }
+
+    if (m_encoding == Epoch) {
+        error_code = decompressor.try_read_numeric_value(m_epoch_start);
+        if (ErrorCodeSuccess != error_code) {
+            return error_code;
+        }
+        error_code = decompressor.try_read_numeric_value(m_epoch_end);
+        if (ErrorCodeSuccess != error_code) {
+            return error_code;
+        }
+    } else if (m_encoding == DoubleEpoch) {
+        error_code = decompressor.try_read_numeric_value(m_epoch_start_double);
+        if (ErrorCodeSuccess != error_code) {
+            return error_code;
+        }
+        error_code = decompressor.try_read_numeric_value(m_epoch_end_double);
+        if (ErrorCodeSuccess != error_code) {
+            return error_code;
+        }
+    }
+
+    return error_code;
+}
+
+void TimestampEntry::read_from_file(ZstdDecompressor& decompressor, std::string& column) {
+    auto error_code = try_read_from_file(decompressor, column);
+    if (ErrorCodeSuccess != error_code) {
+        throw OperationFailed(error_code, __FILENAME__, __LINE__);
+    }
+}
+
+EvaluatedValue TimestampEntry::evaluate_filter(FilterOperation op, double timestamp) {
+    if (op == FilterOperation::EXISTS || op == FilterOperation::NEXISTS) {
+        return EvaluatedValue::Unknown;
+    }
+
+    if (m_encoding == DoubleEpoch) {
+        switch (op) {
+            case FilterOperation::EQ:
+                if (timestamp >= m_epoch_start_double && timestamp <= m_epoch_end_double) {
+                    return EvaluatedValue::Unknown;
+                } else {
+                    return EvaluatedValue::False;
+                }
+            case FilterOperation::NEQ:
+                if (timestamp >= m_epoch_start_double && timestamp <= m_epoch_end_double) {
+                    return EvaluatedValue::Unknown;
+                } else {
+                    return EvaluatedValue::True;
+                }
+            case FilterOperation::LT:
+                if (timestamp > m_epoch_end_double) {
+                    return EvaluatedValue::True;
+                } else if (timestamp <= m_epoch_start_double) {
+                    return EvaluatedValue::False;
+                } else {
+                    return EvaluatedValue::Unknown;
+                }
+            case FilterOperation::LTE:
+                if (timestamp >= m_epoch_end_double) {
+                    return EvaluatedValue::True;
+                } else if (timestamp < m_epoch_start_double) {
+                    return EvaluatedValue::False;
+                } else {
+                    return EvaluatedValue::Unknown;
+                }
+            case FilterOperation::GT:
+                if (timestamp < m_epoch_start_double) {
+                    return EvaluatedValue::True;
+                } else if (timestamp >= m_epoch_end_double) {
+                    return EvaluatedValue::False;
+                } else {
+                    return EvaluatedValue::Unknown;
+                }
+            case FilterOperation::GTE:
+                if (timestamp <=
m_epoch_start_double) { + return EvaluatedValue::True; + } else if (timestamp > m_epoch_end_double) { + return EvaluatedValue::False; + } else { + return EvaluatedValue::Unknown; + } + default: + return EvaluatedValue::Unknown; + } + } else if (m_encoding == Epoch) { + double epoch_start_tmp = m_epoch_start, epoch_end_tmp = m_epoch_end; + switch (op) { + case FilterOperation::EQ: + if (timestamp >= epoch_start_tmp && timestamp <= epoch_end_tmp) { + return EvaluatedValue::Unknown; + } else { + return EvaluatedValue::False; + } + case FilterOperation::NEQ: + if (timestamp >= epoch_start_tmp && timestamp <= epoch_end_tmp) { + return EvaluatedValue::Unknown; + } else { + return EvaluatedValue::True; + } + case FilterOperation::LT: + if (timestamp > epoch_end_tmp) { + return EvaluatedValue::True; + } else if (timestamp <= epoch_start_tmp) { + return EvaluatedValue::False; + } else { + return EvaluatedValue::Unknown; + } + case FilterOperation::LTE: + if (timestamp >= epoch_end_tmp) { + return EvaluatedValue::True; + } else if (timestamp < epoch_start_tmp) { + return EvaluatedValue::False; + } else { + return EvaluatedValue::Unknown; + } + case FilterOperation::GT: + if (timestamp < epoch_start_tmp) { + return EvaluatedValue::True; + } else if (timestamp >= epoch_end_tmp) { + return EvaluatedValue::False; + } else { + return EvaluatedValue::Unknown; + } + case FilterOperation::GTE: + if (timestamp <= epoch_start_tmp) { + return EvaluatedValue::True; + } else if (timestamp > epoch_end_tmp) { + return EvaluatedValue::False; + } else { + return EvaluatedValue::Unknown; + } + default: + return EvaluatedValue::Unknown; + } + } else { + return EvaluatedValue::Unknown; + } +} + +EvaluatedValue TimestampEntry::evaluate_filter(FilterOperation op, epochtime_t timestamp) { + if (op == FilterOperation::EXISTS || op == FilterOperation::NEXISTS) { + return EvaluatedValue::Unknown; + } + + if (m_encoding == DoubleEpoch) { + /** + * TODO: this borrows logic from the double_as_int 
function + * should + */ + epochtime_t epoch_start_tmp_ltgte = std::ceil(m_epoch_start_double); + epochtime_t epoch_start_tmp_gtlte = std::floor(m_epoch_start_double); + epochtime_t epoch_end_tmp_ltgte = std::ceil(m_epoch_end_double); + epochtime_t epoch_end_tmp_gtlte = std::floor(m_epoch_end_double); + switch (op) { + case FilterOperation::EQ: + if (timestamp >= epoch_start_tmp_ltgte && timestamp <= epoch_end_tmp_gtlte) { + return EvaluatedValue::Unknown; + } else { + return EvaluatedValue::False; + } + case FilterOperation::NEQ: + if (timestamp >= epoch_start_tmp_ltgte && timestamp <= epoch_end_tmp_gtlte) { + return EvaluatedValue::Unknown; + } else { + return EvaluatedValue::True; + } + case FilterOperation::LT: + if (timestamp > epoch_end_tmp_gtlte) { + return EvaluatedValue::True; + } else if (timestamp <= epoch_start_tmp_gtlte) { + return EvaluatedValue::False; + } else { + return EvaluatedValue::Unknown; + } + case FilterOperation::LTE: + if (timestamp >= epoch_end_tmp_ltgte) { + return EvaluatedValue::True; + } else if (timestamp < epoch_start_tmp_ltgte) { + return EvaluatedValue::False; + } else { + return EvaluatedValue::Unknown; + } + case FilterOperation::GT: + if (timestamp < epoch_start_tmp_ltgte) { + return EvaluatedValue::True; + } else if (timestamp >= epoch_end_tmp_ltgte) { + return EvaluatedValue::False; + } else { + return EvaluatedValue::Unknown; + } + case FilterOperation::GTE: + if (timestamp <= epoch_start_tmp_gtlte) { + return EvaluatedValue::True; + } else if (timestamp > epoch_end_tmp_gtlte) { + return EvaluatedValue::False; + } else { + return EvaluatedValue::Unknown; + } + default: + return EvaluatedValue::Unknown; + } + } else if (m_encoding == Epoch) { + switch (op) { + case FilterOperation::EQ: + if (timestamp >= m_epoch_start && timestamp <= m_epoch_end) { + return EvaluatedValue::Unknown; + } else { + return EvaluatedValue::False; + } + case FilterOperation::NEQ: + if (timestamp >= m_epoch_start && timestamp <= m_epoch_end) { + 
return EvaluatedValue::Unknown; + } else { + return EvaluatedValue::True; + } + case FilterOperation::LT: + if (timestamp > m_epoch_end) { + return EvaluatedValue::True; + } else if (timestamp <= m_epoch_start) { + return EvaluatedValue::False; + } else { + return EvaluatedValue::Unknown; + } + case FilterOperation::LTE: + if (timestamp >= m_epoch_end) { + return EvaluatedValue::True; + } else if (timestamp < m_epoch_start) { + return EvaluatedValue::False; + } else { + return EvaluatedValue::Unknown; + } + case FilterOperation::GT: + if (timestamp < m_epoch_start) { + return EvaluatedValue::True; + } else if (timestamp >= m_epoch_end) { + return EvaluatedValue::False; + } else { + return EvaluatedValue::Unknown; + } + case FilterOperation::GTE: + if (timestamp <= m_epoch_start) { + return EvaluatedValue::True; + } else if (timestamp > m_epoch_end) { + return EvaluatedValue::False; + } else { + return EvaluatedValue::Unknown; + } + default: + return EvaluatedValue::Unknown; + } + } else { + return EvaluatedValue::Unknown; + } +} +} // namespace clp_s diff --git a/components/core/src/clp_s/TimestampEntry.hpp b/components/core/src/clp_s/TimestampEntry.hpp new file mode 100644 index 000000000..1493173ba --- /dev/null +++ b/components/core/src/clp_s/TimestampEntry.hpp @@ -0,0 +1,101 @@ +#ifndef CLP_S_TIMESTAMPENTRY_HPP +#define CLP_S_TIMESTAMPENTRY_HPP + +#include +#include + +#include "Defs.hpp" +#include "ErrorCode.hpp" +#include "search/FilterOperation.hpp" +#include "Utils.hpp" +#include "ZstdCompressor.hpp" +#include "ZstdDecompressor.hpp" + +using clp_s::search::FilterOperation; + +namespace clp_s { +class TimestampEntry { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + + // Methods + char const* what() const noexcept override { return "TimestampEntry operation 
failed"; } + }; + + // Constants + enum TimestampEncoding : uint64_t { + UnkownTimestampEncoding, + Epoch, + DoubleEpoch + }; + + // Constructors + TimestampEntry() + : m_encoding(UnkownTimestampEncoding), + m_epoch_start_double(cDoubleEpochTimeMax), + m_epoch_end_double(cDoubleEpochTimeMin), + m_epoch_start(cEpochTimeMax), + m_epoch_end(cEpochTimeMin) {} + + /** + * Ingest a timestamp potentially adjusting the start and end bounds for this + * TimestampEntry. + * + * @param timestamp the timestamp to be ingested + * @return the epoch time corresponding to the string timestamp + */ + void ingest_timestamp(epochtime_t timestamp); + void ingest_timestamp(double timestamp); + + /** + * Merge a timestamp range potentially adjusting the start and end bounds for this + * + * @param timestamp the timestamp to be ingested + * @return the epoch time corresponding to the string timestamp + */ + void merge_range(TimestampEntry const& entry); + + /** + * Write the timestamp entry to a file + * @param compressor + * @param column + */ + void write_to_file(ZstdCompressor& compressor, std::string const& column) const; + + /** + * Try to read the timestamp entry from a file + * @param decompressor + * @param column + * @return ErrorCode + */ + ErrorCode try_read_from_file(ZstdDecompressor& decompressor, std::string& column); + + /** + * Read the timestamp entry from a file + * @param decompressor + * @param column + */ + void read_from_file(ZstdDecompressor& decompressor, std::string& column); + + /** + * Check if a timestamp is in the range of this TimestampEntry + * @param op + * @param timestamp + * @return + */ + EvaluatedValue evaluate_filter(FilterOperation op, double timestamp); + EvaluatedValue evaluate_filter(FilterOperation op, epochtime_t timestamp); + +private: + TimestampEncoding m_encoding; + double m_epoch_start_double, m_epoch_end_double; + epochtime_t m_epoch_start, m_epoch_end; +}; +} // namespace clp_s + +#endif // CLP_S_TIMESTAMPENTRY_HPP diff --git 
a/components/core/src/clp_s/TimestampPattern.cpp b/components/core/src/clp_s/TimestampPattern.cpp new file mode 100644 index 000000000..9457d5cda --- /dev/null +++ b/components/core/src/clp_s/TimestampPattern.cpp @@ -0,0 +1,1008 @@ +// Code from CLP + +#include "TimestampPattern.hpp" + +#include +#include +#include + +#include +#include + +using std::string; +using std::to_string; +using std::vector; + +namespace clp_s { +// Static member default initialization +std::unique_ptr TimestampPattern::m_known_ts_patterns = nullptr; +size_t TimestampPattern::m_known_ts_patterns_len = 0; + +// File-scope constants +static constexpr int cNumDaysInWeek = 7; +static char const* cAbbrevDaysOfWeek[cNumDaysInWeek] + = {"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"}; +static constexpr int cNumMonths = 12; +static char const* cAbbrevMonthNames[cNumMonths] + = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"}; +static char const* cMonthNames[cNumMonths] + = {"January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December"}; + +// File-scope functions +/** + * Converts a value to a padded string with the given length and appends it to the given string + * @param value + * @param padding_character + * @param length + * @param str + */ +static void append_padded_value(int value, char padding_character, size_t length, string& str); +/** + * Converts a value to a padded string with the given length and appends it to the given string. + * Omits trailing 0. 
/**
 * Converts a value to a padded string with the given length and appends it to the given string.
 * A value whose decimal form is already at least `length` characters long is appended as-is.
 * @param value
 * @param padding_character
 * @param length Number of characters the padded value should occupy
 * @param str String to append to
 */
static void
append_padded_value(int value, char padding_character, size_t length, std::string& str) {
    std::string const value_str = std::to_string(value);
    // Guard the subtraction: an over-wide value would otherwise wrap around to a huge pad count
    if (value_str.length() < length) {
        str.append(length - value_str.length(), padding_character);
    }
    str += value_str;
}

/**
 * Converts a value to a padded string with the given length and appends it to the given string.
 * Omits trailing 0s, e.g. with a max length of 3: 470 -> "47", 40 -> "04", 4 -> "004". A value
 * of 0 is appended as a single unpadded "0".
 * @param value
 * @param padding_character
 * @param max_length Number of characters the value occupies before trailing 0s are removed
 * @param str String to append to
 */
static void
append_padded_value_notz(int value, char padding_character, size_t max_length, std::string& str) {
    std::string value_str = std::to_string(value);
    if ("0" != value_str) {
        if (value_str.length() < max_length) {
            str.append(max_length - value_str.length(), padding_character);
        }

        // Drop the run of trailing zeros (if any)
        size_t const last_non_zero_ix = value_str.find_last_not_of('0');
        if (std::string::npos != last_non_zero_ix) {
            value_str.erase(last_non_zero_ix + 1);
        }
    }

    str += value_str;
}

/**
 * Converts a padded decimal integer string (from a larger string) to an integer
 * @param str String containing the numeric string
 * @param begin_ix Start position of the numeric string
 * @param end_ix End position of the numeric string (exclusive)
 * @param padding_character Leading characters equal to this are skipped
 * @param value Returns the string as a number; untouched on failure
 * @return true if conversion succeeds, false if the field extends past the end of the string or
 * contains a non-digit after the padding
 */
static bool convert_string_to_number(
        std::string const& str,
        size_t begin_ix,
        size_t end_ix,
        char padding_character,
        int& value
) {
    if (end_ix > str.length()) {
        // The field doesn't fit in the string
        return false;
    }

    // Consume padding characters
    size_t ix = begin_ix;
    while (ix < end_ix && padding_character == str[ix]) {
        ++ix;
    }

    // Convert remaining characters to number
    int converted_value = 0;
    for (; ix < end_ix; ++ix) {
        char const c = str[ix];
        if (c < '0' || c > '9') {
            return false;
        }
        converted_value = converted_value * 10 + (c - '0');
    }

    value = converted_value;
    return true;
}

/**
 * Converts a padded decimal integer string with no trailing zeros (from a larger string) to an
 * integer. The digits are treated as the leading digits of a `max_digits`-wide field, so with
 * max_digits = 3: "476" -> 476, "47" -> 470, "04" -> 40, "4" -> 400, "0" -> 0.
 * @param str String containing the numeric string
 * @param max_digits Maximum number of digits in the field
 * @param begin_ix Start position of the numeric string
 * @param end_ix Potential end position (exclusive) of the numeric string; on success, returns the
 * position just past the last digit consumed
 * @param value Returns the string as a number (modified even on failure)
 * @return true if conversion succeeds, false if no digit was found or if a multi-digit field
 * ends in 0
 */
static bool convert_string_to_number_notz(
        std::string const& str,
        size_t max_digits,
        size_t begin_ix,
        size_t& end_ix,
        int& value
) {
    value = 0;

    // Never scan past the end of the string or past the width of the field
    size_t scan_end_ix = end_ix;
    if (str.length() < scan_end_ix) {
        scan_end_ix = str.length();
    }
    if (begin_ix + max_digits < scan_end_ix) {
        scan_end_ix = begin_ix + max_digits;
    }

    size_t num_digits = 0;
    bool ends_with_zero = false;
    for (size_t ix = begin_ix; ix < scan_end_ix; ++ix) {
        char const c = str[ix];
        if (c < '0' || c > '9') {
            break;
        }
        ends_with_zero = ('0' == c);
        value = value * 10 + (c - '0');
        ++num_digits;
    }

    if (0 == num_digits) {
        // An empty field can't be reproduced when the timestamp is formatted again
        return false;
    }
    if (ends_with_zero && num_digits > 1) {
        // The formatter never emits a trailing zero, so this isn't a value it produced
        return false;
    }

    end_ix = begin_ix + num_digits;

    // Scale the digits read up to the full width of the field
    for (size_t i = num_digits; i < max_digits; ++i) {
        value *= 10;
    }

    return true;
}
2015-01-31T15:50:45.392 + patterns.emplace_back(0, "%Y-%m-%dT%H:%M:%S.%3"); + // E.g. 2015-01-31T15:50:45,392 + patterns.emplace_back(0, "%Y-%m-%dT%H:%M:%S,%3"); + // E.g. [2015-01-31T15:50:45 + patterns.emplace_back(0, "[%Y-%m-%dT%H:%M:%S"); + // E.g. [20170106-16:56:41] + patterns.emplace_back(0, "[%Y%m%d-%H:%M:%S]"); + // E.g. 2015-01-31 15:50:45,392 + patterns.emplace_back(0, "%Y-%m-%d %H:%M:%S,%3"); + // E.g. 2015-01-31 15:50:45.392 + patterns.emplace_back(0, "%Y-%m-%d %H:%M:%S.%3"); + // E.g. [2015-01-31 15:50:45,085] + patterns.emplace_back(0, "[%Y-%m-%d %H:%M:%S,%3]"); + // E.g. 2015-01-31 15:50:45 + patterns.emplace_back(0, "%Y-%m-%d %H:%M:%S"); + // E.g. Start-Date: 2015-01-31 15:50:45 + patterns.emplace_back(1, "%Y-%m-%d %H:%M:%S"); + // E.g. 2015/01/31 15:50:45 + patterns.emplace_back(0, "%Y/%m/%d %H:%M:%S"); + // E.g. 15/01/31 15:50:45 + patterns.emplace_back(0, "%y/%m/%d %H:%M:%S"); + // E.g. 150131 9:50:45 + patterns.emplace_back(0, "%y%m%d %k:%M:%S"); + // E.g. 01 Jan 2016 15:50:17,085 + patterns.emplace_back(0, "%d %b %Y %H:%M:%S,%3"); + // E.g. Jan 01, 2016 3:50:17 PM + patterns.emplace_back(0, "%b %d, %Y %l:%M:%S %p"); + // E.g. January 31, 2015 15:50 + patterns.emplace_back(0, "%B %d, %Y %H:%M"); + // E.g. E [31/Jan/2015:15:50:45 + patterns.emplace_back(1, "[%d/%b/%Y:%H:%M:%S"); + // E.g. localhost - - [01/Jan/2016:15:50:17 + // E.g. 192.168.4.5 - - [01/Jan/2016:15:50:17 + patterns.emplace_back(3, "[%d/%b/%Y:%H:%M:%S"); + // E.g. 192.168.4.5 - - [01/01/2016:15:50:17 + patterns.emplace_back(3, "[%d/%m/%Y:%H:%M:%S"); + // E.g. INFO [main] 2015-01-31 15:50:45,085 + patterns.emplace_back(2, "%Y-%m-%d %H:%M:%S,%3"); + // E.g. Started POST "/api/v3/internal/allowed" for 127.0.0.1 at 2017-06-18 00:20:44 + patterns.emplace_back(6, "%Y-%m-%d %H:%M:%S"); + // E.g. update-alternatives 2015-01-31 15:50:45 + patterns.emplace_back(1, "%Y-%m-%d %H:%M:%S"); + // E.g. 
ERROR: apport (pid 4557) Sun Jan 1 15:50:45 2015 + patterns.emplace_back(4, "%a %b %e %H:%M:%S %Y"); + // E.g. <<<2016-11-10 03:02:29:936 + patterns.emplace_back(0, "<<<%Y-%m-%d %H:%M:%S:%3"); + + // TODO These patterns are imprecise and will prevent searching by timestamp; but for now, + // it's no worse than not parsing a timestamp E.g. Jan 21 11:56:42 + patterns.emplace_back(0, "%b %d %H:%M:%S"); + // E.g. 01-21 11:56:42.392 + patterns.emplace_back(0, "%m-%d %H:%M:%S.%3"); + + // Initialize m_known_ts_patterns with vector's contents + m_known_ts_patterns_len = patterns.size(); + m_known_ts_patterns = std::make_unique(m_known_ts_patterns_len); + for (size_t i = 0; i < patterns.size(); ++i) { + m_known_ts_patterns[i] = patterns[i]; + } +} + +TimestampPattern const* TimestampPattern::search_known_ts_patterns( + string const& line, + epochtime_t& timestamp, + size_t& timestamp_begin_pos, + size_t& timestamp_end_pos +) { + for (size_t i = 0; i < m_known_ts_patterns_len; ++i) { + if (m_known_ts_patterns[i] + .parse_timestamp(line, timestamp, timestamp_begin_pos, timestamp_end_pos)) + { + return &m_known_ts_patterns[i]; + } + } + + timestamp_begin_pos = string::npos; + timestamp_end_pos = string::npos; + return nullptr; +} + +string const& TimestampPattern::get_format() const { + return m_format; +} + +uint8_t TimestampPattern::get_num_spaces_before_ts() const { + return m_num_spaces_before_ts; +} + +bool TimestampPattern::is_empty() const { + return m_format.empty(); +} + +void TimestampPattern::clear() { + m_num_spaces_before_ts = 0; + m_format.clear(); +} + +bool TimestampPattern::parse_timestamp( + string const& line, + epochtime_t& timestamp, + size_t& timestamp_begin_pos, + size_t& timestamp_end_pos +) const { + size_t line_ix = 0; + size_t const line_length = line.length(); + + // Find beginning of timestamp + int num_spaces_found; + for (num_spaces_found = 0; num_spaces_found < m_num_spaces_before_ts && line_ix < line_length; + ++line_ix) + { + if (' ' == 
line[line_ix]) { + ++num_spaces_found; + } + } + if (num_spaces_found < m_num_spaces_before_ts) { + return false; + } + size_t ts_begin_ix = line_ix; + + int date = 1; + int month = 1; + int year = 1970; + int hour = 0; + bool uses_12_hour_clock = false; + int minute = 0; + int second = 0; + int millisecond = 0; + bool is_pm = false; + + size_t const format_length = m_format.length(); + size_t format_ix = 0; + bool is_specifier = false; + for (; format_ix < format_length && line_ix < line_length; ++format_ix) { + if (false == is_specifier) { + if ('%' == m_format[format_ix]) { + is_specifier = true; + } else { + if (m_format[format_ix] != line[line_ix]) { + // Doesn't match + return false; + } + ++line_ix; + } + } else { + // Parse fields + switch (m_format[format_ix]) { + case '%': + if ('%' != line[line_ix]) { + return false; + } + ++line_ix; + break; + + case 'y': { // Zero-padded year in century + constexpr int cFieldLength = 2; + if (line_ix + cFieldLength > line_length) { + // Too short + return false; + } + + int value; + if (false + == convert_string_to_number( + line, + line_ix, + line_ix + cFieldLength, + '0', + value + ) + || value < 0 || value > 99) + { + return false; + } + year = value; + // Year >= 69 treated as 1900s, year below 69 treated as 2000s + if (year >= 69) { + year += 1900; + } else { + year += 2000; + } + line_ix += cFieldLength; + + break; + } + + case 'Y': { // Zero-padded year with century + constexpr int cFieldLength = 4; + if (line_ix + cFieldLength > line_length) { + // Too short + return false; + } + + int value; + if (false + == convert_string_to_number( + line, + line_ix, + line_ix + cFieldLength, + '0', + value + ) + || value < 0 || value > 9999) + { + return false; + } + year = value; + line_ix += cFieldLength; + + break; + } + + case 'B': { // Month name + bool match_found = false; + for (int month_ix = 0; !match_found && month_ix < cNumMonths; ++month_ix) { + size_t const length = strlen(cMonthNames[month_ix]); + if (0 == 
line.compare(line_ix, length, cMonthNames[month_ix])) { + month = month_ix + 1; + match_found = true; + line_ix += length; + } + } + if (false == match_found) { + return false; + } + + break; + } + + case 'b': { // Abbreviated month name + bool match_found = false; + for (int month_ix = 0; !match_found && month_ix < cNumMonths; ++month_ix) { + size_t const length = strlen(cAbbrevMonthNames[month_ix]); + if (0 == line.compare(line_ix, length, cAbbrevMonthNames[month_ix])) { + month = month_ix + 1; + match_found = true; + line_ix += length; + } + } + if (false == match_found) { + return false; + } + + break; + } + + case 'm': { // Zero-padded month + constexpr int cFieldLength = 2; + if (line_ix + cFieldLength > line_length) { + // Too short + return false; + } + + int value; + if (false + == convert_string_to_number( + line, + line_ix, + line_ix + cFieldLength, + '0', + value + ) + || value < 1 || value > 12) + { + return false; + } + month = value; + line_ix += cFieldLength; + + break; + } + + case 'd': { // Zero-padded day in month + constexpr int cFieldLength = 2; + if (line_ix + cFieldLength > line_length) { + // Too short + return false; + } + + int value; + if (false + == convert_string_to_number( + line, + line_ix, + line_ix + cFieldLength, + '0', + value + ) + || value < 1 || value > 31) + { + return false; + } + date = value; + line_ix += cFieldLength; + + break; + } + + case 'e': { // Space-padded day in month + constexpr int cFieldLength = 2; + if (line_ix + cFieldLength > line_length) { + // Too short + return false; + } + + int value; + if (false + == convert_string_to_number( + line, + line_ix, + line_ix + cFieldLength, + ' ', + value + ) + || value < 1 || value > 31) + { + return false; + } + date = value; + line_ix += cFieldLength; + + break; + } + + case 'a': { // Abbreviated day of week + bool match_found = false; + for (int day_ix = 0; !match_found && day_ix < cNumDaysInWeek; ++day_ix) { + size_t const abbrev_length = 
strlen(cAbbrevDaysOfWeek[day_ix]); + if (0 == line.compare(line_ix, abbrev_length, cAbbrevDaysOfWeek[day_ix])) { + match_found = true; + line_ix += abbrev_length; + } + } + if (false == match_found) { + return false; + } + // Weekday is not useful in determining absolute timestamp, so we don't do + // anything with it + + break; + } + + case 'p': { // Part of day + if (0 == line.compare(line_ix, 2, "AM")) { + is_pm = false; + } else if (0 == line.compare(line_ix, 2, "PM")) { + is_pm = true; + } else { + return false; + } + line_ix += 2; + + break; + } + + case 'H': { // Zero-padded hour on 24-hour clock + constexpr int cFieldLength = 2; + if (line_ix + cFieldLength > line_length) { + // Too short + return false; + } + + int value; + if (false + == convert_string_to_number( + line, + line_ix, + line_ix + cFieldLength, + '0', + value + ) + || value < 0 || value > 23) + { + return false; + } + hour = value; + line_ix += cFieldLength; + + break; + } + + case 'k': { // Space-padded hour on 24-hour clock + constexpr int cFieldLength = 2; + if (line_ix + cFieldLength > line_length) { + // Too short + return false; + } + + int value; + if (false + == convert_string_to_number( + line, + line_ix, + line_ix + cFieldLength, + ' ', + value + ) + || value < 0 || value > 23) + { + return false; + } + hour = value; + line_ix += cFieldLength; + + break; + } + + case 'I': { // Zero-padded hour on 12-hour clock + constexpr int cFieldLength = 2; + if (line_ix + cFieldLength > line_length) { + // Too short + return false; + } + + int value; + if (false + == convert_string_to_number( + line, + line_ix, + line_ix + cFieldLength, + '0', + value + ) + || value < 1 || value > 12) + { + return false; + } + hour = value; + uses_12_hour_clock = true; + line_ix += cFieldLength; + + break; + } + + case 'l': { // Space-padded hour on 12-hour clock + constexpr int cFieldLength = 2; + if (line_ix + cFieldLength > line_length) { + // Too short + return false; + } + + int value; + if (false + == 
convert_string_to_number( + line, + line_ix, + line_ix + cFieldLength, + ' ', + value + ) + || value < 1 || value > 12) + { + return false; + } + hour = value; + uses_12_hour_clock = true; + line_ix += cFieldLength; + + break; + } + + case 'M': { // Zero-padded minute + constexpr int cFieldLength = 2; + if (line_ix + cFieldLength > line_length) { + // Too short + return false; + } + + int value; + if (false + == convert_string_to_number( + line, + line_ix, + line_ix + cFieldLength, + '0', + value + ) + || value < 0 || value > 59) + { + return false; + } + minute = value; + line_ix += cFieldLength; + + break; + } + + case 'S': { // Zero-padded second + constexpr int cFieldLength = 2; + if (line_ix + cFieldLength > line_length) { + // Too short + return false; + } + + int value; + if (false + == convert_string_to_number( + line, + line_ix, + line_ix + cFieldLength, + '0', + value + ) + || value < 0 || value > 60) + { + return false; + } + second = value; + line_ix += cFieldLength; + + break; + } + + case '3': { // Zero-padded millisecond + constexpr int cFieldLength = 3; + if (line_ix + cFieldLength > line_length) { + // Too short + return false; + } + + int value; + if (false + == convert_string_to_number( + line, + line_ix, + line_ix + cFieldLength, + '0', + value + ) + || value < 0 || value > 999) + { + return false; + } + millisecond = value; + line_ix += cFieldLength; + + break; + } + + case 'T': { // Zero-padded millisecond no trailing zero + constexpr int cMaxFieldLength = 3; + + int value; + size_t new_line_ix = line_ix + cMaxFieldLength; + if (!convert_string_to_number_notz( + line, + cMaxFieldLength, + line_ix, + new_line_ix, + value + ) + || value < 0 || value > 999) + { + return false; + } + millisecond = value; + line_ix = new_line_ix; + + break; + } + + default: + return false; + } + is_specifier = false; + } + } + if (format_ix < format_length) { + // Complete format string not present in line + return false; + } + + // Process parsed fields + if 
(uses_12_hour_clock) { + if (12 == hour) { + // 12s require special handling + if (false == is_pm) { + // hour == 12AM which is 0 on 24-hour clock + hour = 0; + } + } else { + if (is_pm) { + // All PMs except 12 should be +12, e.g. 1PM becomes (1 + 12)PM + hour += 12; + } + } + } + + // Create complete date + auto year_month_date = date::year(year) / month / date; + if (false == year_month_date.ok()) { + return false; + } + // Convert complete timestamp into a time point with millisecond resolution + auto timestamp_point = date::sys_days(year_month_date) + std::chrono::hours(hour) + + std::chrono::minutes(minute) + std::chrono::seconds(second) + + std::chrono::milliseconds(millisecond); + // Get time point since epoch + auto unix_epoch_point = date::sys_days(date::year(1970) / 1 / 1); + // Get timestamp since epoch + auto duration_since_epoch = timestamp_point - unix_epoch_point; + // Convert to raw milliseconds + timestamp = duration_since_epoch.count(); + + timestamp_begin_pos = ts_begin_ix; + timestamp_end_pos = line_ix; + + return true; +} + +void TimestampPattern::insert_formatted_timestamp(epochtime_t timestamp, string& msg) const { + size_t msg_length = msg.length(); + + string new_msg; + // We add 50 as an estimate of the timestamp's length + new_msg.reserve(msg_length + 50); + + // Find where timestamp should go + size_t ts_begin_ix = 0; + int num_spaces_found; + for (num_spaces_found = 0; + num_spaces_found < m_num_spaces_before_ts && ts_begin_ix < msg_length; + ++ts_begin_ix) + { + if (' ' == msg[ts_begin_ix]) { + ++num_spaces_found; + } + } + if (num_spaces_found < m_num_spaces_before_ts) { + SPDLOG_ERROR( + "{} has {} spaces, but pattern has {}", + msg.c_str(), + num_spaces_found, + m_num_spaces_before_ts + ); + throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__); + } + + // Copy text before timestamp + new_msg.assign(msg, 0, ts_begin_ix); + + // Separate parts of timestamp + auto timestamp_point + = date::sys_days(date::year(1970) / 1 / 1) 
+ std::chrono::milliseconds(timestamp); + auto timestamp_date = date::floor(timestamp_point); + int day_of_week_ix + = (date::year_month_weekday(timestamp_date).weekday_indexed().weekday() - date::Sunday) + .count(); + auto year_month_date = date::year_month_day(timestamp_date); + unsigned date = (unsigned)year_month_date.day(); + unsigned month = (unsigned)year_month_date.month(); + int year = (int)year_month_date.year(); + + auto time_of_day_duration = timestamp_point - timestamp_date; + auto time_of_day = date::make_time(time_of_day_duration); + int hour = time_of_day.hours().count(); + int minute = time_of_day.minutes().count(); + int second = time_of_day.seconds().count(); + int millisecond = time_of_day.subseconds().count(); + + size_t const format_length = m_format.length(); + bool is_specifier = false; + for (size_t format_ix = 0; format_ix < format_length; ++format_ix) { + if (false == is_specifier) { + if ('%' == m_format[format_ix]) { + is_specifier = true; + } else { + new_msg += m_format[format_ix]; + } + } else { + // Parse fields + switch (m_format[format_ix]) { + case '%': + new_msg += m_format[format_ix]; + break; + + case 'y': { // Zero-padded year in century + int value = year; + if (year >= 2000) { + // year must be in range [2000,2068] + value -= 2000; + } else { + // year must be in range [1969,1999] + value -= 1900; + } + append_padded_value(value, '0', 2, new_msg); + break; + } + + case 'Y': // Zero-padded year with century + append_padded_value(year, '0', 4, new_msg); + break; + + case 'B': // Month name + new_msg += cMonthNames[month - 1]; + break; + + case 'b': // Abbreviated month name + new_msg += cAbbrevMonthNames[month - 1]; + break; + + case 'm': // Zero-padded month + append_padded_value(month, '0', 2, new_msg); + break; + + case 'd': // Zero-padded day in month + append_padded_value(date, '0', 2, new_msg); + break; + + case 'e': // Space-padded day in month + append_padded_value(date, ' ', 2, new_msg); + break; + + case 'a': // 
Abbreviated day of week + new_msg += cAbbrevDaysOfWeek[day_of_week_ix]; + break; + + case 'p': { // Part of day + if (hour > 11) { + new_msg += "PM"; + } else { + new_msg += "AM"; + } + break; + } + + case 'H': // Zero-padded hour on 24-hour clock + append_padded_value(hour, '0', 2, new_msg); + break; + + case 'k': // Space-padded hour on 24-hour clock + append_padded_value(hour, ' ', 2, new_msg); + break; + + case 'I': { // Zero-padded hour on 12-hour clock + int value = hour; + if (0 == value) { + value = 12; // Midnight is 12AM + } else if (value > 12) { // 13-23 map to 1-11PM; noon (12) is unchanged + value -= 12; + } + append_padded_value(value, '0', 2, new_msg); + break; + } + + case 'l': { // Space-padded hour on 12-hour clock + int value = hour; + if (0 == value) { + value = 12; // Midnight is 12AM + } else if (value > 12) { // 13-23 map to 1-11PM; noon (12) is unchanged + value -= 12; + } + append_padded_value(value, ' ', 2, new_msg); + break; + } + + case 'M': // Zero-padded minute + append_padded_value(minute, '0', 2, new_msg); + break; + + case 'S': // Zero-padded second + append_padded_value(second, '0', 2, new_msg); + break; + + case '3': // Zero-padded millisecond + append_padded_value(millisecond, '0', 3, new_msg); + break; + + case 'T': // Zero-padded millisecond no trailing 0 + append_padded_value_notz(millisecond, '0', 3, new_msg); + break; + + default: { + throw OperationFailed(ErrorCodeUnsupported, __FILENAME__, __LINE__); + } + } + is_specifier = false; + } + } + + // Copy text after timestamp + new_msg.append(msg, ts_begin_ix, string::npos); + + msg = new_msg; +} + +bool operator==(TimestampPattern const& lhs, TimestampPattern const& rhs) { + return (lhs.m_num_spaces_before_ts == rhs.m_num_spaces_before_ts && lhs.m_format == rhs.m_format + ); +} + +bool operator!=(TimestampPattern const& lhs, TimestampPattern const& rhs) { + return !(lhs == rhs); +} +} // namespace clp_s diff --git a/components/core/src/clp_s/TimestampPattern.hpp b/components/core/src/clp_s/TimestampPattern.hpp new file mode 100644 index 000000000..f500df868 --- /dev/null +++
b/components/core/src/clp_s/TimestampPattern.hpp @@ -0,0 +1,166 @@ +// Code from CLP + +#ifndef CLP_S_TIMESTAMPPATTERN_HPP +#define CLP_S_TIMESTAMPPATTERN_HPP + +#include +#include +#include +#include + +#include "Defs.hpp" +#include "FileWriter.hpp" +#include "TraceableException.hpp" + +namespace clp_s { +/** + * Class representing a timestamp pattern with methods for both parsing and formatting timestamps + * using the pattern. A format string contains directives specifying how a string should be parsed + * into a timestamp or how a timestamp should be formatted into a string. E.g., "[%H:%M:%S]" can + * parse from or format to "[23:45:19]" + * + * The supported directives are the same as strptime except that we require an exact number of + * spaces/padding digits so that we can reproduce the timestamp exactly. There are also additions + * beyond what strptime provides. + * + * The following directives are supported: + * - % Literal % + * - y 2-digit 0-padded year in century. [69,99] refers to years [1969,1999]. [00,68] refers to + * years [2000,2068]. + * - Y 4-digit 0-padded year including century (0000-9999) + * - B Full month name (e.g., "January") + * - b Abbreviated month name (e.g., "Jan") + * - m 2-digit 0-padded month (01-12) + * - d 2-digit 0-padded day in month (01-31) + * - e 2-character space-padded day in month ( 1-31) + * - a Abbreviated day of week (e.g., "Mon") + * - p Part of day (AM/PM) + * - H 2-digit 0-padded hour on 24-hour clock (00-23) + * - k 2-character space-padded hour on 24-hour clock ( 0-23) + * - I 2-digit 0-padded hour on 12-hour clock (01-12) + * - l 2-character space-padded hour on 12-hour clock ( 1-12) + * - M 2-digit 0-padded minute (00-59) + * - S 2-digit 0-padded second (00-60) (60 to account for leap seconds) + * - 3 0-padded millisecond (000-999) + * - T 0-padded millisecond no trailing 0 (000)-999) e.g. 
(000), 9(00), 99(0), 099 + */ +class TimestampPattern { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + + // Methods + char const* what() const noexcept override { return "TimestampPattern operation failed"; } + }; + + // Constructors + TimestampPattern() : m_num_spaces_before_ts(0) {} + + TimestampPattern(uint8_t num_spaces_before_ts, std::string format) + : m_num_spaces_before_ts(num_spaces_before_ts), + m_format(std::move(format)) {} + + // Methods + /** + * Static initializer for class. This must be called before using the class. + */ + static void init(); + + /** + * Searches for a known timestamp pattern which can parse the timestamp from the given line, and + * if found, parses the timestamp + * @param line + * @param timestamp Parsed timestamp + * @param timestamp_begin_pos + * @param timestamp_end_pos + * @return pointer to the timestamp pattern if found, nullptr otherwise + */ + static TimestampPattern const* search_known_ts_patterns( + std::string const& line, + epochtime_t& timestamp, + size_t& timestamp_begin_pos, + size_t& timestamp_end_pos + ); + + /** + * Gets the timestamp pattern's format string + * @return See description + */ + std::string const& get_format() const; + + /** + * Gets the number of spaces before the timestamp in a typical message + * @return See description + */ + uint8_t get_num_spaces_before_ts() const; + + /** + * Gets if the timestamp pattern is empty + * @return true if empty, false otherwise + */ + bool is_empty() const; + + /** + * Clears the pattern + */ + void clear(); + + /** + * Tries to parse the timestamp from the given line + * @param line + * @param timestamp Parsed timestamp + * @param timestamp_begin_pos + * @param timestamp_end_pos + * @return true if parsed successfully, false otherwise + */ + bool 
parse_timestamp( + std::string const& line, + epochtime_t& timestamp, + size_t& timestamp_begin_pos, + size_t& timestamp_end_pos + ) const; + + /** + * Inserts the timestamp into the given message using this pattern + * @param timestamp + * @param msg + * @throw TimestampPattern::OperationFailed if the pattern contains unsupported format + * specifiers or the message cannot fit the timestamp pattern + */ + + void insert_formatted_timestamp(epochtime_t timestamp, std::string& msg) const; + + /** + * Compares two timestamp patterns for equality + * @param lhs + * @param rhs + * @return true if equal, false otherwise + */ + friend bool operator==(TimestampPattern const& lhs, TimestampPattern const& rhs); + + /** + * Compares two timestamp patterns for inequality + * @param lhs + * @param rhs + * @return true if not equal, false otherwise + */ + friend bool operator!=(TimestampPattern const& lhs, TimestampPattern const& rhs); + +private: + // Variables + static std::unique_ptr m_known_ts_patterns; + static size_t m_known_ts_patterns_len; + + // The number of spaces before the timestamp in a message + // E.g.
in "localhost - - [01/Jan/2016:15:50:17", there are 3 spaces before the timestamp + // ^ ^ ^ + uint8_t m_num_spaces_before_ts; + std::string m_format; +}; +} // namespace clp_s + +#endif // CLP_S_TIMESTAMPPATTERN_HPP diff --git a/components/core/src/clp_s/TraceableException.hpp b/components/core/src/clp_s/TraceableException.hpp new file mode 100644 index 000000000..e64ffb617 --- /dev/null +++ b/components/core/src/clp_s/TraceableException.hpp @@ -0,0 +1,49 @@ +// Code from CLP + +#ifndef CLP_S_TRACEABLEEXCEPTION_HPP +#define CLP_S_TRACEABLEEXCEPTION_HPP + +#include +#include + +#include "ErrorCode.hpp" + +#define __FILENAME__ ((__FILE__) + SOURCE_PATH_SIZE) + +namespace clp_s { +class TraceableException : public std::exception { +public: + // Constructors + TraceableException(ErrorCode error_code, char const* const filename, int const line_number) + : m_error_code(error_code), + m_filename(filename), + m_line_number(line_number) { + m_message += std::string(m_filename) + ":" + std::to_string(m_line_number) + + " Error code: " + std::to_string(m_error_code) + "\n"; + } + + // Copy constructor / assignment operators + TraceableException(TraceableException const&) = default; + TraceableException& operator=(TraceableException const&) = default; + + // Methods + ErrorCode get_error_code() const { return m_error_code; } + + char const* get_filename() const { return m_filename; } + + int get_line_number() const { return m_line_number; } + + char const* what() const noexcept override { return m_message.c_str(); } + +protected: + std::string m_message; + +private: + // Variables + ErrorCode m_error_code; + char const* m_filename; + int m_line_number; +}; +} // namespace clp_s + +#endif // CLP_S_TRACEABLEEXCEPTION_HPP diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp new file mode 100644 index 000000000..cf59f3edb --- /dev/null +++ b/components/core/src/clp_s/Utils.cpp @@ -0,0 +1,431 @@ +#include "Utils.hpp" + +#include + +using 
std::string; +using std::string_view; + +namespace clp_s { +bool FileUtils::find_all_files(std::string const& path, std::vector& file_paths) { + try { + if (false == boost::filesystem::is_directory(path)) { + // path is a file + file_paths.push_back(path); + return true; + } + + if (boost::filesystem::is_empty(path)) { + // path is an empty directory + return true; + } + + // Iterate directory + boost::filesystem::recursive_directory_iterator iter( + path, + boost::filesystem::symlink_option::recurse + ); + boost::filesystem::recursive_directory_iterator end; + for (; iter != end; ++iter) { + // Check if current entry is an empty directory or a file + if (boost::filesystem::is_directory(iter->path())) { + if (boost::filesystem::is_empty(iter->path())) { + iter.no_push(); + } + } else { + file_paths.push_back(iter->path().string()); + } + } + } catch (boost::filesystem::filesystem_error& exception) { + SPDLOG_ERROR( + "Failed to find files/directories at '{}' - {}.", + path.c_str(), + exception.what() + ); + return false; + } + + return true; +} + +bool FileUtils::validate_path(std::vector const& paths) { + bool all_paths_exist = true; + for (auto const& path : paths) { + if (false == boost::filesystem::exists(path)) { + SPDLOG_ERROR("'{}' does not exist.", path.c_str()); + all_paths_exist = false; + } + } + + return all_paths_exist; +} + +bool StringUtils::get_bounds_of_next_var(string const& msg, size_t& begin_pos, size_t& end_pos) { + auto const msg_length = msg.length(); + if (end_pos >= msg_length) { + return false; + } + + while (true) { + begin_pos = end_pos; + // Find next non-delimiter + for (; begin_pos < msg_length; ++begin_pos) { + if (false == is_delim(msg[begin_pos])) { + break; + } + } + if (msg_length == begin_pos) { + // Early exit for performance + return false; + } + + bool contains_decimal_digit = false; + bool contains_alphabet = false; + + // Find next delimiter + end_pos = begin_pos; + for (; end_pos < msg_length; ++end_pos) { + char c = 
msg[end_pos]; + if (is_decimal_digit(c)) { + contains_decimal_digit = true; + } else if (is_alphabet(c)) { + contains_alphabet = true; + } else if (is_delim(c)) { + break; + } + } + + // Treat token as variable if: + // - it contains a decimal digit, or + // - it's directly preceded by an equals sign and contains an alphabet, or + // - it could be a multi-digit hex value + if (contains_decimal_digit + || (begin_pos > 0 && '=' == msg[begin_pos - 1] && contains_alphabet) + || could_be_multi_digit_hex_value(msg, begin_pos, end_pos)) + { + break; + } + } + + return (msg_length != begin_pos); +} + +size_t StringUtils::find_first_of( + string const& haystack, + char const* needles, + size_t search_start_pos, + size_t& needle_ix +) { + size_t haystack_length = haystack.length(); + size_t needles_length = strlen(needles); + for (size_t i = search_start_pos; i < haystack_length; ++i) { + for (needle_ix = 0; needle_ix < needles_length; ++needle_ix) { + if (haystack[i] == needles[needle_ix]) { + return i; + } + } + } + + return string::npos; +} + +string StringUtils::replace_characters( + char const* characters_to_escape, + char const* replacement_characters, + string const& value, + bool escape +) { + string new_value; + size_t search_start_pos = 0; + while (true) { + size_t replace_char_ix; + size_t char_to_replace_pos + = find_first_of(value, characters_to_escape, search_start_pos, replace_char_ix); + if (string::npos == char_to_replace_pos) { + new_value.append(value, search_start_pos, string::npos); + break; + } else { + new_value.append(value, search_start_pos, char_to_replace_pos - search_start_pos); + if (escape) { + new_value += "\\"; + } + new_value += replacement_characters[replace_char_ix]; + search_start_pos = char_to_replace_pos + 1; + } + } + return new_value; +} + +void StringUtils::to_lower(string& str) { + std::transform(str.cbegin(), str.cend(), str.begin(), [](unsigned char c) { + return std::tolower(c); + }); +} + +bool StringUtils::is_wildcard(char c) { 
+ static constexpr char cWildcards[] = "?*"; + for (size_t i = 0; i < strlen(cWildcards); ++i) { + if (cWildcards[i] == c) { + return true; + } + } + return false; +} + +string StringUtils::clean_up_wildcard_search_string(string_view str) { + string cleaned_str; + + bool is_escaped = false; + auto str_end = str.cend(); + for (auto current = str.cbegin(); current != str_end;) { + auto c = *current; + if (is_escaped) { + is_escaped = false; + + if (is_wildcard(c) || '\\' == c) { + // Keep escaping if c is a wildcard character or an escape character + cleaned_str += '\\'; + } + cleaned_str += c; + ++current; + } else if ('*' == c) { + cleaned_str += c; + + // Skip over all '*' to find the next non-'*' + do { + ++current; + } while (current != str_end && '*' == *current); + } else { + if ('\\' == c) { + is_escaped = true; + } else { + cleaned_str += c; + } + ++current; + } + } + + return cleaned_str; +} + +bool StringUtils::advance_tame_to_next_match( + char const*& tame_current, + char const*& tame_bookmark, + char const* tame_end, + char const*& wild_current, + char const*& wild_bookmark +) { + auto w = *wild_current; + if ('?' != w) { + // No need to check for '*' since the caller ensures wild doesn't + // contain consecutive '*' + + // Handle escaped characters + if ('\\' == w) { + ++wild_current; + // This is safe without a bounds check since this the caller + // ensures there are no dangling escape characters + w = *wild_current; + } + + // Advance tame_current until it matches wild_current + while (true) { + if (tame_end == tame_current) { + // Wild group is longer than last group in tame, so + // can't match + // e.g. 
"*abc" doesn't match "zab" + return false; + } + auto t = *tame_current; + if (t == w) { + break; + } + ++tame_current; + } + } + + tame_bookmark = tame_current; + + return true; +} + +bool StringUtils::wildcard_match_unsafe( + string_view tame, + string_view wild, + bool case_sensitive_match +) { + if (case_sensitive_match) { + return wildcard_match_unsafe_case_sensitive(tame, wild); + } else { + // We convert to lowercase (rather than uppercase) anticipating that + // callers use lowercase more frequently, so little will need to change. + string lowercase_tame(tame); + to_lower(lowercase_tame); + string lowercase_wild(wild); + to_lower(lowercase_wild); + return wildcard_match_unsafe_case_sensitive(lowercase_tame, lowercase_wild); + } +} + +/** + * The algorithm basically works as follows: + * Given a wild string "*abc*def*ghi*", it can be broken into groups of + * characters delimited by one or more '*' characters. The goal of the + * algorithm is then to determine whether the tame string contains each of + * those groups in the same order. + * + * Thus, the algorithm: + * 1. searches for the start of one of these groups in wild, + * 2. searches for a group in tame starting with the same character, and then + * 3. checks if the two match. If not, the search repeats with the next group in + * tame. 
+ */ +bool StringUtils::wildcard_match_unsafe_case_sensitive(string_view tame, string_view wild) { + auto const tame_length = tame.length(); + auto const wild_length = wild.length(); + char const* tame_current = tame.data(); + char const* wild_current = wild.data(); + char const* tame_bookmark = nullptr; + char const* wild_bookmark = nullptr; + char const* tame_end = tame_current + tame_length; + char const* wild_end = wild_current + wild_length; + + // Handle wild or tame being empty + if (0 == wild_length) { + return 0 == tame_length; + } else { + if (0 == tame_length) { + return "*" == wild; + } + } + + char w; + char t; + while (true) { + bool is_escaped = false; // Escaping only applies to the wild character examined in this iteration + w = *wild_current; + if ('*' == w) { + ++wild_current; + if (wild_end == wild_current) { + // Trailing '*' means everything remaining in tame will match + return true; + } + + // Set wild and tame bookmarks + wild_bookmark = wild_current; + if (!advance_tame_to_next_match( + tame_current, + tame_bookmark, + tame_end, + wild_current, + wild_bookmark + )) + { + return false; + } + } else { + // Handle escaped characters + if ('\\' == w) { + is_escaped = true; + ++wild_current; + // This is safe without a bounds check since the caller + // ensures there are no dangling escape characters + w = *wild_current; + } + + // Handle a mismatch + t = *tame_current; + if (false == ((false == is_escaped && '?'
== w) || t == w)) { + if (nullptr == wild_bookmark) { + // No bookmark to return to + return false; + } + + wild_current = wild_bookmark; + tame_current = tame_bookmark + 1; + if (!advance_tame_to_next_match( + tame_current, + tame_bookmark, + tame_end, + wild_current, + wild_bookmark + )) + { + return false; + } + } + } + + ++tame_current; + ++wild_current; + + // Handle reaching the end of tame or wild + if (tame_end == tame_current) { + return (wild_end == wild_current + || ('*' == *wild_current && (wild_current + 1) == wild_end)); + } else { + if (wild_end == wild_current) { + if (nullptr == wild_bookmark) { + // No bookmark to return to + return false; + } else { + wild_current = wild_bookmark; + tame_current = tame_bookmark + 1; + if (!advance_tame_to_next_match( + tame_current, + tame_bookmark, + tame_end, + wild_current, + wild_bookmark + )) + { + return false; + } + } + } + } + } +} + +bool StringUtils::convert_string_to_int64(std::string_view raw, int64_t& converted) { + auto raw_end = raw.cend(); + auto result = std::from_chars(raw.cbegin(), raw_end, converted); + if (raw_end != result.ptr) { + return false; + } else { + return result.ec == std::errc(); + } +} + +bool StringUtils::convert_string_to_double(std::string const& raw, double& converted) { + if (raw.empty()) { + // Can't convert an empty string + return false; + } + + char const* c_str = raw.c_str(); + char* end_ptr; + // Reset errno so we can detect a new error + errno = 0; + double raw_as_double = strtod(c_str, &end_ptr); + if (ERANGE == errno || (end_ptr - c_str) < raw.length()) { + return false; + } + converted = raw_as_double; + return true; +} + +void StringUtils::tokenize_column_descriptor( + std::string const& descriptor, + std::vector& tokens +) { + // TODO: handle escaped . 
correctly + auto start = 0U; + auto end = descriptor.find('.'); + while (end != std::string::npos) { + tokens.push_back(descriptor.substr(start, end - start)); + start = end + 1; + end = descriptor.find('.', start); + } + tokens.push_back(descriptor.substr(start)); +} +} // namespace clp_s diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp new file mode 100644 index 000000000..1cc7a4a4d --- /dev/null +++ b/components/core/src/clp_s/Utils.hpp @@ -0,0 +1,273 @@ +#ifndef CLP_S_UTILS_HPP +#define CLP_S_UTILS_HPP + +#include +#include + +#include + +namespace clp_s { +class FileUtils { +public: + /** + * Find all files in a directory + * @param path + * @param file_paths + * @return true if successful, false otherwise + */ + static bool find_all_files(std::string const& path, std::vector& file_paths); + + /** + * Validate if all paths exist + * @param paths + * @return true if all paths exist, false otherwise + */ + static bool validate_path(std::vector const& paths); +}; + +class StringUtils { +public: + /** + * Checks if the given character is an alphabet + * @param c + * @return true if c is an alphabet, false otherwise + */ + static inline bool is_alphabet(char c) { + return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); + } + + /** + * Checks if character is a decimal (base-10) digit + * @param c + * @return true if c is a decimal digit, false otherwise + */ + static inline bool is_decimal_digit(char c) { return '0' <= c && c <= '9'; } + + /** + * Checks if character is a delimiter, i.e., any character other than "+-./0-9A-Z\_a-z" + * @param c + * @return true if c is a delimiter, false otherwise + */ + static inline bool is_delim(char c) { + return !( + '+' == c || ('-' <= c && c <= '9') || ('A' <= c && c <= 'Z') || '\\' == c + || '_' == c || ('a' <= c && c <= 'z') + ); + } + + /** + * Checks if the string could be a hexadecimal value + * @param str + * @param begin_pos + * @param end_pos + * @return true if str could be a hexadecimal value, false
otherwise + */ + static inline bool + could_be_multi_digit_hex_value(std::string const& str, size_t begin_pos, size_t end_pos) { + if (end_pos - begin_pos < 2) { + return false; + } + + for (size_t i = begin_pos; i < end_pos; ++i) { + auto c = str[i]; + if (false + == (('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') || ('0' <= c && c <= '9'))) + { + return false; + } + } + + return true; + } + + /** + * Returns bounds of next variable in given string + * A variable is a token (word between two delimiters) that contains numbers or is directly + * preceded by an equals sign + * @param msg + * @param begin_pos Begin position of last variable, changes to begin position of next variable + * @param end_pos End position of last variable, changes to end position of next variable + * @return true if a variable was found, false otherwise + */ + static bool get_bounds_of_next_var(std::string const& msg, size_t& begin_pos, size_t& end_pos); + + /** + * Searches haystack starting at the given position for one of the given needles + * @param haystack + * @param needles + * @param search_start_pos + * @param needle_ix The index of the needle found + * @return The position of the match or string::npos if none + */ + static size_t find_first_of( + std::string const& haystack, + char const* needles, + size_t search_start_pos, + size_t& needle_ix + ); + + /** + * Replaces the given characters in the given value with the given replacements + * @param characters_to_escape + * @param replacement_characters + * @param value + * @param escape Whether to precede the replacement with a '\' (e.g., so that a + * line-feed character is output as "\n") + * @return The string with replacements + */ + static std::string replace_characters( + char const* characters_to_escape, + char const* replacement_characters, + std::string const& value, + bool escape + ); + + /** + * Converts a string to lowercase + * @param str + */ + static void to_lower(std::string& str); + + /** + * Cleans wildcard search 
string + * <ul> + * <li>Removes consecutive '*'</li> + * <li>Removes escaping from non-wildcard characters</li> + * <li>Removes dangling escape character from the end of the string</li> + * </ul>
+ * @param str Wildcard search string to clean + * @return Cleaned wildcard search string + */ + static std::string clean_up_wildcard_search_string(std::string_view str); + + /** + * Checks if character is a wildcard + * @param c + * @return true if c is a wildcard, false otherwise + */ + static bool is_wildcard(char c); + + /** + * Same as ``wildcard_match_unsafe_case_sensitive`` except this method + * allows the caller to specify whether the match should be case sensitive. + * + * @param tame The literal string + * @param wild The wildcard string + * @param case_sensitive_match Whether to consider case when matching + * @return Whether the two strings match + */ + static bool wildcard_match_unsafe( + std::string_view tame, + std::string_view wild, + bool case_sensitive_match = true + ); + + /** + * Checks if a string matches a wildcard string. Two wildcards are currently + * supported: '*' to match 0 or more characters, and '?' to match any single + * character. Each can be escaped using a preceding '\'. Other characters which + * are escaped are treated as normal characters. + *
+ * This method is optimized for performance by omitting some checks on the + * wildcard string that are unnecessary if the caller cleans up the wildcard + * string as follows: + * <ul> + * <li>The wildcard string should not contain consecutive '*'.</li> + * <li>The wildcard string should not contain an escape character without a + * character following it.</li> + * </ul>
+ * + * @param tame The literal string + * @param wild The wildcard string + * @return Whether the two strings match + */ + static bool wildcard_match_unsafe_case_sensitive(std::string_view tame, std::string_view wild); + + /** + * Converts the given string to a 64-bit integer if possible + * @param raw + * @param converted + * @return true if the conversion was successful, false otherwise + */ + static bool convert_string_to_int64(std::string_view raw, int64_t& converted); + + /** + * Converts the given string to a double if possible + * @param raw + * @param converted + * @return true if the conversion was successful, false otherwise + */ + static bool convert_string_to_double(std::string const& raw, double& converted); + + /** + * Converts a string column descriptor delimited by '.' into a list of tokens + * @param descriptor + * @param tokens + * @return the list of tokens pushed into the 'tokens' parameter + */ + static void + tokenize_column_descriptor(std::string const& descriptor, std::vector& tokens); + +private: + /** + * Helper for ``wildcard_match_unsafe_case_sensitive`` to advance the + * pointer in tame to the next character which matches wild. This method + * should be inlined for performance. 
+ * @param tame_current + * @param tame_bookmark + * @param tame_end + * @param wild_current + * @param wild_bookmark + * @return true on success, false if wild cannot match tame + */ + static inline bool advance_tame_to_next_match( + char const*& tame_current, + char const*& tame_bookmark, + char const* tame_end, + char const*& wild_current, + char const*& wild_bookmark + ); +}; + +enum EvaluatedValue { + True, + False, + Unknown +}; + +template <typename T2, typename T1> +inline T2 bit_cast(T1 t1) { + static_assert(sizeof(T1) == sizeof(T2), "Must match size"); + static_assert(std::is_standard_layout<T1>::value, "Need to be standard layout"); + static_assert(std::is_standard_layout<T2>::value, "Need to be standard layout"); + + T2 t2; + std::memcpy(std::addressof(t2), std::addressof(t1), sizeof(T1)); + return t2; +} + +/** + * A non-owning view over `size` contiguous elements starting at `begin` + * @tparam T + */ +template <typename T> +class Span { +public: + Span() = default; + Span(T* begin, size_t size) : m_begin(begin), m_size(size) {} + + T* begin() { return m_begin; } + + T* end() { return m_begin + m_size; } + + size_t size() { return m_size; } + + T& operator[](size_t i) { return m_begin[i]; } + +private: + T* m_begin{nullptr}; // Null (not indeterminate) for a default-constructed, empty span + size_t m_size{}; +}; +} // namespace clp_s + +#endif // CLP_S_UTILS_HPP diff --git a/components/core/src/clp_s/VariableDecoder.cpp b/components/core/src/clp_s/VariableDecoder.cpp new file mode 100644 index 000000000..ff91a87bb --- /dev/null +++ b/components/core/src/clp_s/VariableDecoder.cpp @@ -0,0 +1,118 @@ +// Code from CLP + +#include "VariableDecoder.hpp" + +namespace clp_s { +bool VariableDecoder::decode_variables_into_message( + LogTypeDictionaryEntry const& logtype_dict_entry, + VariableDictionaryReader const& var_dict, + Span encoded_vars, + std::string& decompressed_msg +) { + size_t num_vars_in_logtype = logtype_dict_entry.get_num_vars(); + + // Ensure the number of variables in the logtype matches the number of encoded variables given + auto const& logtype_value = logtype_dict_entry.get_value(); + if (num_vars_in_logtype !=
encoded_vars.size()) { + SPDLOG_ERROR( + "VariableDecoder: Logtype '{}' contains {} variables, but {} were given for " + "decoding.", + logtype_value.c_str(), + num_vars_in_logtype, + encoded_vars.size() + ); + return false; + } + + LogTypeDictionaryEntry::VarDelim var_delim; + size_t constant_begin_pos = 0; + std::string double_str; + for (size_t i = 0; i < num_vars_in_logtype; ++i) { + size_t var_position = logtype_dict_entry.get_var_info(i, var_delim); + + // Add the constant that's between the last variable and this one + decompressed_msg + .append(logtype_value, constant_begin_pos, var_position - constant_begin_pos); + + if (LogTypeDictionaryEntry::VarDelim::NonDouble == var_delim) { + if (false == is_var_dict_id(encoded_vars[i])) { + decompressed_msg += std::to_string(encoded_vars[i]); + } else { + auto var_dict_id = decode_var_dict_id(encoded_vars[i]); + decompressed_msg += var_dict.get_value(var_dict_id); + } + } else { // LogTypeDictionaryEntry::VarDelim::Double == var_delim + convert_encoded_double_to_string(encoded_vars[i], double_str); + + decompressed_msg += double_str; + } + // Move past the variable delimiter + constant_begin_pos = var_position + 1; + } + // Append remainder of logtype, if any + if (constant_begin_pos < logtype_value.length()) { + decompressed_msg.append(logtype_value, constant_begin_pos, std::string::npos); + } + + return true; +} + +void VariableDecoder::convert_encoded_double_to_string(int64_t encoded_var, std::string& value) { + uint64_t encoded_double; + static_assert( + sizeof(encoded_double) == sizeof(encoded_var), + "sizeof(encoded_double) != sizeof(encoded_var)" + ); + // NOTE: We use memcpy rather than reinterpret_cast to avoid violating strict aliasing; a smart + // compiler should optimize it to a register move + std::memcpy(&encoded_double, &encoded_var, sizeof(encoded_var)); + + // Decode according to the format described in + // VariableDecoder::convert_string_to_representable_double_var + uint64_t digits = 
encoded_double & 0x003F'FFFF'FFFF'FFFF; + encoded_double >>= 55; + uint8_t decimal_pos = (encoded_double & 0x0F) + 1; + encoded_double >>= 4; + uint8_t num_digits = (encoded_double & 0x0F) + 1; + encoded_double >>= 4; + bool is_negative = encoded_double > 0; + + size_t value_length = num_digits + 1 + is_negative; + value.resize(value_length); + size_t num_chars_to_process = value_length; + + // Add sign + if (is_negative) { + value[0] = '-'; + --num_chars_to_process; + } + + // Decode until the decimal or the non-zero digits are exhausted + size_t pos = value_length - 1; + for (; pos > (value_length - 1 - decimal_pos) && digits > 0; --pos) { + value[pos] = (char)('0' + (digits % 10)); + digits /= 10; + --num_chars_to_process; + } + + if (digits > 0) { + // Skip decimal since it's added at the end + --pos; + --num_chars_to_process; + + while (digits > 0) { + value[pos--] = (char)('0' + (digits % 10)); + digits /= 10; + --num_chars_to_process; + } + } + + // Add remaining zeros + for (; num_chars_to_process > 0; --num_chars_to_process) { + value[pos--] = '0'; + } + + // Add decimal + value[value_length - 1 - decimal_pos] = '.'; +} +} // namespace clp_s diff --git a/components/core/src/clp_s/VariableDecoder.hpp b/components/core/src/clp_s/VariableDecoder.hpp new file mode 100644 index 000000000..f99a08dad --- /dev/null +++ b/components/core/src/clp_s/VariableDecoder.hpp @@ -0,0 +1,61 @@ +// Code from CLP + +#ifndef CLP_S_VARIABLEDECODER_HPP +#define CLP_S_VARIABLEDECODER_HPP + +#include "DictionaryEntry.hpp" +#include "DictionaryReader.hpp" +#include "Utils.hpp" + +namespace clp_s { +class VariableDecoder { +public: + /** + * Decode variables into a message + * @param logtype_dict_entry + * @param var_dict + * @param encoded_vars + * @param decompressed_msg + */ + static bool decode_variables_into_message( + LogTypeDictionaryEntry const& logtype_dict_entry, + VariableDictionaryReader const& var_dict, + Span encoded_vars, + std::string& decompressed_msg + ); + +private: + /** + *
Convert an encoded double into a string + * @param encoded_var The 64-bit encoding of the double, in the format written by + * VariableEncoder::convert_string_to_representable_double_var + * @param value Returns the decoded string; its previous contents are + * overwritten + */ + static void convert_encoded_double_to_string(int64_t encoded_var, std::string& value); + + /** + * Checks if the given encoded variable is a variable dictionary id + * @param encoded_var + * @return true if encoded_var is a variable dictionary id, false otherwise + */ + static bool is_var_dict_id(int64_t encoded_var) { + return (cVarDictIdRangeBegin <= encoded_var && encoded_var < cVarDictIdRangeEnd); + } + + /** + * Decodes the given variable dictionary id + * @param encoded_var + * @return the decoded id + */ + static uint64_t decode_var_dict_id(int64_t encoded_var) { + uint64_t id = encoded_var - cVarDictIdRangeBegin; + return id; + } + + static constexpr int64_t cVarDictIdRangeBegin = 1LL << 62; + static constexpr int64_t cVarDictIdRangeEnd = (1ULL << 63) - 1; +}; +} // namespace clp_s + +#endif // CLP_S_VARIABLEDECODER_HPP diff --git a/components/core/src/clp_s/VariableEncoder.cpp b/components/core/src/clp_s/VariableEncoder.cpp new file mode 100644 index 000000000..169b3da3a --- /dev/null +++ b/components/core/src/clp_s/VariableEncoder.cpp @@ -0,0 +1,184 @@ +// Code from CLP + +#include "VariableEncoder.hpp" + +namespace clp_s { +void VariableEncoder::encode_and_add_to_dictionary( + std::string const& message, + LogTypeDictionaryEntry& logtype_dict_entry, + VariableDictionaryWriter& var_dict, + std::vector& encoded_vars +) { + // Extract all variables and add to dictionary while building logtype + size_t var_begin_pos = 0; + size_t var_end_pos = 0; + std::string var_str; + logtype_dict_entry.clear(); + // To avoid reallocating the logtype as we append to it, reserve enough space to hold the entire + // message + logtype_dict_entry.reserve_constant_length(message.length()); + while (logtype_dict_entry.parse_next_var(message, var_begin_pos, var_end_pos, var_str)) { + // Encode variable + int64_t encoded_var; + if
(convert_string_to_representable_integer_var(var_str, encoded_var)) { + logtype_dict_entry.add_non_double_var(); + } else if (convert_string_to_representable_double_var(var_str, encoded_var)) { + logtype_dict_entry.add_double_var(); + } else { + // Variable string looks like a dictionary variable, so encode it as so + uint64_t id; + var_dict.add_entry(var_str, id); + encoded_var = encode_var_dict_id(id); + + logtype_dict_entry.add_non_double_var(); + } + + encoded_vars.push_back(encoded_var); + } +} + +bool VariableEncoder::convert_string_to_int64(std::string const& raw, int64_t& converted) { + if (raw.empty()) { + // Can't convert an empty string + return false; + } + + char const* c_str = raw.c_str(); + char* endptr; + // Reset errno so we can detect if it's been set + errno = 0; + int64_t raw_as_int = strtoll(c_str, &endptr, 10); + if (endptr - c_str != raw.length() || (LLONG_MAX == raw_as_int && ERANGE == errno)) { + // Conversion failed + return false; + } + converted = raw_as_int; + return true; +} + +bool VariableEncoder::convert_string_to_representable_integer_var( + std::string const& value, + int64_t& encoded_var +) { + size_t length = value.length(); + if (0 == length) { + // Empty string cannot be converted + return false; + } + + // Ensure start of value is an integer with no zero-padding or positive sign + if ('-' == value[0]) { + // Ensure first character after sign is a non-zero integer + if (length < 2 || value[1] < '1' || '9' < value[1]) { + return false; + } + } else { + // Ensure first character is a digit + if (value[0] < '0' || '9' < value[0]) { + return false; + } + + // Ensure value is not zero-padded + if (length > 1 && '0' == value[0]) { + return false; + } + } + + int64_t result; + // Conversion failed or value is in dictionary variable range, so cannot be converted + if (false == convert_string_to_int64(value, result) || result >= cVarDictIdRangeBegin) { + return false; + } else { + encoded_var = result; + } + + return true; +} + +bool 
VariableEncoder::convert_string_to_representable_double_var(
+        std::string const& value,
+        int64_t& encoded_var
+) {
+    if (value.empty()) {
+        // Can't convert an empty string
+        return false;
+    }
+
+    size_t pos = 0;
+    constexpr size_t cMaxDigitsInRepresentableDoubleVar = 16;
+    // +1 for decimal point
+    size_t max_length = cMaxDigitsInRepresentableDoubleVar + 1;
+
+    // Check for a negative sign
+    bool is_negative = false;
+    if ('-' == value[pos]) {
+        is_negative = true;
+        ++pos;
+        // Include sign in max length
+        ++max_length;
+    }
+
+    // Reject values too long to be 16 digits plus a decimal point (and a sign, if present)
+    if (value.length() > max_length) {
+        return false;
+    }
+
+    size_t num_digits = 0;
+    size_t decimal_point_pos = std::string::npos;
+    uint64_t digits = 0;
+    for (; pos < value.length(); ++pos) {
+        auto c = value[pos];
+        if ('0' <= c && c <= '9') {
+            digits *= 10;
+            digits += (c - '0');
+            ++num_digits;
+        } else if (std::string::npos == decimal_point_pos && '.' == c) {
+            decimal_point_pos = value.length() - 1 - pos;
+        } else {
+            // Invalid character (this includes a second decimal point)
+            return false;
+        }
+    }
+    if (std::string::npos == decimal_point_pos || 0 == decimal_point_pos || 0 == num_digits) {
+        // No decimal point found, decimal point is after all digits, or no digits found
+        return false;
+    }
+
+    // Encode into 64 bits with the following format (from MSB to LSB):
+    // - 1 bit : is negative
+    // - 4 bits: # of decimal digits minus 1
+    //   - This format can represent doubles with between 1 and 16 decimal digits, so we use 4
+    //     bits and map the range [1, 16] to [0x0, 0xF]
+    // - 4 bits: position of the decimal from the right minus 1
+    //   - To see why the position is taken from the right, consider (1) "-123456789012345.6", (2)
+    //     "-.1234567890123456", and (3) ".1234567890123456"
+    //     - For (1), the decimal point is at index 16 from the left and index 1 from the right.
+    //     - For (2), the decimal point is at index 1 from the left and index 16 from the right.
+    //     - For (3), the decimal point is at index 0 from the left and index 16 from the right.
+    //     - So if we take the decimal position from the left, it can range from 0 to 16 because
+    //       of the negative sign. Whereas from the right, the
+    //       negative sign is inconsequential.
+    //   - Thus, we use 4 bits and map the range [1, 16] to [0x0, 0xF].
+    // - 1 bit : unused
+    // - 54 bits: The digits of the double without the decimal, as an integer
+    uint64_t encoded_double = 0;
+    if (is_negative) {
+        encoded_double = 1;
+    }
+    encoded_double <<= 4;
+    encoded_double |= (num_digits - 1) & 0x0F;
+    encoded_double <<= 4;
+    encoded_double |= (decimal_point_pos - 1) & 0x0F;
+    encoded_double <<= 55;
+    encoded_double |= digits & 0x003F'FFFF'FFFF'FFFF;
+    static_assert(
+            sizeof(encoded_var) == sizeof(encoded_double),
+            "sizeof(encoded_var) != sizeof(encoded_double)"
+    );
+    // NOTE: We use memcpy rather than reinterpret_cast to avoid violating strict aliasing; a smart
+    // compiler should optimize it to a register move
+    std::memcpy(&encoded_var, &encoded_double, sizeof(encoded_double));
+
+    return true;
+}
+}  // namespace clp_s
diff --git a/components/core/src/clp_s/VariableEncoder.hpp b/components/core/src/clp_s/VariableEncoder.hpp
new file mode 100644
index 000000000..d604b7c0d
--- /dev/null
+++ b/components/core/src/clp_s/VariableEncoder.hpp
@@ -0,0 +1,71 @@
+// Code from CLP
+
+#ifndef CLP_S_VARIABLEENCODER_HPP
+#define CLP_S_VARIABLEENCODER_HPP
+
+#include
+
+#include
+
+#include "DictionaryEntry.hpp"
+#include "DictionaryWriter.hpp"
+
+using namespace simdjson;
+
+namespace clp_s {
+class VariableEncoder {
+public:
+    /**
+     * Encodes the given message and adds the encoded variables to the given vector
+     * @param message The message to encode
+     * @param logtype_dict_entry Cleared, then rebuilt while the message is parsed
+     * @param var_dict Receives every variable that is neither a representable integer nor double
+     * @param encoded_vars Receives one encoded value per variable found in the message
+     */
+    static void encode_and_add_to_dictionary(
+            std::string const& message,
+            LogTypeDictionaryEntry& logtype_dict_entry,
+            VariableDictionaryWriter& var_dict,
+            std::vector& encoded_vars
+    );
+
+    /**
+     * Converts the given string to an int64_t
+     * @param raw The string to convert; the conversion must consume all of it
+     * @param converted Returns the converted value; left untouched on failure
+     * @return true if the conversion was successful, false otherwise
+     */
+    static bool convert_string_to_int64(std::string const& raw, int64_t& converted);
+
+    /**
+     * Converts the given string to a representable int64_t
+     * @param value Must be a plain integer without zero-padding or a leading '+'
+     * @param encoded_var Returns the integer; left untouched on failure
+     * @return true if the conversion was successful, false otherwise
+     */
+    static bool
+    convert_string_to_representable_integer_var(std::string const& value, int64_t& encoded_var);
+
+    /**
+     * Converts the given string to a representable encoded double
+     * @param value At most 16 digits with one non-trailing decimal point and an optional '-'
+     * @param encoded_var Returns the encoded double; left untouched on failure
+     * @return true if the conversion was successful, false otherwise
+     */
+    static bool
+    convert_string_to_representable_double_var(std::string const& value, int64_t& encoded_var);
+
+    /**
+     * Encodes the given dictionary id as a variable dictionary id
+     * @param id
+     * @return the encoded id
+     */
+    static int64_t encode_var_dict_id(uint64_t id) { return (int64_t)id + cVarDictIdRangeBegin; }
+
+private:
+    static constexpr int64_t cVarDictIdRangeBegin = 1LL << 62;
+    static constexpr int64_t cVarDictIdRangeEnd = (1ULL << 63) - 1;
+};
+}  // namespace clp_s
+
+#endif  // CLP_S_VARIABLEENCODER_HPP
diff --git a/components/core/src/clp_s/ZstdCompressor.cpp b/components/core/src/clp_s/ZstdCompressor.cpp
new file mode 100644
index 000000000..8bfba6167
--- /dev/null
+++ b/components/core/src/clp_s/ZstdCompressor.cpp
@@ -0,0 +1,120 @@
+// Code from CLP
+
+#include "ZstdCompressor.hpp"
+
+namespace clp_s {
+ZstdCompressor::ZstdCompressor()
+        : Compressor(CompressorType::ZSTD),
+          m_compressed_stream_file_writer(nullptr),
+          m_compression_stream_contains_data(false) {
+    m_compression_stream = ZSTD_createCStream();
+    if (nullptr == m_compression_stream) {
+        SPDLOG_ERROR("ZstdCompressor: ZSTD_createCStream() error");
+        throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__);
+    }
+}
+
+ZstdCompressor::~ZstdCompressor() {
+    ZSTD_freeCStream(m_compression_stream);
+}
+
+void ZstdCompressor::open(FileWriter& file_writer, int const compression_level) {
+    if (nullptr != m_compressed_stream_file_writer) {
+        throw OperationFailed(ErrorCodeNotReady, __FILENAME__, __LINE__);
+    }
+
+    // Setup compressed stream parameters
+    size_t compressed_stream_block_size = ZSTD_CStreamOutSize();
+    m_compressed_stream_block_buffer = std::make_unique(compressed_stream_block_size);
+    m_compressed_stream_block.dst = m_compressed_stream_block_buffer.get();
+    m_compressed_stream_block.size = compressed_stream_block_size;
+
+    // Setup compression stream
+    auto init_result = ZSTD_initCStream(m_compression_stream, compression_level);
+    if (ZSTD_isError(init_result)) {
+        SPDLOG_ERROR(
+                "ZstdCompressor: ZSTD_initCStream() error: {}",
+                ZSTD_getErrorName(init_result)
+        );
+        throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__);
+    }
+
+    m_compressed_stream_file_writer = &file_writer;
+
+    m_uncompressed_stream_pos = 0;
+}
+
+void ZstdCompressor::close() {
+    if (nullptr == m_compressed_stream_file_writer) {
+        throw OperationFailed(ErrorCodeNotInit, __FILENAME__, __LINE__);
+    }
+
+    flush();
+    m_compressed_stream_file_writer = nullptr;
+}
+
+void ZstdCompressor::write(char const* data, size_t data_length) {
+    if (nullptr == m_compressed_stream_file_writer) {
+        throw OperationFailed(ErrorCodeNotInit, __FILENAME__, __LINE__);
+    }
+
+    if (0 == data_length) {
+        // Nothing needs to be done because we do not need to compress anything
+        return;
+    }
+    if (nullptr == data) {
+        throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__);
+    }
+
+    ZSTD_inBuffer uncompressed_stream_block = {data, data_length, 0};
+    while (uncompressed_stream_block.pos < uncompressed_stream_block.size) {
+        m_compressed_stream_block.pos = 0;
+        auto error = ZSTD_compressStream(
+                m_compression_stream,
+                &m_compressed_stream_block,
+                &uncompressed_stream_block
+        );
+        if (ZSTD_isError(error)) {
+            SPDLOG_ERROR(
+                    "ZstdCompressor: ZSTD_compressStream() error: {}",
+                    ZSTD_getErrorName(error)
+            );
+            throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__);
+        }
+        if (m_compressed_stream_block.pos) {
+            // Write to disk only if there is data in the compressed stream block buffer
+            m_compressed_stream_file_writer->write(
+                    reinterpret_cast(m_compressed_stream_block.dst),
+                    m_compressed_stream_block.pos
+            );
+        }
+    }
+
+    m_compression_stream_contains_data = true;
+    m_uncompressed_stream_pos += data_length;
+}
+
+void ZstdCompressor::flush() {
+    if (false == m_compression_stream_contains_data) {
+        return;
+    }
+
+    m_compressed_stream_block.pos = 0;
+    auto end_stream_result = ZSTD_endStream(m_compression_stream, &m_compressed_stream_block);
+    if (end_stream_result) {
+        // Note: The output buffer is sized with ZSTD_CStreamOutSize(), which is expected to leave
+        // enough room to flush everything at once, so any non-zero result is treated as an error
+        SPDLOG_ERROR(
+                "ZstdCompressor: ZSTD_endStream() error: {}",
+                ZSTD_getErrorName(end_stream_result)
+        );
+        throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__);
+    }
+    m_compressed_stream_file_writer->write(
+            reinterpret_cast(m_compressed_stream_block.dst),
+            m_compressed_stream_block.pos
+    );
+
+    m_compression_stream_contains_data = false;
+}
+}  // namespace clp_s
diff --git a/components/core/src/clp_s/ZstdCompressor.hpp b/components/core/src/clp_s/ZstdCompressor.hpp
new file mode 100644
index 000000000..4104571c7
--- /dev/null
+++ b/components/core/src/clp_s/ZstdCompressor.hpp
@@ -0,0 +1,98 @@
+// Code from CLP
+
+#ifndef CLP_S_ZSTDCOMPRESSOR_HPP
+#define CLP_S_ZSTDCOMPRESSOR_HPP
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include "Compressor.hpp"
+#include "FileWriter.hpp"
+#include "TraceableException.hpp"
+
+namespace clp_s {
+constexpr int cDefaultCompressionLevel = 3;
+
+class ZstdCompressor : public Compressor {
+public:
+    // Types
+    class OperationFailed : public TraceableException {
+    public:
+        // Constructors
+        OperationFailed(ErrorCode error_code, char const* const filename, int line_number)
+                : TraceableException(error_code, filename, line_number) {}
+    };
+
+    // Constructor
+    ZstdCompressor();
+
+    // Destructor
+    ~ZstdCompressor() override;
+
+    // Explicitly disable copy and move constructor/assignment
+    ZstdCompressor(ZstdCompressor const&) = delete;
+
+    ZstdCompressor& operator=(ZstdCompressor const&) = delete;
+
+    // Methods implementing the WriterInterface
+    /**
+     * Writes the given data to the compressor
+     * @param data Buffer to compress; may only be null when data_length is 0
+     * @param data_length Number of bytes in data
+     */
+    void write(char const* data, size_t data_length);
+
+    /**
+     * Writes the given numeric value to the compressor
+     * @param val Value whose in-memory bytes are written
+     * @tparam ValueType
+     */
+    template
+    void write_numeric_value(ValueType val) {
+        write(reinterpret_cast(&val), sizeof(val));
+    }
+
+    /**
+     * Writes the given string to the compressor
+     * @param str String whose characters are written, without a length prefix or terminator
+     */
+    void write_string(std::string const& str) { write(str.c_str(), str.length()); }
+
+    /**
+     * Writes any internally buffered data to file and ends the current frame
+     */
+    void flush();
+
+    // Methods implementing the Compressor interface
+    /**
+     * Flushes any buffered data and detaches the compressor from its file writer
+     */
+    void close() override;
+
+    /**
+     * Initialize streaming compressor
+     * @param file_writer Writer that receives the compressed stream
+     * @param compression_level Compression level passed to zstd
+     */
+    void open(FileWriter& file_writer, int compression_level = cDefaultCompressionLevel);
+
+private:
+    // Variables
+    FileWriter* m_compressed_stream_file_writer{};
+
+    // Compressed stream variables
+    ZSTD_CStream* m_compression_stream;
+    bool m_compression_stream_contains_data;
+
+    ZSTD_outBuffer m_compressed_stream_block{};
+    std::unique_ptr m_compressed_stream_block_buffer;
+
+    size_t m_uncompressed_stream_pos{};
+};
+}  // namespace clp_s
+
+#endif  // CLP_S_ZSTDCOMPRESSOR_HPP
diff --git a/components/core/src/clp_s/ZstdDecompressor.cpp b/components/core/src/clp_s/ZstdDecompressor.cpp
new file mode 100644
index 000000000..ee1632732
--- /dev/null
+++ b/components/core/src/clp_s/ZstdDecompressor.cpp
@@ -0,0 +1,238 @@
+// Code
from CLP
+
+#include "ZstdDecompressor.hpp"
+
+#include
+
+#include
+#include
+
+namespace clp_s {
+ZstdDecompressor::ZstdDecompressor()
+        : Decompressor(CompressorType::ZSTD),
+          m_input_type(InputType::NotInitialized),
+          m_decompression_stream(nullptr),
+          m_file_reader(nullptr),
+          m_file_reader_initial_pos(0),
+          m_file_read_buffer_length(0),
+          m_file_read_buffer_capacity(0),
+          m_decompressed_stream_pos(0),
+          m_unused_decompressed_stream_block_size(0) {
+    m_decompression_stream = ZSTD_createDStream();
+    if (nullptr == m_decompression_stream) {
+        SPDLOG_ERROR("ZstdDecompressor: ZSTD_createDStream() error");
+        throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__);
+    }
+
+    // Create block to hold unused decompressed data
+    m_unused_decompressed_stream_block_size = ZSTD_DStreamOutSize();
+    m_unused_decompressed_stream_block_buffer
+            = std::make_unique(m_unused_decompressed_stream_block_size);
+}
+
+ZstdDecompressor::~ZstdDecompressor() {
+    ZSTD_freeDStream(m_decompression_stream);
+}
+
+ErrorCode
+ZstdDecompressor::try_read(char const* buf, size_t num_bytes_to_read, size_t& num_bytes_read) {
+    if (InputType::NotInitialized == m_input_type) {
+        throw OperationFailed(ErrorCodeNotInit, __FILENAME__, __LINE__);
+    }
+    if (nullptr == buf) {
+        throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__);
+    }
+
+    num_bytes_read = 0;
+
+    ZSTD_outBuffer decompressed_stream_block = {(void*)buf, num_bytes_to_read, 0};
+    while (decompressed_stream_block.pos < num_bytes_to_read) {
+        // Check if there's data that can be decompressed
+        if (m_compressed_stream_block.pos == m_compressed_stream_block.size) {
+            switch (m_input_type) {
+                case InputType::CompressedDataBuf:
+                    // Fall through
+                case InputType::MemoryMappedCompressedFile:
+                    // No compressed input is left. Report whatever was decompressed so far and
+                    // advance the stream position by it, exactly as a full read does below; an
+                    // empty read is reported as end-of-file.
+                    num_bytes_read = decompressed_stream_block.pos;
+                    m_decompressed_stream_pos += num_bytes_read;
+                    return (0 == num_bytes_read) ? ErrorCodeEndOfFile : ErrorCodeSuccess;
+                case InputType::File: {
+                    auto error_code = m_file_reader->try_read(
+                            reinterpret_cast(m_file_read_buffer.get()),
+                            m_file_read_buffer_capacity,
+                            m_file_read_buffer_length
+                    );
+                    if (ErrorCodeSuccess != error_code) {
+                        if (ErrorCodeEndOfFile == error_code) {
+                            // The file is exhausted: handle it like the in-memory inputs above,
+                            // counting a partially filled read toward the stream position and
+                            // reporting an empty read as end-of-file.
+                            num_bytes_read = decompressed_stream_block.pos;
+                            m_decompressed_stream_pos += num_bytes_read;
+                            return (0 == num_bytes_read) ? ErrorCodeEndOfFile : ErrorCodeSuccess;
+                        } else {
+                            return error_code;
+                        }
+                    }
+
+                    m_compressed_stream_block.pos = 0;
+                    m_compressed_stream_block.size = m_file_read_buffer_length;
+                    break;
+                }
+                default:
+                    throw OperationFailed(ErrorCodeUnsupported, __FILENAME__, __LINE__);
+            }
+        }
+
+        // Decompress
+        size_t error = ZSTD_decompressStream(
+                m_decompression_stream,
+                &decompressed_stream_block,
+                &m_compressed_stream_block
+        );
+        if (ZSTD_isError(error)) {
+            SPDLOG_ERROR(
+                    "ZstdDecompressor: ZSTD_decompressStream() error: {}",
+                    ZSTD_getErrorName(error)
+            );
+            return ErrorCodeFailure;
+        }
+    }
+
+    // Update decompression stream position
+    m_decompressed_stream_pos += decompressed_stream_block.pos;
+
+    num_bytes_read = decompressed_stream_block.pos;
+    return ErrorCodeSuccess;
+}
+
+ErrorCode ZstdDecompressor::try_read_string(size_t str_length, std::string& str) {
+    str.resize(str_length);
+
+    return try_read_exact_length(&str[0], str_length);
+}
+
+ErrorCode ZstdDecompressor::try_read_exact_length(char* buf, size_t num_bytes) {
+    size_t num_bytes_read;
+    auto error_code = try_read(buf, num_bytes, num_bytes_read);
+    if (ErrorCodeSuccess != error_code) {
+        return error_code;
+    }
+    if (num_bytes_read < num_bytes) {
+        return ErrorCodeTruncated;
+    }
+
+    return ErrorCodeSuccess;
+}
+
+void ZstdDecompressor::open(char const* compressed_data_buf, size_t compressed_data_buf_size) {
+    if (InputType::NotInitialized != m_input_type) {
+        throw OperationFailed(ErrorCodeNotReady, __FILENAME__, __LINE__);
+    }
+    m_input_type = InputType::CompressedDataBuf;
+
+    m_compressed_stream_block = {compressed_data_buf, compressed_data_buf_size, 0};
+
+    reset_stream();
+}
+
+void
ZstdDecompressor::open(FileReader& file_reader, size_t file_read_buffer_capacity) {
+    if (InputType::NotInitialized != m_input_type) {
+        throw OperationFailed(ErrorCodeNotReady, __FILENAME__, __LINE__);
+    }
+    m_input_type = InputType::File;
+
+    m_file_reader = &file_reader;
+    m_file_reader_initial_pos = m_file_reader->get_pos();
+
+    m_file_read_buffer_capacity = file_read_buffer_capacity;
+    m_file_read_buffer = std::make_unique(m_file_read_buffer_capacity);
+    m_file_read_buffer_length = 0;
+
+    m_compressed_stream_block = {m_file_read_buffer.get(), m_file_read_buffer_length, 0};
+
+    reset_stream();
+}
+
+void ZstdDecompressor::close() {
+    switch (m_input_type) {
+        case InputType::MemoryMappedCompressedFile:
+            if (m_memory_mapped_compressed_file.is_open()) {
+                // An existing file is memory mapped by the decompressor
+                m_memory_mapped_compressed_file.close();
+            }
+            break;
+        case InputType::File:
+            m_file_read_buffer.reset();
+            m_file_read_buffer_capacity = 0;
+            m_file_read_buffer_length = 0;
+            m_file_reader = nullptr;
+            break;
+        case InputType::CompressedDataBuf:
+        case InputType::NotInitialized:
+            // Do nothing
+            break;
+        default:
+            throw OperationFailed(ErrorCodeUnsupported, __FILENAME__, __LINE__);
+    }
+    m_input_type = InputType::NotInitialized;
+}
+
+ErrorCode ZstdDecompressor::open(std::string const& compressed_file_path) {
+    if (InputType::NotInitialized != m_input_type) {
+        throw OperationFailed(ErrorCodeNotReady, __FILENAME__, __LINE__);
+    }
+
+    // Create a read-only boost memory mapping for compressed_file_path. m_input_type is only set
+    // once the mapping has succeeded so that a failed open leaves the decompressor closed.
+    boost::system::error_code boost_error_code;
+    size_t compressed_file_size
+            = boost::filesystem::file_size(compressed_file_path, boost_error_code);
+    if (boost_error_code) {
+        SPDLOG_ERROR(
+                "ZstdDecompressor: Unable to obtain file size for '{}' - {}.",
+                compressed_file_path.c_str(),
+                boost_error_code.message().c_str()
+        );
+        return ErrorCodeFailure;
+    }
+
+    boost::iostreams::mapped_file_params memory_map_params;
+    memory_map_params.path = compressed_file_path;
+    memory_map_params.flags = boost::iostreams::mapped_file::readonly;
+    memory_map_params.length = compressed_file_size;
+    memory_map_params.hint = m_memory_mapped_compressed_file.data(
+    );  // Try to map it to the same memory location as previous memory mapped file
+    m_memory_mapped_compressed_file.open(memory_map_params);
+    if (false == m_memory_mapped_compressed_file.is_open()) {
+        SPDLOG_ERROR(
+                "ZstdDecompressor: Unable to memory map the compressed file with path: {}",
+                compressed_file_path.c_str()
+        );
+        return ErrorCodeFailure;
+    }
+
+    // Configure input stream
+    m_input_type = InputType::MemoryMappedCompressedFile;
+    m_compressed_stream_block = {m_memory_mapped_compressed_file.data(), compressed_file_size, 0};
+    reset_stream();
+
+    return ErrorCodeSuccess;
+}
+
+void ZstdDecompressor::reset_stream() {
+    if (InputType::File == m_input_type) {
+        m_file_reader->seek_from_begin(m_file_reader_initial_pos);
+        m_file_read_buffer_length = 0;
+        m_compressed_stream_block.size = m_file_read_buffer_length;
+    }
+
+    ZSTD_initDStream(m_decompression_stream);
+    m_decompressed_stream_pos = 0;
+
+    m_compressed_stream_block.pos = 0;
+}
+}  // namespace clp_s
diff --git a/components/core/src/clp_s/ZstdDecompressor.hpp b/components/core/src/clp_s/ZstdDecompressor.hpp
new file mode 100644
index 000000000..6382d54d3
--- /dev/null
+++ b/components/core/src/clp_s/ZstdDecompressor.hpp
@@ -0,0 +1,146 @@
+// Code from CLP
+
+#ifndef CLP_S_ZSTDDECOMPRESSOR_HPP
+#define CLP_S_ZSTDDECOMPRESSOR_HPP
+
+#include
+#include
+
+#include
+#include
+
+#include "Decompressor.hpp"
+#include "TraceableException.hpp"
+
+namespace clp_s {
+class ZstdDecompressor : public Decompressor {
+public:
+    // Types
+    class OperationFailed : public TraceableException {
+    public:
+        // Constructors
+        OperationFailed(ErrorCode error_code, char const* const filename, int line_number)
+                : TraceableException(error_code, filename, line_number) {}
+    };
+
+    // Constructor
+    /**
+     * @throw ZstdDecompressor::OperationFailed if the zstd decompression stream cannot be created
+     */
+    ZstdDecompressor();
+
+    // Destructor
+    ~ZstdDecompressor();
+
+    // Explicitly disable copy and move constructor/assignment
+    ZstdDecompressor(ZstdDecompressor const&) = delete;
+
+    ZstdDecompressor& operator=(ZstdDecompressor const&) = delete;
+
+    // Methods implementing the Decompressor interface
+    void open(char const* compressed_data_buf, size_t compressed_data_buf_size) override;
+
+    void open(FileReader& file_reader, size_t file_read_buffer_capacity) override;
+
+    void close() override;
+
+    // Methods
+    /***
+     * Initialize streaming decompressor to decompress from a compressed file specified by the given
+     * path
+     * @param compressed_file_path
+     * @return ErrorCodeFailure if the file's size cannot be read or it cannot be memory mapped
+     * @return ErrorCodeSuccess on success
+     * @throw OperationFailed (ErrorCodeNotReady) if the decompressor is already open
+     */
+    ErrorCode open(std::string const& compressed_file_path);
+
+    // Methods implementing the ReaderInterface
+    /**
+     * Tries to read up to a given number of bytes from the decompressor
+     * @param buf Destination buffer for the decompressed bytes
+     * @param num_bytes_to_read The number of bytes to try and read
+     * @param num_bytes_read The actual number of bytes read
+     * @return Same as FileReader::try_read if the decompressor is attached to a file
+     * @return ErrorCodeEndOfFile if the input is exhausted before any byte could be read
+     * @return ErrorCodeFailure on decompression failure
+     * @return ErrorCodeSuccess on success
+     * @throw OperationFailed (ErrorCodeNotInit) if the decompressor is not open
+     * @throw OperationFailed (ErrorCodeBadParam) if buf is nullptr
+     */
+    ErrorCode try_read(char const* buf, size_t num_bytes_to_read, size_t& num_bytes_read);
+
+    /**
+     * Tries to read a number of bytes
+     * @param buf
+     * @param num_bytes Number of bytes to read
+     * @return Same as the underlying medium's try_read method
+     * @return ErrorCodeTruncated if 0 < # bytes read < num_bytes
+     */
+    ErrorCode try_read_exact_length(char* buf, size_t num_bytes);
+
+    /**
+     * Tries to read a numeric value
+     * @tparam ValueType
+     * @param value
+     * @return Same as the underlying medium's try_read_exact_length method
+     */
+    template
+    ErrorCode try_read_numeric_value(ValueType& value);
+
+    /**
+     * Tries to read a string
+     * @param str_length length of the string to read
+     * @param str
+     * @return Same as the underlying medium's try_read_exact_length method
+     */
+    ErrorCode try_read_string(size_t str_length, std::string& str);
+
+private:
+    // Enum class
+    enum class InputType {
+        NotInitialized,  // No input is attached: set by the constructor and close(), required
+                         // by the open methods and rejected by try_read
+        CompressedDataBuf,
+        MemoryMappedCompressedFile,
+        File
+    };
+
+    // Methods
+    /**
+     * Reset streaming decompression state so it will start decompressing from the beginning of the
+     * stream afterwards
+     */
+    void reset_stream();
+
+    // Variables
+    InputType m_input_type;
+
+    // Compressed stream variables
+    ZSTD_DStream* m_decompression_stream;
+
+    boost::iostreams::mapped_file_source m_memory_mapped_compressed_file;
+    FileReader* m_file_reader;
+    size_t m_file_reader_initial_pos;
+    std::unique_ptr m_file_read_buffer;
+    size_t m_file_read_buffer_length;
+    size_t m_file_read_buffer_capacity;
+
+    ZSTD_inBuffer m_compressed_stream_block{};
+
+    size_t m_decompressed_stream_pos;
+    size_t m_unused_decompressed_stream_block_size;
+    std::unique_ptr m_unused_decompressed_stream_block_buffer;
+};
+
+template
+ErrorCode ZstdDecompressor::try_read_numeric_value(ValueType& value) {
+    ErrorCode error_code = try_read_exact_length(reinterpret_cast(&value), sizeof(value));
+    if (ErrorCodeSuccess != error_code) {
+        return error_code;
+    }
+    return ErrorCodeSuccess;
+}
+}  // namespace clp_s
+
+#endif  // CLP_S_ZSTDDECOMPRESSOR_HPP
diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp
new file mode 100644
index 000000000..98de6a4b9
--- /dev/null
+++ b/components/core/src/clp_s/clp-s.cpp
@@ -0,0 +1,125 @@
+#include
+
+#include "CommandLineArguments.hpp"
+#include "JsonConstructor.hpp"
+#include "JsonParser.hpp"
+#include "ReaderUtils.hpp"
+#include "search/ConvertToExists.hpp"
+#include "search/EmptyExpr.hpp"
+#include "search/EvaluateTimestampIndex.hpp"
+#include "search/kql/kql.hpp"
+#include "search/NarrowTypes.hpp"
+#include "search/OrOfAndForm.hpp"
+#include "search/Output.hpp"
+#include "search/SchemaMatch.hpp"
+#include "TimestampPattern.hpp"
+#include "Utils.hpp"
+
+using namespace clp_s::search;
+using clp_s::CommandLineArguments;
+
+int main(int argc, char const* argv[]) {
+    try {
+        auto stderr_logger = spdlog::stderr_logger_st("stderr");
+        spdlog::set_default_logger(stderr_logger);
+        spdlog::set_pattern("%Y-%m-%dT%H:%M:%S.%e%z [%l] %v");
+    } catch (std::exception& e) {
+        // NOTE: We can't log an exception if the logger couldn't be constructed
+        return -1;
+    }
+
+    CommandLineArguments command_line_arguments("clp-s");
+    auto parsing_result = command_line_arguments.parse_arguments(argc, argv);
+    switch (parsing_result) {
+        case CommandLineArguments::ParsingResult::Failure:
+            return -1;
+        case CommandLineArguments::ParsingResult::InfoCommand:
+            return 0;
+        case CommandLineArguments::ParsingResult::Success:
+            // Continue processing
+            break;
+    }
+
+    if (CommandLineArguments::Command::Compress == command_line_arguments.get_command()) {
+        clp_s::TimestampPattern::init();
+
+        clp_s::JsonParserOption option;
+        option.file_paths = command_line_arguments.get_file_paths();
+        option.archives_dir = command_line_arguments.get_archives_dir();
+        option.target_encoded_size = command_line_arguments.get_target_encoded_size();
+        option.compression_level = command_line_arguments.get_compression_level();
+        auto const& timestamp_key = command_line_arguments.get_timestamp_key();
+        if (false == timestamp_key.empty()) {
+            clp_s::StringUtils::tokenize_column_descriptor(timestamp_key, option.timestamp_column);
+        }
+
+        clp_s::JsonParser parser(option);
+        parser.parse();
+        parser.store();
+        parser.close();
+    } else if (CommandLineArguments::Command::Extract == command_line_arguments.get_command()) {
+        clp_s::JsonConstructorOption option;
+        option.archives_dir = command_line_arguments.get_archives_dir();
+        option.output_dir = command_line_arguments.get_output_dir();
+
+        clp_s::JsonConstructor constructor(option);
+        constructor.construct();
+        constructor.store();
+        constructor.close();
+    } else {
+        auto const& archives_dir = command_line_arguments.get_archives_dir();
+        auto const& query = command_line_arguments.get_query();
+        clp_s::TimestampPattern::init();
+
+        auto query_stream = std::istringstream(query);
+        auto expr = kql::parse_kql_expression(query_stream);
+
+        if (std::dynamic_pointer_cast(expr)) {
+            SPDLOG_ERROR("Query '{}' is logically false", query);
+            return 1;
+        }
+
+        OrOfAndForm standardize_pass;
+        if (expr = standardize_pass.run(expr); std::dynamic_pointer_cast(expr)) {
+            SPDLOG_ERROR("Query '{}' is logically false", query);
+            return 1;
+        }
+
+        NarrowTypes narrow_pass;
+        if (expr = narrow_pass.run(expr); std::dynamic_pointer_cast(expr)) {
+            SPDLOG_ERROR("Query '{}' is logically false", query);
+            return 1;
+        }
+
+        ConvertToExists convert_pass;
+        if (expr = convert_pass.run(expr); std::dynamic_pointer_cast(expr)) {
+            SPDLOG_ERROR("Query '{}' is logically false", query);
+            return 1;
+        }
+
+        // Skip decompressing the archive if the timestamp index shows that the query cannot
+        // match
+        auto timestamp_dict = clp_s::ReaderUtils::read_timestamp_dictionary(archives_dir);
+        EvaluateTimestampIndex timestamp_index(timestamp_dict);
+        if (clp_s::EvaluatedValue::False == timestamp_index.run(expr)) {
+            SPDLOG_ERROR("No matching timestamp ranges for query '{}'", query);
+            return 1;
+        }
+
+        auto schema_tree = clp_s::ReaderUtils::read_schema_tree(archives_dir);
+        auto schemas = clp_s::ReaderUtils::read_schemas(archives_dir);
+
+        // Narrow against schemas
+        SchemaMatch match_pass(schema_tree, schemas);
+        if (expr = match_pass.run(expr); std::dynamic_pointer_cast(expr)) {
+            SPDLOG_ERROR("No matching schemas for query '{}'", query);
+            return 1;
+        }
+
+        // Output results
+        Output output(schema_tree, schemas, match_pass, expr, archives_dir, timestamp_dict);
+        output.filter();
+    }
+
+    return 0;
+}
diff --git a/components/core/src/clp_s/search/AndExpr.cpp b/components/core/src/clp_s/search/AndExpr.cpp
new file mode 100644
index 000000000..87a57509e
--- /dev/null
+++ b/components/core/src/clp_s/search/AndExpr.cpp
@@ -0,0 +1,57 @@
+#include "AndExpr.hpp"
+
+#include
+
+namespace clp_s::search {
+AndExpr::AndExpr(bool inverted, Expression* parent) : Expression(inverted, parent) {}
+
+AndExpr::AndExpr(AndExpr const& expr) : Expression(expr) {}
+
+void AndExpr::print() {
+    auto& os = get_print_stream();
+    if (is_inverted()) {
+        os << "!";
+    }
+
+    os << "AndExpr(";
+    for (auto it = op_begin(); it != op_end();) {
+        (*it)->print();
+        ++it;
+        if (it != op_end()) {
+            os << ", ";
+        }
+    }
+    os << ")";
+
+    if (get_parent() == nullptr) {
+        os << std::endl;
+    } else {
+        os << std::flush;
+    }
+}
+
+std::shared_ptr AndExpr::copy() const {
+    auto new_expr = std::shared_ptr(new AndExpr(*this));
+    for (auto it = new_expr->op_begin(); it != new_expr->op_end(); ++it) {
+        auto expr = std::static_pointer_cast(*it);
+        expr->copy_replace(new_expr.get(), it);
+    }
+    return new_expr;
+}
+
+std::shared_ptr AndExpr::create(bool inverted, Expression* parent) {
+    return std::shared_ptr(static_cast(new AndExpr(inverted, parent)));
+}
+
+std::shared_ptr AndExpr::create(
+        std::shared_ptr& op1,
+        std::shared_ptr& op2,
+        bool inverted,
+        Expression* parent
+) {
+    std::shared_ptr expr(static_cast(new AndExpr(inverted, parent)));
+    op1->copy_append(expr.get());
+    op2->copy_append(expr.get());
+    return expr;
+}
+}  // namespace clp_s::search
diff --git a/components/core/src/clp_s/search/AndExpr.hpp b/components/core/src/clp_s/search/AndExpr.hpp
new file mode 100644
index 000000000..3ba614ff5
--- /dev/null
+++ b/components/core/src/clp_s/search/AndExpr.hpp
@@ -0,0 +1,58 @@
+#ifndef
CLP_S_SEARCH_ANDEXPR_HPP +#define CLP_S_SEARCH_ANDEXPR_HPP + +#include "Expression.hpp" + +namespace clp_s::search { +/** + * Class representing a logical And operation across all + * children in its OpList. Can have arbitrarily many children. + */ +class AndExpr : public Expression { +public: + void print() override; + + /** + * And expressions only have other expressions as children by construction + */ + bool has_only_expression_operands() override { return true; } + + /** + * Deep copy + * @return A deep copy of this expression + */ + std::shared_ptr copy() const override; + + /** + * Create an empty And expression which can optionally be inverted and attached to a parent. + * Children can be added via mutators inherited from Expression. + * @param inverted expression is inverted when true + * @param parent parent this expression is attached to + * @return Newly created Or expression + */ + static std::shared_ptr create(bool inverted = false, Expression* parent = nullptr); + + /** + * Create an And expression with two children + * @param op1 the first child operand + * @param op2 the second child operand + * @param inverted expression is inverted when true + * @param parent parent this expression is attached to + * @return Newly created Or expression + */ + static std::shared_ptr create( + std::shared_ptr& op1, + std::shared_ptr& op2, + bool inverted = false, + Expression* parent = nullptr + ); + +private: + // Constructor + explicit AndExpr(bool inverted = false, Expression* parent = nullptr); + + AndExpr(AndExpr const&); +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_ANDEXPR_HPP diff --git a/components/core/src/clp_s/search/BooleanLiteral.cpp b/components/core/src/clp_s/search/BooleanLiteral.cpp new file mode 100644 index 000000000..127e085d3 --- /dev/null +++ b/components/core/src/clp_s/search/BooleanLiteral.cpp @@ -0,0 +1,44 @@ +#include "BooleanLiteral.hpp" + +namespace clp_s::search { +std::shared_ptr BooleanLiteral::create_from_bool(bool v) 
{ + return std::shared_ptr(new BooleanLiteral(v)); +} + +std::shared_ptr BooleanLiteral::create_from_string(std::string const& v) { + if (v == "true") { + return std::shared_ptr(new BooleanLiteral(true)); + } else if (v == "false") { + return std::shared_ptr(new BooleanLiteral(false)); + } + + return {nullptr}; +} + +void BooleanLiteral::print() { + auto& os = get_print_stream(); + if (m_v) { + os << "true"; + } else { + os << "false"; + } +} + +bool BooleanLiteral::as_var_string(std::string& ret, FilterOperation op) { + if (op == FilterOperation::EQ || op == FilterOperation::NEQ) { + ret = m_v ? "true" : "false"; + return true; + } + + return false; +} + +bool BooleanLiteral::as_bool(bool& ret, FilterOperation op) { + if (op == FilterOperation::EQ || op == FilterOperation::NEQ) { + ret = m_v; + return true; + } + + return false; +} +} // namespace clp_s::search diff --git a/components/core/src/clp_s/search/BooleanLiteral.hpp b/components/core/src/clp_s/search/BooleanLiteral.hpp new file mode 100644 index 000000000..af409b81d --- /dev/null +++ b/components/core/src/clp_s/search/BooleanLiteral.hpp @@ -0,0 +1,58 @@ +#ifndef CLP_S_SEARCH_BOOLEANLITERAL_HPP +#define CLP_S_SEARCH_BOOLEANLITERAL_HPP + +#include +#include +#include + +#include "Literal.hpp" + +namespace clp_s::search { +/** + * Class representing a Boolean literal in the search AST + */ +class BooleanLiteral : public Literal { +public: + // Deleted copy + BooleanLiteral(BooleanLiteral const&) = delete; + BooleanLiteral& operator=(BooleanLiteral const&) = delete; + + /** + * Create a bool literal + * @param v the value of the boolean + * @return A Boolean literal + */ + static std::shared_ptr create_from_bool(bool v); + + /** + * Attempt to create a bool literal from a string + * @param v the string we are attempting to convert to bool + * @return A Boolean literal, or nullptr if the string is neither true nor false (exact match) + */ + static std::shared_ptr create_from_string(std::string const& v); + + // Methods
inherited from Value + void print() override; + + // Methods inherited from Literal + bool matches_type(LiteralType type) override { return type & LiteralType::BooleanT; } + + bool matches_any(LiteralTypeBitmask mask) override { return mask & LiteralType::BooleanT; } + + bool matches_exactly(LiteralTypeBitmask mask) override { return mask == LiteralType::BooleanT; } + + bool as_var_string(std::string& ret, FilterOperation op) override; + + bool as_bool(bool& ret, FilterOperation op) override; + +private: + bool m_v; + + // Constructors (private; use the create_from_* factories). NOTE(review): the defaulted constructor does not explicitly initialize m_v + BooleanLiteral() = default; + + explicit BooleanLiteral(bool v) : m_v(v){}; +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_BOOLEANLITERAL_HPP diff --git a/components/core/src/clp_s/search/ColumnDescriptor.cpp b/components/core/src/clp_s/search/ColumnDescriptor.cpp new file mode 100644 index 000000000..7c82310ad --- /dev/null +++ b/components/core/src/clp_s/search/ColumnDescriptor.cpp @@ -0,0 +1,90 @@ +#include "ColumnDescriptor.hpp" + +#include + +namespace clp_s::search { +DescriptorList tokenize_descriptor(std::vector const& descriptors) { + DescriptorList list; + for (std::string const& descriptor : descriptors) { + list.push_back(DescriptorToken(descriptor)); + } + return list; +} + +void ColumnDescriptor::check_and_set_unresolved_descriptor_flag() { + m_unresolved_descriptors = false; + m_pure_wildcard = m_descriptors.size() == 1 && m_descriptors[0].wildcard(); + for (auto const& token : m_descriptors) { + if (token.wildcard() || token.regex()) { + m_unresolved_descriptors = true; + break; + } + } +} + +ColumnDescriptor::ColumnDescriptor(std::string const& descriptor) { + m_flags = cAllTypes; + m_descriptors.emplace_back(descriptor); + check_and_set_unresolved_descriptor_flag(); +} + +ColumnDescriptor::ColumnDescriptor(std::vector const& descriptors) { + m_flags = cAllTypes; + m_descriptors = std::move(tokenize_descriptor(descriptors)); + check_and_set_unresolved_descriptor_flag(); +} +
+ColumnDescriptor::ColumnDescriptor(DescriptorList const& descriptors) { + m_flags = cAllTypes; + m_descriptors = descriptors; + check_and_set_unresolved_descriptor_flag(); +} + +std::shared_ptr ColumnDescriptor::create(std::string const& descriptor) { + return std::shared_ptr(new ColumnDescriptor(descriptor)); +} + +std::shared_ptr ColumnDescriptor::create( + std::vector const& descriptors +) { + return std::shared_ptr(new ColumnDescriptor(descriptors)); +} + +std::shared_ptr ColumnDescriptor::create(DescriptorList const& descriptors) { + return std::shared_ptr(new ColumnDescriptor(descriptors)); +} + +std::shared_ptr ColumnDescriptor::copy() { + return std::make_shared(*this); +} + +void ColumnDescriptor::print() { + auto& os = get_print_stream(); + os << "ColumnDescriptor<"; + for (uint32_t flag = LiteralType::TypesBegin; flag < LiteralType::TypesEnd; flag <<= 1) { + if (m_flags & flag) { + os << Literal::type_to_string(static_cast(flag)); + + // Add a comma if m_flags has any bit set above the current flag (i.e. more types follow) + if (flag << 1 <= m_flags) { + os << ","; + } + } + } + os << ">("; + + for (auto it = m_descriptors.begin(); it != m_descriptors.end();) { + os << "\"" << (*it).get_token() << "\""; + + it++; + if (it != m_descriptors.end()) { + os << ", "; + } + } + os << ")"; +} + +void ColumnDescriptor::add_unresolved_tokens(DescriptorList::iterator it) { + m_unresolved_tokens.assign(it, descriptor_end()); +} +} // namespace clp_s::search diff --git a/components/core/src/clp_s/search/ColumnDescriptor.hpp b/components/core/src/clp_s/search/ColumnDescriptor.hpp new file mode 100644 index 000000000..b0260eb67 --- /dev/null +++ b/components/core/src/clp_s/search/ColumnDescriptor.hpp @@ -0,0 +1,214 @@ +#ifndef CLP_S_SEARCH_COLUMNDESCRIPTOR_HPP +#define CLP_S_SEARCH_COLUMNDESCRIPTOR_HPP + +#include +#include +#include +#include + +#include "Literal.hpp" + +namespace clp_s::search { +/** + * Class representing a token used to describe one level of hierarchy in a column.
+ */ +class DescriptorToken { +public: + // Constructors + DescriptorToken() = default; + + /** + * Initialize the token from a string and set flags based on whether the token contains + * wildcards + * @param token the string to initialize the token from + */ + explicit DescriptorToken(std::string const& token) + : m_token(token), + m_wildcard(false), + m_regex(false) { + if (token == "*") { + m_wildcard = true; + } + + for (char c : token) { + if (c == '*') { + m_regex = true; + } + } + } + + /** + * Whether the descriptor is a wildcard + * @return true if the token consists solely of a single wildcard character + */ + bool wildcard() const { return m_wildcard; } + + /** + * Whether the descriptor contains a wildcard somewhere + * TODO: regex isn't currently supported (flag is read by check_and_set_unresolved_descriptor_flag) + * @return true if the descriptor contains a wildcard + */ + bool regex() const { return m_regex; } + + /** + * Get a reference to the underlying token string + * @return a reference to the underlying string + */ + std::string const& get_token() const { return m_token; } + +private: + bool m_wildcard{}; + bool m_regex{}; + std::string m_token; +}; + +typedef std::vector DescriptorList; + +DescriptorList tokenize_descriptor(std::vector const& descriptors); + +/** + * Class representing a Column in the Search AST. The Column is specified + * by a list of DescriptorTokens which may be wildcards. + * + * Currently only pure wildcard DescriptorTokens are supported -- some descriptor + * in the list of descriptors can be a wildcard, but individual descriptors cannot mix + * wildcards with other characters.
+ */ +class ColumnDescriptor : public Literal { +public: + /** + * Create a ColumnDescriptor from a single token or a list of tokens + * @param descriptor(s) the token or list of tokens making up the descriptor + * @return A ColumnDescriptor + */ + static std::shared_ptr create(std::string const& descriptor); + static std::shared_ptr create(std::vector const& descriptors); + static std::shared_ptr create(DescriptorList const& descriptors); + + /** + * Deep copy of this ColumnDescriptor + * @return A deep copy of this Column descriptor + */ + std::shared_ptr copy(); + + /** + * Get iterators to this Column's list of descriptors + * @return Iterators to the beginning and end of the list of descriptors + */ + DescriptorList::iterator descriptor_begin() { return m_descriptors.begin(); } + + DescriptorList::iterator descriptor_end() { return m_descriptors.end(); } + + /** + * @return A reference to the underlying list of descriptors. + * Useful when the descriptors need to be mutated e.g. when being resolved. + */ + DescriptorList& get_descriptor_list() { return m_descriptors; } + + /** + * Set the unresolved tokens for this column descriptor to a suffix of the descriptor list. + * Used for array searches.
+ * FIXME: this is incredibly confusing to use + * @param it the iterator to start from when setting unresolved tokens to the suffix + */ + void add_unresolved_tokens(DescriptorList::iterator it); + + /** + * Set (replace) the types this column can match + * @param flags bitmask of types that can be matched by this column + */ + void set_matching_types(LiteralTypeBitmask flags) { m_flags = flags; } + + /** + * Set the single type this column can match (replaces any previous set) + * @param type the type that can be matched by this column + */ + void set_matching_type(LiteralType type) { m_flags = type; } + + /** + * Remove types from set of types this column can match + * @param flags to be removed + */ + void remove_matching_types(LiteralTypeBitmask flags) { m_flags &= ~flags; } + + /** + * Remove type from set of types this column can match + * @param type to be removed + */ + void remove_matching_type(LiteralType type) { m_flags &= ~type; } + + /** + * @return the CLJ column Id this Column represents. Garbage value if it was never set. + */ + int32_t get_column_id() const { return m_id; } + + /** + * Set the CLJ column Id this column represents + * @param id the CLJ column Id to set this column to + */ + void set_column_id(int32_t id) { m_id = id; } + + /** + * Get the list of unresolved tokens used for array search + * @return the list of unresolved tokens + * FIXME: should be reference?
+ */ + DescriptorList get_unresolved_tokens() const { return m_unresolved_tokens; } + + /** + * Whether the Column has any unresolved tokens for array search + * @return true if there are unresolved tokens for array search + */ + bool has_unresolved_tokens() const { return !m_unresolved_tokens.empty(); } + + // Safe only if this column has been explicitly set to + // only have a single type + LiteralType get_literal_type() const { return static_cast(m_flags); } + + /** + * Whether the list of descriptors contains any wildcards + * @return true if the descriptor contains any wildcards that need to be resolved + */ + bool is_unresolved_descriptor() const { return m_unresolved_descriptors; } + + /** + * Whether this Column is a single wildcard + * @return true if this descriptor is just a single wildcard + */ + bool is_pure_wildcard() const { return m_pure_wildcard; } + + // Methods inherited from Value + void print() override; + + // Methods inherited from Literal + // ColumnDescriptor can implicitly match several different types at the same time. + bool matches_type(LiteralType type) override { return m_flags & type; } + + bool matches_any(LiteralTypeBitmask mask) override { return m_flags & mask; } + + bool matches_exactly(LiteralTypeBitmask mask) override { return m_flags == mask; } + +private: + DescriptorList m_descriptors; // list of descriptors describing the column + DescriptorList m_unresolved_tokens; // unresolved tokens used for array search + LiteralTypeBitmask m_flags; // set of types this column can match + int32_t m_id; // unambiguous CLJ column id this column represents. May be unset.
+ bool m_unresolved_descriptors; // true if any descriptor token contains a wildcard + bool m_pure_wildcard; // true if the column is a single wildcard token + + // Constructors + explicit ColumnDescriptor(std::string const&); + + explicit ColumnDescriptor(std::vector const&); + + explicit ColumnDescriptor(DescriptorList const&); + + /** + * Scan the list of descriptors to check if they contain wildcards and + * set the appropriate flags (m_unresolved_descriptors and m_pure_wildcard). + */ + void check_and_set_unresolved_descriptor_flag(); +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_COLUMNDESCRIPTOR_HPP diff --git a/components/core/src/clp_s/search/ConstantProp.cpp b/components/core/src/clp_s/search/ConstantProp.cpp new file mode 100644 index 000000000..0f19288bd --- /dev/null +++ b/components/core/src/clp_s/search/ConstantProp.cpp @@ -0,0 +1,43 @@ +#include "ConstantProp.hpp" + +#include + +#include "AndExpr.hpp" +#include "EmptyExpr.hpp" +#include "OrExpr.hpp" + +namespace clp_s::search { +std::shared_ptr ConstantProp::run(std::shared_ptr& expr) { + return propagate_empty(expr); +} + +std::shared_ptr ConstantProp::propagate_empty(std::shared_ptr cur) { + if (std::dynamic_pointer_cast(cur)) { + std::vector deleted; + for (auto it = cur->op_begin(); it != cur->op_end(); it++) { + auto new_child = propagate_empty(std::static_pointer_cast(*it)); + if (std::dynamic_pointer_cast(new_child)) { + deleted.push_back(it); + } + } + + if (deleted.size() == cur->get_op_list().size()) { + return EmptyExpr::create(cur->get_parent()); + } + + for (auto const& it : deleted) { + cur->get_op_list().erase(it); + } + } else if (std::dynamic_pointer_cast(cur)) { + for (auto it = cur->op_begin(); it != cur->op_end(); it++) { + auto new_child = propagate_empty(std::static_pointer_cast(*it)); + if (std::dynamic_pointer_cast(new_child)) { + new_child->set_parent(cur->get_parent()); + return new_child; + } + } + } + + return cur; +} +} // namespace clp_s::search diff --git a/components/core/src/clp_s/search/ConstantProp.hpp
b/components/core/src/clp_s/search/ConstantProp.hpp new file mode 100644 index 000000000..b17032001 --- /dev/null +++ b/components/core/src/clp_s/search/ConstantProp.hpp @@ -0,0 +1,23 @@ +#ifndef CLP_S_SEARCH_CONSTANTPROP_HPP +#define CLP_S_SEARCH_CONSTANTPROP_HPP + +#include "Transformation.hpp" + +namespace clp_s::search { +// Constant-propagates empty expressions, keeping all remaining data IN PLACE +class ConstantProp : public Transformation { +public: + // Methods inherited from Transformation + std::shared_ptr run(std::shared_ptr& expr) override; + +private: + /** + * Propagate empty expressions through the expression tree + * @param cur the expression to propagate empty expressions through + * @return The expression with empty expressions propagated (cur itself unless it is replaced) + */ + static std::shared_ptr propagate_empty(std::shared_ptr cur); +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_CONSTANTPROP_HPP diff --git a/components/core/src/clp_s/search/ConvertToExists.cpp b/components/core/src/clp_s/search/ConvertToExists.cpp new file mode 100644 index 000000000..c926a0552 --- /dev/null +++ b/components/core/src/clp_s/search/ConvertToExists.cpp @@ -0,0 +1,116 @@ +#include "ConvertToExists.hpp" + +#include "ColumnDescriptor.hpp" +#include "ConstantProp.hpp" +#include "EmptyExpr.hpp" +#include "FilterExpr.hpp" +#include "Literal.hpp" +#include "OrExpr.hpp" +#include "OrOfAndForm.hpp" + +namespace clp_s::search { +std::shared_ptr ConvertToExists::run(std::shared_ptr& expr) { + expr = convert(expr); + + if (m_needs_standard_form) { + OrOfAndForm pass; + expr = pass.run(expr); + } + + if (m_needs_constant_prop) { + ConstantProp pass; + expr = pass.run(expr); + } + + return expr; +} + +std::shared_ptr ConvertToExists::convert(std::shared_ptr cur) { + if (cur->has_only_expression_operands()) { + for (auto it = cur->op_begin(); it != cur->op_end(); it++) { + auto child = std::static_pointer_cast(*it); + auto new_child = convert(child); + if (new_child != child) { + new_child->copy_replace(cur.get(), it); + } + } + } else if (auto
filter = std::dynamic_pointer_cast(cur)) { + // TODO: will have to change if we start supporting multi-column expressions + auto column = filter->get_column(); + auto op = filter->get_operation(); + + if (op == FilterOperation::EXISTS || op == FilterOperation::NEXISTS) { + if (false == filter->is_inverted()) { + return cur; + } + + FilterOperation new_op = (op == FilterOperation::EXISTS) ? FilterOperation::NEXISTS + : FilterOperation::EXISTS; + auto new_col = column->copy(); + return FilterExpr::create(new_col, new_op); + } + + auto literal = filter->get_operand(); + + bool exists; + if (filter->is_inverted()) { + exists = op == FilterOperation::NEQ; + } else { + exists = op == FilterOperation::EQ; + } + + if (literal->as_any(op)) { + auto new_col = column->copy(); + if (exists) { + return FilterExpr::create(new_col, FilterOperation::EXISTS); + } else { + return FilterExpr::create(new_col, FilterOperation::NEXISTS); + } + } else if (literal->as_null(op)) { + auto new_col = column->copy(); + auto new_col_null = column->copy(); + if (exists) { + m_needs_standard_form = true; + new_col->remove_matching_types( + cAllTypes + & ~(LiteralType::ArrayT | LiteralType::ClpStringT | LiteralType::VarStringT) + ); + new_col_null->remove_matching_types(cAllTypes & ~LiteralType::NullT); + std::shared_ptr non_null_filter; + if (new_col->matches_any(cAllTypes)) { + non_null_filter = FilterExpr::create(new_col, FilterOperation::EQ); + non_null_filter->add_operand(literal); + } else { + non_null_filter = EmptyExpr::create(); + m_needs_constant_prop = true; + } + + std::shared_ptr null_filter; + if (new_col_null->matches_any(cAllTypes)) { + null_filter = FilterExpr::create(new_col_null, FilterOperation::EXISTS); + } else { + null_filter = EmptyExpr::create(); + m_needs_constant_prop = true; + } + + return OrExpr::create(null_filter, non_null_filter); + } else { + if (new_col->matches_type(LiteralType::NullT)) { + // != null supersedes all other types +
new_col->set_matching_types(cAllTypes & ~LiteralType::NullT); + return FilterExpr::create(new_col, FilterOperation::EXISTS); + } else { + new_col->remove_matching_type(LiteralType::NullT); + if (new_col->matches_any(cAllTypes)) { + return FilterExpr::create(new_col, FilterOperation::EXISTS); + } else { + m_needs_constant_prop = true; + return EmptyExpr::create(); + } + } + } + } + } + return cur; +} +} // namespace clp_s::search diff --git a/components/core/src/clp_s/search/ConvertToExists.hpp b/components/core/src/clp_s/search/ConvertToExists.hpp new file mode 100644 index 000000000..6e417806a --- /dev/null +++ b/components/core/src/clp_s/search/ConvertToExists.hpp @@ -0,0 +1,29 @@ +#ifndef CLP_S_SEARCH_CONVERTTOEXISTS_HPP +#define CLP_S_SEARCH_CONVERTTOEXISTS_HPP + +#include "Transformation.hpp" + +namespace clp_s::search { +// Must run after NarrowTypes pass +class ConvertToExists : public Transformation { +public: + // Constructors + ConvertToExists() : m_needs_constant_prop(false), m_needs_standard_form(false) {} + + // Methods inherited from Transformation + std::shared_ptr run(std::shared_ptr& expr) override; + +private: + bool m_needs_constant_prop; + bool m_needs_standard_form; + + /** + * Convert an expression to exists form + * @param cur the expression to convert + * @return The expression in exists form (cur itself when no conversion applies) + */ + std::shared_ptr convert(std::shared_ptr cur); +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_CONVERTTOEXISTS_HPP diff --git a/components/core/src/clp_s/search/DateLiteral.cpp b/components/core/src/clp_s/search/DateLiteral.cpp new file mode 100644 index 000000000..6296baa64 --- /dev/null +++ b/components/core/src/clp_s/search/DateLiteral.cpp @@ -0,0 +1,92 @@ +#include "DateLiteral.hpp" + +#include + +#include "../TimestampPattern.hpp" +#include "SearchUtils.hpp" + +namespace clp_s::search { +DateLiteral::DateLiteral(double v, std::string s) : Integral(v), m_epoch_str(std::move(s)) {} + +DateLiteral::DateLiteral(epochtime_t v,
std::string s) : Integral(v), m_epoch_str(std::move(s)) {} + +std::shared_ptr DateLiteral::create_from_float(double v) { + std::ostringstream s; + s << v; + s.str(); /* NOTE(review): return value discarded; statement has no effect */ + return std::shared_ptr(static_cast(new DateLiteral(v, s.str()))); +} + +std::shared_ptr DateLiteral::create_from_int(epochtime_t v) { + std::ostringstream s; + s << v; + s.str(); /* NOTE(review): return value discarded; statement has no effect */ + return std::shared_ptr(static_cast(new DateLiteral(v, s.str()))); +} + +std::shared_ptr DateLiteral::create_from_string(std::string const& v) { + std::istringstream ss(v); + epochtime_t tmp_int_epoch; + double tmp_double_epoch; + + ss >> std::noskipws >> tmp_int_epoch; + if (false == ss.fail() && ss.eof()) { + return std::shared_ptr(static_cast(new DateLiteral(tmp_int_epoch, v))); + } + + ss = std::istringstream(v); + ss >> std::noskipws >> tmp_double_epoch; + if (false == ss.fail() && ss.eof()) { + return std::shared_ptr(static_cast(new DateLiteral(tmp_double_epoch, v)) + ); + } + + // begin/end positions are output arguments only -- their values are not used here + size_t timestamp_begin_pos = 0, timestamp_end_pos = 0; + auto pattern = TimestampPattern::search_known_ts_patterns( + v, + tmp_int_epoch, + timestamp_begin_pos, + timestamp_end_pos + ); + if (pattern == nullptr) { + return std::shared_ptr(nullptr); + } + + return std::shared_ptr(static_cast(new DateLiteral(tmp_int_epoch, v))); +} + +void DateLiteral::print() { + get_print_stream() << m_epoch_str; +} + +bool DateLiteral::as_clp_string(std::string& ret, FilterOperation op) { + if (op == FilterOperation::LT || op == FilterOperation::GT || op == FilterOperation::LTE + || op == FilterOperation::GTE) + { + return false; + } + + if (m_epoch_str.find(' ') == std::string::npos) { + return false; + } + + ret = m_epoch_str; + return true; +} + +bool DateLiteral::as_var_string(std::string& ret, FilterOperation op) { + if (op == FilterOperation::LT || op == FilterOperation::GT || op == FilterOperation::LTE + || op == FilterOperation::GTE) + { + return false; + } + + if
(m_epoch_str.find(' ') != std::string::npos) { + return false; + } + + ret = m_epoch_str; + return true; +} +} // namespace clp_s::search diff --git a/components/core/src/clp_s/search/DateLiteral.hpp b/components/core/src/clp_s/search/DateLiteral.hpp new file mode 100644 index 000000000..09df3fb03 --- /dev/null +++ b/components/core/src/clp_s/search/DateLiteral.hpp @@ -0,0 +1,65 @@ +#ifndef CLP_S_SEARCH_DATELITERAL_HPP +#define CLP_S_SEARCH_DATELITERAL_HPP + +#include + +#include "../Defs.hpp" +#include "Integral.hpp" + +namespace clp_s::search { +constexpr LiteralTypeBitmask cDateLiteralTypes = EpochDateT | FloatDateT; + +/** + * Class for Date literal in the search AST. Represents time + * in epoch time. + */ +class DateLiteral : public Integral { +public: + // Deleted copy + DateLiteral(DateLiteral const&) = delete; + DateLiteral& operator=(DateLiteral const&) = delete; + + /** + * Create a Date literal from a numeric value + * @param v the time as a double or as an integer epoch + * @return A Date literal + */ + static std::shared_ptr create_from_float(double v); + static std::shared_ptr create_from_int(epochtime_t v); + + /** + * Attempt to create a Date literal from string. Tries to parse the string as an integer + * epoch, then as a double, and finally using TimestampPattern's known patterns. + * @return A Date Literal or nullptr if the string cannot be parsed as date.
+ */ + static std::shared_ptr create_from_string(std::string const& v); + + // Methods inherited from Value + void print() override; + + // Methods inherited from Literal + bool matches_type(LiteralType type) override { return type & cDateLiteralTypes; } + + bool matches_any(LiteralTypeBitmask mask) override { return mask & cDateLiteralTypes; } + + bool matches_exactly(LiteralTypeBitmask mask) override { return mask == cDateLiteralTypes; } + + bool as_epoch_date() override { return true; } + + bool as_float_date() override { return true; } + + bool as_clp_string(std::string& ret, FilterOperation op) override; + + bool as_var_string(std::string& ret, FilterOperation op) override; + +private: + std::string m_epoch_str; + + // Constructors (private; use the create_from_* factories) + explicit DateLiteral(double v, std::string s); + + explicit DateLiteral(epochtime_t v, std::string s); +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_DATELITERAL_HPP diff --git a/components/core/src/clp_s/search/EmptyExpr.cpp b/components/core/src/clp_s/search/EmptyExpr.cpp new file mode 100644 index 000000000..201ef739a --- /dev/null +++ b/components/core/src/clp_s/search/EmptyExpr.cpp @@ -0,0 +1,27 @@ +#include "EmptyExpr.hpp" + +namespace clp_s::search { +EmptyExpr::EmptyExpr(Expression* parent) : Expression(false, parent) {} + +EmptyExpr::EmptyExpr(EmptyExpr const& expr) : Expression(expr) {} + +std::shared_ptr EmptyExpr::create(Expression* parent) { + return std::shared_ptr(static_cast(new EmptyExpr(parent))); +} + +void EmptyExpr::print() { + auto& os = get_print_stream(); + os << "EmptyExpr()"; + + if (get_parent() == nullptr) { + os << std::endl; + } else { + os << std::flush; + } +} + +std::shared_ptr EmptyExpr::copy() const { + // Copying an EmptyExpr can rely on the copy constructor's shallow copy + return std::shared_ptr(static_cast(new EmptyExpr(*this))); +} +} // namespace clp_s::search diff --git a/components/core/src/clp_s/search/EmptyExpr.hpp b/components/core/src/clp_s/search/EmptyExpr.hpp new file mode 100644 index
000000000..3a002eafd --- /dev/null +++ b/components/core/src/clp_s/search/EmptyExpr.hpp @@ -0,0 +1,37 @@ +#ifndef CLP_S_SEARCH_EMPTYEXPR_HPP +#define CLP_S_SEARCH_EMPTYEXPR_HPP + +#include "Expression.hpp" + +namespace clp_s::search { +/** + * Class representing the empty set/false. Useful + * for constant propagation and eliminating expressions. + */ +class EmptyExpr : public Expression { +public: + /** + * Create an Empty expression which can optionally be attached to a parent + * @param parent parent this expression is attached to (nullptr if none) + * @return Newly created Empty expression + */ + static std::shared_ptr create(Expression* parent = nullptr); + + // Methods inherited from Value + void print() override; + + // Methods inherited from Expression + // EmptyExpr never has any operands, so we arbitrarily say that all operands are Expression + bool has_only_expression_operands() override { return true; } + + std::shared_ptr copy() const override; + +private: + // Constructor + explicit EmptyExpr(Expression* parent = nullptr); + + EmptyExpr(EmptyExpr const&); +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_EMPTYEXPR_HPP diff --git a/components/core/src/clp_s/search/EvaluateTimestampIndex.cpp b/components/core/src/clp_s/search/EvaluateTimestampIndex.cpp new file mode 100644 index 000000000..7e3339f10 --- /dev/null +++ b/components/core/src/clp_s/search/EvaluateTimestampIndex.cpp @@ -0,0 +1,103 @@ +#include "EvaluateTimestampIndex.hpp" + +#include "AndExpr.hpp" +#include "FilterExpr.hpp" +#include "Integral.hpp" +#include "OrExpr.hpp" + +namespace clp_s::search { +constexpr LiteralTypeBitmask cDateTypes = cIntegralTypes | EpochDateT | FloatDateT; + +EvaluatedValue EvaluateTimestampIndex::run(std::shared_ptr const& expr) { + if (std::dynamic_pointer_cast(expr)) { + bool any_unkown = false; + for (auto it = expr->op_begin(); it != expr->op_end(); it++) { + auto sub_expr = std::static_pointer_cast(*it); + EvaluatedValue ret = run(sub_expr); + if (ret ==
EvaluatedValue::True) { + return expr->is_inverted() ? EvaluatedValue::False : EvaluatedValue::True; + } else if (ret == EvaluatedValue::Unknown) { + any_unkown = true; + } + } + + if (any_unkown) { + return EvaluatedValue::Unknown; + } + // every operand must have evaluated to False + return expr->is_inverted() ? EvaluatedValue::True : EvaluatedValue::False; + } else if (std::dynamic_pointer_cast(expr)) { + bool any_unkown = false; + for (auto it = expr->op_begin(); it != expr->op_end(); it++) { + auto sub_expr = std::static_pointer_cast(*it); + EvaluatedValue ret = run(sub_expr); + if (ret == EvaluatedValue::False) { + return expr->is_inverted() ? EvaluatedValue::True : EvaluatedValue::False; + } else if (ret == EvaluatedValue::Unknown) { + any_unkown = true; + } + } + + if (any_unkown) { + return EvaluatedValue::Unknown; + } + // every operand must have evaluated to True + return expr->is_inverted() ? EvaluatedValue::False : EvaluatedValue::True; + } else if (auto filter = std::dynamic_pointer_cast(expr)) { + auto column = filter->get_column(); + if (false == column->matches_any(cDateTypes)) { + return EvaluatedValue::Unknown; + } + + for (auto range_it = m_timestamp_dict->tokenized_column_to_range_begin(); + range_it != m_timestamp_dict->tokenized_column_to_range_end(); + range_it++) + { + std::vector& tokens = range_it->first; + auto const& descriptors = column->get_descriptor_list(); + // TODO: handle wildcard matching; the initial check on timestamp index happens + // before schema matching, so wildcard descriptors are presumably still unresolved here + if (tokens.size() != descriptors.size()) { + continue; + } + + bool matched = true; + for (size_t i = 0; i < descriptors.size(); ++i) { + if (tokens[i] != descriptors[i].get_token()) { + matched = false; + break; + } + } + if (false == matched) { + continue; + } + + EvaluatedValue ret; + // this is safe after type narrowing because all DateType literals are either + // Integral or a derived class of Integral + Integral64 literal = std::static_pointer_cast(filter->get_operand())->get(); + if
(std::holds_alternative(literal)) { + ret = range_it->second->evaluate_filter( + filter->get_operation(), + std::get(literal) + ); + } else { + ret = range_it->second->evaluate_filter( + filter->get_operation(), + std::get(literal) + ); + } + + if (ret == EvaluatedValue::True) { + return filter->is_inverted() ? EvaluatedValue::False : EvaluatedValue::True; + } else if (ret == EvaluatedValue::False) { + return filter->is_inverted() ? EvaluatedValue::True : EvaluatedValue::False; + } + return EvaluatedValue::Unknown; + } + return EvaluatedValue::Unknown; + } else { + return EvaluatedValue::Unknown; + } +} +} // namespace clp_s::search diff --git a/components/core/src/clp_s/search/EvaluateTimestampIndex.hpp b/components/core/src/clp_s/search/EvaluateTimestampIndex.hpp new file mode 100644 index 000000000..9799ec68d --- /dev/null +++ b/components/core/src/clp_s/search/EvaluateTimestampIndex.hpp @@ -0,0 +1,31 @@ +#ifndef CLP_S_SEARCH_EVALUATETIMESTAMPINDEX_HPP +#define CLP_S_SEARCH_EVALUATETIMESTAMPINDEX_HPP + +#include "../TimestampDictionaryReader.hpp" +#include "../Utils.hpp" +#include "Expression.hpp" + +namespace clp_s::search { +class EvaluateTimestampIndex { +public: + // Constructors + EvaluateTimestampIndex(std::shared_ptr const& timestamp_dict) + : m_timestamp_dict(timestamp_dict) {} + + /** + * Takes an expression and attempts to prove its result (True/False/Unknown) based on + * a timestamp index. Currently doesn't do any constant propagation. + * + * Should only be run after type narrowing.
+ * + * @param expr the expression to evaluate against the timestamp index + * @return The evaluated value of the expression given the index (True, False, or Unknown when the index cannot decide) + */ + EvaluatedValue run(std::shared_ptr const& expr); + +private: + std::shared_ptr m_timestamp_dict; +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_EVALUATETIMESTAMPINDEX_HPP diff --git a/components/core/src/clp_s/search/Expression.cpp b/components/core/src/clp_s/search/Expression.cpp new file mode 100644 index 000000000..45c5b1168 --- /dev/null +++ b/components/core/src/clp_s/search/Expression.cpp @@ -0,0 +1,35 @@ +#include "Expression.hpp" + +namespace clp_s::search { +Expression::Expression(bool inverted, Expression* parent) { + m_inverted = inverted; + m_parent = parent; +} + +Expression::Expression(Expression const& expr) { + m_parent = nullptr; + m_inverted = expr.m_inverted; + m_operands = expr.m_operands; +} + +void Expression::add_operand(std::shared_ptr const& operand) { + m_operands.push_back(std::static_pointer_cast(operand)); + operand->set_parent(this); +} + +void Expression::add_operand(std::shared_ptr const& operand) { + m_operands.push_back(std::static_pointer_cast(operand)); +} + +void Expression::copy_append(Expression* parent) const { + auto new_expr = this->copy(); + new_expr->set_parent(parent); + parent->add_operand(new_expr); +} + +void Expression::copy_replace(Expression* parent, OpList::iterator it) const { + auto new_expr = this->copy(); + new_expr->set_parent(parent); + *it = std::static_pointer_cast(new_expr); +} +} // namespace clp_s::search diff --git a/components/core/src/clp_s/search/Expression.hpp b/components/core/src/clp_s/search/Expression.hpp new file mode 100644 index 000000000..3b67bc16e --- /dev/null +++ b/components/core/src/clp_s/search/Expression.hpp @@ -0,0 +1,118 @@ +#ifndef CLP_S_SEARCH_EXPRESSION_HPP +#define CLP_S_SEARCH_EXPRESSION_HPP + +#include +#include + +#include "Literal.hpp" +#include "Value.hpp" + +namespace clp_s::search {
+typedef std::list> OpList; + +/** + * Top level class for all logical expressions which represent filters + * on columns. + * + * Key subclasses are AndExpr, OrExpr, and FilterExpr + */ +class Expression : public Value { +public: + /** + * True if this expression is inverted + * @return Whether the expression is inverted + */ + bool is_inverted() const { return m_inverted; } + + /** + * Flip whether the expression is inverted + */ + void invert() { m_inverted = !m_inverted; } + + /** + * @return The number of operands that this expression has + */ + unsigned get_num_operands() override { return m_operands.size(); } + + /** + * Get iterators to this Expression's OpList + * @return Iterators to the beginning/end of the OpList + */ + OpList::iterator op_begin() { return m_operands.begin(); } + + OpList::iterator op_end() { return m_operands.end(); } + + /** + * @return A reference to the underlying OpList. Useful in cases where certain children + * need to be deleted, or multiple children need to be spliced in. + */ + OpList& get_op_list() { return m_operands; } + + /** + * Add an operand to the end of the OpList. When the operand is an + * Expression its parent is set to this Expression. + * @param operand the operand to append to the OpList + */ + void add_operand(std::shared_ptr const& operand); + + void add_operand(std::shared_ptr const& operand); + + /** + * @return The parent for this Expression. Can be nullptr if this is the top level. + */ + Expression* get_parent() { return m_parent; } + + /** + * Set the parent for this Expression + * @param parent this Expression's new parent + */ + void set_parent(Expression* parent) { m_parent = parent; } + + /** + * Deep copy + * @return A deep copy of this expression + */ + virtual std::shared_ptr copy() const = 0; + + /** + * Deep copy this expression and append it into *parent*'s OpList. + * Also sets the parent for copy to parent.
+ * @param parent the parent to copy into + */ + void copy_append(Expression* parent) const; + + /** + * Deep copy this expression and replace a specific operand in the + * *parent*'s OpList. + * @param parent the parent to copy into + * @param it an iterator into the parent's OpList representing the operand that will get + * replaced + */ + void copy_replace(Expression* parent, OpList::iterator it) const; + + /** + * Whether this Expression's operands are all Expression + * @return true if this Expression's operands are all Expression + */ + virtual bool has_only_expression_operands() = 0; + + // Methods inherited from Value + void print() override = 0; + +protected: + /** + * All expressions can be inverted, have a parent (nullptr for top level), + * and have 0 or more operands + */ + bool m_inverted; + Expression* m_parent; + std::list> m_operands; + + // Copy Semantic is create shallow copy with parent pointing to null + Expression(Expression const&); + + explicit Expression(bool inverted, Expression* parent = nullptr); +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_EXPRESSION_HPP diff --git a/components/core/src/clp_s/search/FilterExpr.cpp b/components/core/src/clp_s/search/FilterExpr.cpp new file mode 100644 index 000000000..55c62733c --- /dev/null +++ b/components/core/src/clp_s/search/FilterExpr.cpp @@ -0,0 +1,108 @@ +#include "FilterExpr.hpp" + +namespace clp_s::search { +FilterExpr::FilterExpr( + std::shared_ptr const& column, + FilterOperation op, + bool inverted, + Expression* parent +) + : Expression(inverted, parent) { + m_op = op; + add_operand(std::static_pointer_cast(column)); +} + +FilterExpr::FilterExpr(FilterExpr const& expr) : Expression(expr) { + m_op = expr.m_op; +} + +std::string FilterExpr::op_type_str(FilterOperation op) { + switch (op) { + case FilterOperation::EXISTS: + return "EXISTS"; + case FilterOperation::NEXISTS: + return "NEXISTS"; + case FilterOperation::EQ: + return "EQ"; + case FilterOperation::NEQ: + return "NEQ"; + case FilterOperation::LT: + return "LT"; + case
FilterOperation::GT: + return "GT"; + case FilterOperation::LTE: + return "LTE"; + case FilterOperation::GTE: + return "GTE"; + default: + return "UNKNOWN"; + } +} + +void FilterExpr::print() { + auto& os = get_print_stream(); + if (is_inverted()) { + os << "!"; + } + + os << "FilterExpr("; + os << op_type_str(m_op); + for (auto it = op_begin(); it != op_end(); it++) { + os << ", "; + (*it)->print(); + } + os << ")"; + + if (get_parent() == nullptr) { + os << std::endl; + } else { + os << std::flush; + } +} + +std::shared_ptr FilterExpr::create( + std::shared_ptr& column, + FilterOperation op, + bool inverted, + Expression* parent +) { + return std::shared_ptr( + static_cast(new FilterExpr(column->copy(), op, inverted, parent)) + ); +} + +std::shared_ptr FilterExpr::create( + std::shared_ptr& column, + FilterOperation op, + std::shared_ptr& operand, + bool inverted, + Expression* parent +) { + std::shared_ptr expr( + static_cast(new FilterExpr(column->copy(), op, inverted, parent)) + ); + expr->add_operand(operand); + return expr; +} + +std::shared_ptr FilterExpr::copy() const { + // Only deep copy column descriptors + auto new_filter = std::shared_ptr(static_cast(new FilterExpr(*this))); + for (auto it = new_filter->op_begin(); it != new_filter->op_end(); it++) { + if (auto descriptor = std::dynamic_pointer_cast(*it)) { + *it = descriptor->copy(); + } + } + return new_filter; +} + +std::shared_ptr FilterExpr::get_operand() { + auto it = op_begin(); + it++; + if (it == op_end()) { + return nullptr; + } else { + return std::static_pointer_cast(*it); + } +} +} // namespace clp_s::search diff --git a/components/core/src/clp_s/search/FilterExpr.hpp b/components/core/src/clp_s/search/FilterExpr.hpp new file mode 100644 index 000000000..706a375c6 --- /dev/null +++ b/components/core/src/clp_s/search/FilterExpr.hpp @@ -0,0 +1,100 @@ +#ifndef CLP_S_SEARCH_FILTEREXPR_HPP +#define CLP_S_SEARCH_FILTEREXPR_HPP + +#include + +#include "ColumnDescriptor.hpp" +#include 
"Expression.hpp" +#include "FilterOperation.hpp" +#include "Literal.hpp" + +namespace clp_s::search { +/** + * Class for simple filter conditions in the AST. Consists of a column, + * a filtering operation, and usually a literal. + * + * Conventionally the OpList contains a ColumnExpr followed by some Literal. I.e. a FilterExpr + * always has a ColumnExpr, but may not have a Literal. + */ +class FilterExpr : public Expression { +public: + /** + * @return FilterOperation this Filter performs + */ + FilterOperation get_operation() { return m_op; } + + /** + * @return The Column this Filter acts on + */ + std::shared_ptr get_column() { + return std::static_pointer_cast(*op_begin()); + } + + /** + * @return This Filter's Literal or nullptr if there is no Literal + */ + std::shared_ptr get_operand(); + + /** + * Create a Filter expression with a Column and FilterOperation but no Literal + * Literal can be added later using mutators provided by the Expression parent class + * @param column the Column this Filter acts on + * @param op the Operation this Filter uses to Filter the Column + * @param inverted expression is inverted when true + * @param parent parent this expression is attached to + * @return Newly created Filter expression + */ + static std::shared_ptr create( + std::shared_ptr& column, + FilterOperation op, + bool inverted = false, + Expression* parent = nullptr + ); + + /** + * Create a Filter expression with a Column, FilterOperation and Literal + * @param column the Column this Filter acts on + * @param op the Operation this Filter uses to Filter the Column + * @param inverted expression is inverted when true + * @param parent parent this expression is attached to + * @return newly created Filter expression + */ + static std::shared_ptr create( + std::shared_ptr& column, + FilterOperation op, + std::shared_ptr& operand, + bool inverted = false, + Expression* parent = nullptr + ); + + /** + * Helper function to turn FilterOperation into string for printing + *
@param op the operation we want to convert to string + * @return a string representing the operation + */ + static std::string op_type_str(FilterOperation op); + + // Methods inherited from Value + void print() override; + + // Methods inherited from Expression + bool has_only_expression_operands() override { return false; } + + std::shared_ptr copy() const override; + +private: + FilterOperation m_op; + + // Constructor + FilterExpr( + std::shared_ptr const& column, + FilterOperation op, + bool inverted = false, + Expression* parent = nullptr + ); + + FilterExpr(FilterExpr const&); +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_FILTEREXPR_HPP diff --git a/components/core/src/clp_s/search/FilterOperation.hpp b/components/core/src/clp_s/search/FilterOperation.hpp new file mode 100644 index 000000000..e484e7098 --- /dev/null +++ b/components/core/src/clp_s/search/FilterOperation.hpp @@ -0,0 +1,20 @@ +#ifndef CLP_S_SEARCH_FILTEROPERATION_HPP +#define CLP_S_SEARCH_FILTEROPERATION_HPP + +namespace clp_s::search { +/** + * Enum describing all supported filtering operations in the search AST + */ +enum FilterOperation { + EXISTS, + NEXISTS, + EQ, + NEQ, + LT, + GT, + LTE, + GTE +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_FILTEROPERATION_HPP diff --git a/components/core/src/clp_s/search/Integral.cpp b/components/core/src/clp_s/search/Integral.cpp new file mode 100644 index 000000000..459ac9dc0 --- /dev/null +++ b/components/core/src/clp_s/search/Integral.cpp @@ -0,0 +1,96 @@ +#include "Integral.hpp" + +#include + +#include "SearchUtils.hpp" + +namespace clp_s::search { +Integral::Integral(double v) : m_v(v) {} + +Integral::Integral(int64_t v) : m_v(v) {} + +std::shared_ptr Integral::create_from_float(double v) { + return std::shared_ptr(static_cast(new Integral(v))); +} + +std::shared_ptr Integral::create_from_int(int64_t v) { + return std::shared_ptr(static_cast(new Integral(v))); +} + +std::shared_ptr Integral::create_from_string(std::string 
const& v) { + Integral* ret = nullptr; + int64_t tmpint; + std::istringstream ss(v); + ss >> std::noskipws >> tmpint; + if (false == ss.fail() && ss.eof()) { + ret = new Integral(tmpint); + ret->m_vstr = v; + return std::shared_ptr(static_cast(ret)); + } + + double tmpdouble; + ss = std::istringstream(v); + ss >> std::noskipws >> tmpdouble; + if (false == ss.fail() && ss.eof()) { + ret = new Integral(tmpdouble); + ret->m_vstr = v; + return std::shared_ptr(static_cast(ret)); + } + return std::shared_ptr(static_cast(ret)); +} + +void Integral::print() { + auto& os = get_print_stream(); + if (false == m_vstr.empty()) { + os << m_vstr; + } else if (std::holds_alternative(m_v)) { + os << std::get(m_v); + } else { + os << std::get(m_v); + } +} + +Integral64 Integral::get() { + return m_v; +} + +bool Integral::as_var_string(std::string& ret, FilterOperation op) { + if (op == FilterOperation::LT || op == FilterOperation::GT || op == FilterOperation::LTE + || op == FilterOperation::GTE) + { + return false; + } + if (false == m_vstr.empty()) { + ret = m_vstr; + } else { + std::ostringstream ss; + if (std::holds_alternative(m_v)) { + ss << std::get(m_v); + } else { + ss << std::get(m_v); + } + m_vstr = ss.str(); + ret = m_vstr; + } + return true; +} + +bool Integral::as_float(double& ret, FilterOperation op) { + if (std::holds_alternative(m_v)) { + ret = std::get(m_v); + } else { + ret = std::get(m_v); + } + return true; +} + +bool Integral::as_int(int64_t& ret, FilterOperation op) { + if (std::holds_alternative(m_v)) { + double tmp = std::get(m_v); + return double_as_int(tmp, op, ret); + } else { + ret = std::get(m_v); + } + return true; +} +} // namespace clp_s::search diff --git a/components/core/src/clp_s/search/Integral.hpp b/components/core/src/clp_s/search/Integral.hpp new file mode 100644 index 000000000..eb619deed --- /dev/null +++ b/components/core/src/clp_s/search/Integral.hpp @@ -0,0 +1,84 @@ +#ifndef CLP_S_SEARCH_INTEGRAL_HPP +#define CLP_S_SEARCH_INTEGRAL_HPP + 
+#include +#include +#include + +#include "Literal.hpp" + +namespace clp_s::search { +typedef std::variant Integral64; + +// FIXME: figure out why String types are part of this bitmask +constexpr LiteralTypeBitmask cIntegralLiteralTypes = cIntegralTypes | VarStringT; + +/** + * Class for Integral values (float/int) in the search AST + */ +class Integral : public Literal { +public: + // Deleted copy + Integral(Integral const&) = delete; + + Integral& operator=(Integral const&) = delete; + + /** + * Create an Integral literal from a double value + * @param v the value + * @return an Integral literal + */ + static std::shared_ptr create_from_float(double v); + + /** + * Create an Integral literal from an integer value + * @param v the value + * @return an Integral literal + */ + static std::shared_ptr create_from_int(int64_t v); + + /** + * Try to create an integral literal from a string + * @param v the string we are attempting to convert to Integral + * @return an Integral literal, or nullptr if the string does not represent an integral + */ + static std::shared_ptr create_from_string(std::string const& v); + + /** + * Return the underlying integral value + * @return the underlying integral value + */ + Integral64 get(); + + // Methods inherited from Value + void print() override; + + // Methods inherited from Literal + bool matches_type(LiteralType type) override { return type & cIntegralLiteralTypes; } + + bool matches_any(LiteralTypeBitmask mask) override { return mask & cIntegralLiteralTypes; } + + bool matches_exactly(LiteralTypeBitmask mask) override { return mask == cIntegralLiteralTypes; } + + bool as_epoch_date() override { return true; } + + bool as_float_date() override { return true; } + + bool as_var_string(std::string& ret, FilterOperation op) override; + + bool as_float(double& ret, FilterOperation op) override; + + bool as_int(int64_t& ret, FilterOperation op) override; + +protected: + Integral64 m_v; + std::string m_vstr; // original string
representation if created from string + + // Constructors + explicit Integral(double v); + + explicit Integral(int64_t v); +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_INTEGRAL_HPP diff --git a/components/core/src/clp_s/search/Literal.hpp b/components/core/src/clp_s/search/Literal.hpp new file mode 100644 index 000000000..5e06e2a49 --- /dev/null +++ b/components/core/src/clp_s/search/Literal.hpp @@ -0,0 +1,115 @@ +#ifndef CLP_S_SEARCH_LITERAL_HPP +#define CLP_S_SEARCH_LITERAL_HPP + +#include + +#include "FilterOperation.hpp" +#include "Value.hpp" + +namespace clp_s::search { +/** + * An enum representing all of the Literal types that can show up in the AST. + */ +enum LiteralType : uint32_t { + TypesBegin = 1, + IntegerT = 1, + FloatT = 1 << 1, + ClpStringT = 1 << 2, + VarStringT = 1 << 3, + BooleanT = 1 << 4, + ArrayT = 1 << 5, + NullT = 1 << 6, + EpochDateT = 1 << 7, + FloatDateT = 1 << 8, + TypesEnd = 1 << 9, + UnknownT = ((uint32_t)1) << 31 +}; + +typedef uint32_t LiteralTypeBitmask; + +constexpr LiteralTypeBitmask cIntegralTypes = LiteralType::IntegerT | LiteralType::FloatT; +constexpr LiteralTypeBitmask cAllTypes = TypesEnd - 1; + +/** + * Parent class for all Literals in the AST. + */ +class Literal : public Value { +public: + /** + * Literals are considered to have 1 operand. + * @return 1 + */ + unsigned get_num_operands() override { return 1; } + + /** + * Strict checks for type matching against a given literal type. + * @return true if the check succeeds + */ + virtual bool matches_type(LiteralType type) = 0; + + virtual bool matches_any(LiteralTypeBitmask mask) = 0; + + virtual bool matches_exactly(LiteralTypeBitmask mask) = 0; + + /** + * Convert LiteralType enum values to strings . Only used for printing. 
+ * @param type the enum value being turned into a string + * @return A string representing the enum value + */ + static std::string type_to_string(LiteralType type) { + switch (type) { + case LiteralType::IntegerT: + return "int"; + case LiteralType::FloatT: + return "float"; + case LiteralType::ClpStringT: + return "clpstring"; + case LiteralType::VarStringT: + return "varstring"; + case LiteralType::BooleanT: + return "bool"; + case LiteralType::ArrayT: + return "array"; + case LiteralType::NullT: + return "null"; + case LiteralType::EpochDateT: + return "epochdate"; + case LiteralType::FloatDateT: + return "floatdate"; + default: + return "errtype"; + } + } + + /** + * Functions to check type conversion and cast when possible under a given filter operation. + * By default all casts fail until overridden by the derived literal types. + * @param ret the casted value + * @param op the FilterOperation operating on the Literal + * @return true if cast is successful + */ + virtual bool as_clp_string(std::string& ret, FilterOperation op) { return false; } + + virtual bool as_var_string(std::string& ret, FilterOperation op) { return false; } + + virtual bool as_float(double& ret, FilterOperation op) { return false; } + + virtual bool as_int(int64_t& ret, FilterOperation op) { return false; } + + virtual bool as_bool(bool& ret, FilterOperation op) { return false; } + + virtual bool as_null(FilterOperation op) { return false; } + + inline bool as_array(std::string& ret, FilterOperation op) { + return as_var_string(ret, op) || as_clp_string(ret, op); + } + + virtual bool as_epoch_date() { return false; } + + virtual bool as_float_date() { return false; } + + virtual bool as_any(FilterOperation op) { return false; } +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_LITERAL_HPP diff --git a/components/core/src/clp_s/search/NarrowTypes.cpp b/components/core/src/clp_s/search/NarrowTypes.cpp new file mode 100644 index 000000000..82b8e7e5b --- /dev/null +++
b/components/core/src/clp_s/search/NarrowTypes.cpp @@ -0,0 +1,76 @@ +#include "NarrowTypes.hpp" + +#include "ConstantProp.hpp" +#include "EmptyExpr.hpp" +#include "FilterExpr.hpp" +#include "Literal.hpp" + +namespace clp_s::search { +std::shared_ptr NarrowTypes::run(std::shared_ptr& expr) { + expr = narrow(expr); + + ConstantProp constant_prop; + return constant_prop.run(expr); +} + +std::shared_ptr NarrowTypes::narrow(std::shared_ptr cur) { + if (cur->has_only_expression_operands()) { + for (auto it = cur->op_begin(); it != cur->op_end(); it++) { + auto child = std::static_pointer_cast(*it); + auto new_child = narrow(child); + if (new_child != child) { + new_child->copy_replace(cur.get(), it); + } + } + } else if (auto filter = std::dynamic_pointer_cast(cur)) { + // TODO: will have to change if we start supporting multi column expressions + auto column = filter->get_column(); + auto op = filter->get_operation(); + + if (op == FilterOperation::EXISTS || op == FilterOperation::NEXISTS) { + return cur; + } + + auto literal = filter->get_operand(); + std::string tmpstring; + int64_t tmpint; + double tmpdouble; + bool tmpbool; + + if (false == literal->as_any(op)) { + if (false == literal->as_clp_string(tmpstring, op)) { + column->remove_matching_type(LiteralType::ClpStringT); + } + if (false == literal->as_var_string(tmpstring, op)) { + column->remove_matching_type(LiteralType::VarStringT); + } + if (false == literal->as_int(tmpint, op)) { + column->remove_matching_type(LiteralType::IntegerT); + } + if (false == literal->as_float(tmpdouble, op)) { + column->remove_matching_type(LiteralType::FloatT); + } + if (false == literal->as_bool(tmpbool, op)) { + column->remove_matching_type(LiteralType::BooleanT); + } + if (false == literal->as_array(tmpstring, op)) { + column->remove_matching_type(LiteralType::ArrayT); + } + if (false == literal->as_null(op)) { + column->remove_matching_type(LiteralType::NullT); + } + if (false == literal->as_epoch_date()) { + 
column->remove_matching_type(LiteralType::EpochDateT); + } + if (false == literal->as_float_date()) { + column->remove_matching_type(LiteralType::FloatDateT); + } + } + + if (false == column->matches_any(cAllTypes)) { + return EmptyExpr::create(); + } + } + return cur; +} +} // namespace clp_s::search diff --git a/components/core/src/clp_s/search/NarrowTypes.hpp b/components/core/src/clp_s/search/NarrowTypes.hpp new file mode 100644 index 000000000..8504d6e02 --- /dev/null +++ b/components/core/src/clp_s/search/NarrowTypes.hpp @@ -0,0 +1,22 @@ +#ifndef CLP_S_SEARCH_NARROWTYPES_HPP +#define CLP_S_SEARCH_NARROWTYPES_HPP + +#include "Transformation.hpp" + +namespace clp_s::search { +class NarrowTypes : public Transformation { +public: + // Methods inherited from Transformation + std::shared_ptr run(std::shared_ptr& expr) override; + +private: + /** + * Narrow the type of an expression + * @param cur the expression to narrow + * @return the narrowed expression + */ + static std::shared_ptr narrow(std::shared_ptr cur); +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_NARROWTYPES_HPP diff --git a/components/core/src/clp_s/search/NullLiteral.cpp b/components/core/src/clp_s/search/NullLiteral.cpp new file mode 100644 index 000000000..9a7b51ffc --- /dev/null +++ b/components/core/src/clp_s/search/NullLiteral.cpp @@ -0,0 +1,32 @@ +#include "NullLiteral.hpp" + +namespace clp_s::search { +std::shared_ptr NullLiteral::create() { + return std::shared_ptr(new NullLiteral()); +} + +std::shared_ptr NullLiteral::create_from_string(std::string const& v) { + if (v == "null") { + return std::shared_ptr(new NullLiteral()); + } + + return {nullptr}; +} + +void NullLiteral::print() { + get_print_stream() << "null"; +} + +bool NullLiteral::as_var_string(std::string& ret, FilterOperation op) { + if (op == FilterOperation::EQ || op == FilterOperation::NEQ) { + ret = "null"; + return true; + } + + return false; +} + +bool NullLiteral::as_null(FilterOperation op) { + return op ==
FilterOperation::EQ || op == FilterOperation::NEQ; +} +} // namespace clp_s::search diff --git a/components/core/src/clp_s/search/NullLiteral.hpp b/components/core/src/clp_s/search/NullLiteral.hpp new file mode 100644 index 000000000..072529e48 --- /dev/null +++ b/components/core/src/clp_s/search/NullLiteral.hpp @@ -0,0 +1,54 @@ +#ifndef CLP_S_SEARCH_NULLLITERAL_HPP +#define CLP_S_SEARCH_NULLLITERAL_HPP + +#include +#include +#include + +#include "Literal.hpp" + +namespace clp_s::search { +/** + * Class for Null literals in the search AST + */ +class NullLiteral : public Literal { +public: + // Deleted copy + NullLiteral(NullLiteral const&) = delete; + + NullLiteral& operator=(NullLiteral const&) = delete; + + /** + * Explicit create a null literal + * @return A newly created null literal + */ + static std::shared_ptr create(); + + /** + * Try to create a null literal from a string + * @param v the string we are attempting to convert to Null + * @return A null literal, or nullptr if the string does not represent "null" + */ + static std::shared_ptr create_from_string(std::string const& v); + + // Methods inherited from Value + void print() override; + + // Methods inherited from Literal + bool matches_type(LiteralType type) override { return type & LiteralType::NullT; } + + bool matches_any(LiteralTypeBitmask mask) override { return mask & LiteralType::NullT; } + + bool matches_exactly(LiteralTypeBitmask mask) override { return mask == LiteralType::NullT; } + + bool as_var_string(std::string& ret, FilterOperation op) override; + + bool as_null(FilterOperation op) override; + +private: + // Constructor + NullLiteral() = default; +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_NULLLITERAL_HPP diff --git a/components/core/src/clp_s/search/OrExpr.cpp b/components/core/src/clp_s/search/OrExpr.cpp new file mode 100644 index 000000000..e327710c8 --- /dev/null +++ b/components/core/src/clp_s/search/OrExpr.cpp @@ -0,0 +1,55 @@ +#include "OrExpr.hpp" + +namespace 
clp_s::search { +OrExpr::OrExpr(bool inverted, Expression* parent) : Expression(inverted, parent) {} + +OrExpr::OrExpr(OrExpr const& expr) : Expression(expr) {} + +void OrExpr::print() { + auto& os = get_print_stream(); + if (is_inverted()) { + os << "!"; + } + + os << "OrExpr("; + for (auto it = op_begin(); it != op_end();) { + (*it)->print(); + it++; + if (it != op_end()) { + os << ", "; + } + } + os << ")"; + + if (get_parent() == nullptr) { + os << std::endl; + } else { + os << std::flush; + } +} + +std::shared_ptr OrExpr::copy() const { + auto new_expr = std::shared_ptr(new OrExpr(*this)); + for (auto it = new_expr->op_begin(); it != new_expr->op_end(); it++) { + auto expr = std::static_pointer_cast(*it); + expr->copy_replace(new_expr.get(), it); + } + return new_expr; +} + +std::shared_ptr OrExpr::create(bool inverted, Expression* parent) { + return std::shared_ptr(static_cast(new OrExpr(inverted, parent))); +} + +std::shared_ptr OrExpr::create( + std::shared_ptr& op1, + std::shared_ptr& op2, + bool inverted, + Expression* parent +) { + std::shared_ptr expr(static_cast(new OrExpr(inverted, parent))); + op1->copy_append(expr.get()); + op2->copy_append(expr.get()); + return expr; +} +} // namespace clp_s::search diff --git a/components/core/src/clp_s/search/OrExpr.hpp b/components/core/src/clp_s/search/OrExpr.hpp new file mode 100644 index 000000000..8e95cf24f --- /dev/null +++ b/components/core/src/clp_s/search/OrExpr.hpp @@ -0,0 +1,53 @@ +#ifndef CLP_S_SEARCH_OREXPR_HPP +#define CLP_S_SEARCH_OREXPR_HPP + +#include "Expression.hpp" + +namespace clp_s::search { +/** + * Class representing a logical Or operation across all + * children in its OpList. Can have arbitrarily many children. + */ +class OrExpr : public Expression { +public: + /** + * Create an empty Or expression which can optionally be inverted and attached to a parent + * Children can be added via mutators inherited from Expression. 
+ * @param inverted expression is inverted when true + * @param parent parent this expression is attached to + * @return A newly created Or expression + */ + static std::shared_ptr create(bool inverted = false, Expression* parent = nullptr); + + /** + * Create an Or expression with two children + * @param op1 the first child operand + * @param op2 the second child operand + * @param inverted expression is inverted when true + * @param parent parent this expression is attached to + * @return A newly created Or expression + */ + static std::shared_ptr create( + std::shared_ptr& op1, + std::shared_ptr& op2, + bool inverted = false, + Expression* parent = nullptr + ); + + // Methods inherited from Value + void print() override; + + // Methods inherited from Expression + bool has_only_expression_operands() override { return true; } + + std::shared_ptr copy() const override; + +private: + // Constructor + explicit OrExpr(bool inverted = false, Expression* parent = nullptr); + + OrExpr(OrExpr const&); +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_OREXPR_HPP diff --git a/components/core/src/clp_s/search/OrOfAndForm.cpp b/components/core/src/clp_s/search/OrOfAndForm.cpp new file mode 100644 index 000000000..7a9ae906f --- /dev/null +++ b/components/core/src/clp_s/search/OrOfAndForm.cpp @@ -0,0 +1,179 @@ +#include "OrOfAndForm.hpp" + +#include + +#include "SearchUtils.hpp" + +namespace clp_s::search { +std::shared_ptr OrOfAndForm::run(std::shared_ptr& expr) { + auto parent = expr->get_parent(); + while (expr->get_num_operands() == 1 && expr->has_only_expression_operands()) { + bool invert = expr->is_inverted(); + expr = std::static_pointer_cast(*expr->op_begin()); + expr->set_parent(parent); + if (invert) { + expr->invert(); + } + } + + if (expr->is_inverted()) { + de_morgan(expr); + } + + // only need to further simplify and/or expressions + if (false == expr->has_only_expression_operands()) { + return expr; + } + + return simplify(expr); +} + +void 
OrOfAndForm::de_morgan(std::shared_ptr& expr) { + std::shared_ptr new_expr; + + if (std::dynamic_pointer_cast(expr)) { + new_expr = OrExpr::create(!expr->is_inverted(), expr->get_parent()); + } else if (std::dynamic_pointer_cast(expr)) { + new_expr = AndExpr::create(!expr->is_inverted(), expr->get_parent()); + } else { + // DeMorgan's doesn't apply; no modification required + return; + } + + new_expr->get_op_list().splice(new_expr->op_end(), expr->get_op_list()); + for (auto it = new_expr->op_begin(); it != new_expr->op_end(); it++) { + auto sub_expr = std::static_pointer_cast(*it); + sub_expr->set_parent(new_expr.get()); + sub_expr->invert(); + } + + expr = new_expr; +} + +std::shared_ptr OrOfAndForm::simplify(std::shared_ptr const& expr) { + for (auto it = expr->op_begin(); it != expr->op_end(); it++) { + auto sub_expr = std::static_pointer_cast(*it); + if (sub_expr->is_inverted()) { + // DeMorgan's already makes checks that input is Or or And so don't + // need to double check here + de_morgan(sub_expr); + *it = sub_expr; + } + + while (sub_expr->get_num_operands() == 1 && sub_expr->has_only_expression_operands()) { + bool invert = sub_expr->is_inverted(); + sub_expr = std::static_pointer_cast(*sub_expr->op_begin()); + sub_expr->set_parent(expr.get()); + *it = sub_expr; + if (invert) { + sub_expr->invert(); + } + } + + // Only need to simplify Or/And subexpr + if (sub_expr->has_only_expression_operands()) { + *it = simplify(sub_expr); + } + } + + if (std::dynamic_pointer_cast(expr)) { + return simplify_or(expr); + } else if (std::dynamic_pointer_cast(expr)) { + return simplify_and(expr); + } else { + // currently and/or are the only form of expressions we need to simplify + return expr; + } +} + +std::shared_ptr OrOfAndForm::simplify_or(std::shared_ptr const& expr) { + std::vector deleted; + + for (auto it = expr->op_begin(); it != expr->op_end(); it++) { + if (std::dynamic_pointer_cast(*it)) { + auto sub_expr = std::static_pointer_cast(*it); + 
deleted.push_back(it); + splice_into(expr, sub_expr, expr->op_begin()); + } + } + + for (auto const& it : deleted) { + expr->get_op_list().erase(it); + } + + return expr; +} + +std::shared_ptr OrOfAndForm::simplify_and(std::shared_ptr const& expr) { + std::vector deleted; + std::vector deleted_or_expr; + std::vector> or_expressions; + + for (auto it = expr->op_begin(); it != expr->op_end(); it++) { + if (std::dynamic_pointer_cast(*it)) { + auto sub_expr = std::static_pointer_cast(*it); + deleted.push_back(it); + splice_into(expr, sub_expr, expr->op_begin()); + } else if (std::dynamic_pointer_cast(*it)) { + deleted_or_expr.push_back(it); + } + } + + for (auto const& it : deleted) { + expr->get_op_list().erase(it); + } + + if (deleted_or_expr.empty()) { + return expr; + } + + for (auto const& it : deleted_or_expr) { + or_expressions.push_back(std::static_pointer_cast(*it)); + expr->get_op_list().erase(it); + } + + auto new_or_expr = OrExpr::create(false, expr->get_parent()); + ExpressionList prefix; + insert_all_combinations( + new_or_expr, + expr, + or_expressions.begin(), + or_expressions.end(), + prefix + ); + + return new_or_expr; +} + +void OrOfAndForm::insert_all_combinations( + std::shared_ptr const& new_or_expr, + std::shared_ptr const& base_and_expr, + ExpressionVector::iterator cur, + ExpressionVector::iterator end, + ExpressionList& prefix +) { + if (cur == end) { + auto new_and_expr = base_and_expr->copy(); + for (auto const& it : prefix) { + // these OrExpr are guaranteed to contain only FilterExpr/AndExpr + if (std::dynamic_pointer_cast(it)) { + splice_into(new_and_expr, it->copy(), new_and_expr->op_end()); + } else { + it->copy_append(new_and_expr.get()); + } + } + new_or_expr->add_operand(new_and_expr); + return; + } + + auto current_or = *cur; + cur++; + for (auto it = current_or->op_begin(); it != current_or->op_end(); it++) { + prefix.push_back(std::static_pointer_cast(*it)); + auto cur_copy = cur; + cur_copy++; + 
insert_all_combinations(new_or_expr, base_and_expr, cur, end, prefix); + prefix.pop_back(); + } +} +} // namespace clp_s::search diff --git a/components/core/src/clp_s/search/OrOfAndForm.hpp b/components/core/src/clp_s/search/OrOfAndForm.hpp new file mode 100644 index 000000000..7a400eb3f --- /dev/null +++ b/components/core/src/clp_s/search/OrOfAndForm.hpp @@ -0,0 +1,66 @@ +#ifndef CLP_S_SEARCH_OROFANDFORM_HPP +#define CLP_S_SEARCH_OROFANDFORM_HPP + +#include + +#include "AndExpr.hpp" +#include "OrExpr.hpp" +#include "Transformation.hpp" + +namespace clp_s::search { +typedef std::vector> ExpressionVector; +typedef std::list> ExpressionList; + +// TODO: handle degenerate forms like empty or/and expressions +class OrOfAndForm : public Transformation { +public: + // Methods inherited from Transformation + std::shared_ptr run(std::shared_ptr& expr) override; + +private: + /** + * Use De Morgan's laws to convert the expression to a canonical form + * @param expr + */ + static void de_morgan(std::shared_ptr& expr); + + /** + * Simplify an expression + * @param expr + * @return The simplified expression + */ + static std::shared_ptr simplify(std::shared_ptr const& expr); + + /** + * Simplify an Or expression + * @param expr + * @return The simplified expression + */ + static std::shared_ptr simplify_or(std::shared_ptr const& expr); + + /** + * Simplify an And expression + * @param expr + * @return The simplified expression + */ + static std::shared_ptr simplify_and(std::shared_ptr const& expr); + + /** + * Insert all combinations of And expressions into an Or expression + * @param new_or_expr + * @param base_and_expr + * @param cur + * @param end + * @param prefix + */ + static void insert_all_combinations( + std::shared_ptr const& new_or_expr, + std::shared_ptr const& base_and_expr, + ExpressionVector::iterator cur, + ExpressionVector::iterator end, + ExpressionList& prefix + ); +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_OROFANDFORM_HPP diff --git 
a/components/core/src/clp_s/search/Output.cpp b/components/core/src/clp_s/search/Output.cpp new file mode 100644 index 000000000..37c89eccd --- /dev/null +++ b/components/core/src/clp_s/search/Output.cpp @@ -0,0 +1,1182 @@ +#include "Output.hpp" + +#include +#include + +#include + +#include "../FileWriter.hpp" +#include "../ReaderUtils.hpp" +#include "../Utils.hpp" +#include "AndExpr.hpp" +#include "clp_search/EncodedVariableInterpreter.hpp" +#include "clp_search/Grep.hpp" +#include "EvaluateTimestampIndex.hpp" +#include "FilterExpr.hpp" +#include "OrExpr.hpp" +#include "SearchUtils.hpp" + +using json = nlohmann::json; + +#define eval(op, a, b) (((op) == FilterOperation::EQ) ? ((a) == (b)) : ((a) != (b))) + +namespace clp_s::search { +void Output::filter() { + auto top_level_expr = m_expr; + + for (auto const& archive : ReaderUtils::get_archives(m_archives_dir)) { + std::vector matched_schemas; + bool has_array = false; + bool has_array_search = false; + for (int32_t schema_id : ReaderUtils::get_schemas(archive)) { + if (m_match.schema_matched(schema_id)) { + matched_schemas.push_back(schema_id); + if (m_match.has_array(schema_id)) { + has_array = true; + } + if (m_match.has_array_search(schema_id)) { + has_array_search = true; + } + } + } + + // Skip decompressing segment if it contains no + // relevant schemas + if (matched_schemas.empty()) { + continue; + } + + // Skip decompressing sub-archive if it won't match based on the timestamp + // range index + EvaluateTimestampIndex timestamp_index(ReaderUtils::read_local_timestamp_dictionary(archive) + ); + if (timestamp_index.run(top_level_expr) == EvaluatedValue::False) { + continue; + } + + m_var_dict = ReaderUtils::get_variable_dictionary_reader(archive); + m_log_dict = ReaderUtils::get_log_type_dictionary_reader(archive); + // array_dict_ = GetArrayDictionaryReader(archive); + m_var_dict->read_new_entries(); + m_log_dict->read_new_entries(); + + if (has_array) { + m_array_dict = 
ReaderUtils::get_array_dictionary_reader(archive); + if (has_array_search) { + m_array_dict->read_new_entries(); + } else { + m_array_dict->read_new_entries(true); + } + } + + m_string_query_map.clear(); + m_string_var_match_map.clear(); + populate_string_queries(top_level_expr); + + std::string message; + for (int32_t schema_id : matched_schemas) { + m_expr_clp_query.clear(); + m_expr_var_match_map.clear(); + m_expr = m_match.get_query_for_schema(schema_id)->copy(); + m_wildcard_to_searched_columns.clear(); + m_wildcard_to_searched_clpstrings.clear(); + m_wildcard_to_searched_varstrings.clear(); + m_wildcard_to_searched_datestrings.clear(); + m_wildcard_to_searched_floatdatestrings.clear(); + m_schema = schema_id; + + populate_searched_wildcard_columns(m_expr); + + m_expression_value = constant_propagate(m_expr, schema_id); + + if (m_expression_value == EvaluatedValue::False) { + continue; + } + + add_wildcard_columns_to_searched_columns(); + + SchemaReader reader(m_schema_tree, schema_id); + reader.open(archive + "/encoded_messages/" + std::to_string(schema_id)); + ReaderUtils::append_reader_columns( + &reader, + (*m_schemas)[schema_id], + m_schema_tree, + m_var_dict, + m_log_dict, + m_array_dict, + m_timestamp_dict + ); + reader.load(); + + reader.initialize_filter(this); + while (reader.get_next_message(message, this)) { + write(STDOUT_FILENO, message.c_str(), message.length()); + } + reader.close(); + } + + m_var_dict->close(); + m_log_dict->close(); + + if (has_array) { + m_array_dict->close(); + } + } +} + +void Output::init( + SchemaReader* reader, + int32_t schema_id, + std::unordered_map& columns +) { + m_reader = reader; + m_schema = schema_id; + + m_searched_columns.clear(); + m_other_columns.clear(); + + for (auto& column : columns) { + ClpStringColumnReader* clp_reader = dynamic_cast(column.second); + VariableStringColumnReader* var_reader + = dynamic_cast(column.second); + if (m_match.schema_searches_against_column(schema_id, column.first)) { + if 
(clp_reader != nullptr && clp_reader->get_type() == "string") { + m_clp_string_readers[column.first] = clp_reader; + m_other_columns.push_back(column.second); + } else if (var_reader != nullptr && var_reader->get_type() == "string") { + m_var_string_readers[column.first] = var_reader; + m_other_columns.push_back(column.second); + } else if (auto date_column_reader = dynamic_cast(column.second)) + { + m_datestring_readers[column.first] = date_column_reader; + m_other_columns.push_back(column.second); + } else if (auto float_date_column_reader = dynamic_cast(column.second)) + { + m_floatdatestring_readers[column.first] = float_date_column_reader; + m_other_columns.push_back(column.second); + } else { + m_searched_columns.push_back(column.second); + } + } else { + m_other_columns.push_back(column.second); + } + } +} + +bool Output::filter( + uint64_t cur_message, + std::map>& extracted_values +) { + m_cur_message = cur_message; + m_cached_string_columns.clear(); + for (auto* column : m_searched_columns) { + extracted_values[column->get_id()] = column->extract_value(cur_message); + } + + // filter + if (false == evaluate(m_expr.get(), m_schema, extracted_values)) { + return false; + } + + for (auto* column : m_other_columns) { + if (m_cached_string_columns.find(column->get_id()) == m_cached_string_columns.end()) { + extracted_values[column->get_id()] = column->extract_value(cur_message); + } + } + + return true; +} + +enum CurExpr { + AND, + OR, + FILTER +}; + +bool Output::evaluate( + Expression* expr, + int32_t schema, + std::map>& extracted_values +) { + if (m_expression_value == EvaluatedValue::True) { + return true; + } + + std::stack> parent_type; + std::stack> parent_it; + + Expression* cur = expr; + CurExpr cur_type = CurExpr::FILTER; + bool ret = false; + + if (dynamic_cast(cur)) { + cur_type = CurExpr::AND; + parent_type.push(CurExpr::AND); + parent_it.push(cur->op_begin()); + ret = true; + } else if (dynamic_cast(cur)) { + cur_type = CurExpr::OR; + 
parent_type.push(CurExpr::OR); + parent_it.push(cur->op_begin()); + ret = false; + } + + do { + switch (cur_type) { + case CurExpr::AND: + if (false == ret || parent_it.top() == cur->op_end()) { + parent_type.pop(); + parent_it.pop(); + break; + } else { + cur = static_cast((parent_it.top()++)->get()); + if (dynamic_cast(cur)) { + cur_type = CurExpr::FILTER; + } else { + // must be an OR-expr because AST would have been simplified + // to eliminate nested AND + cur_type = CurExpr::OR; + parent_type.push(CurExpr::OR); + parent_it.push(cur->op_begin()); + ret = false; + } + continue; + } + case CurExpr::FILTER: + if (static_cast(cur)->get_column()->is_pure_wildcard()) { + ret = evaluate_wildcard_filter( + static_cast(cur), + schema, + extracted_values + ); + } else { + ret = evaluate_filter(static_cast(cur), schema, extracted_values); + } + break; + case CurExpr::OR: + if (ret || parent_it.top() == cur->op_end()) { + parent_type.pop(); + parent_it.pop(); + break; + } else { + cur = static_cast((parent_it.top()++)->get()); + if (dynamic_cast(cur)) { + cur_type = CurExpr::FILTER; + } else { + // must be an AND-expr because AST would have been simplified + // to eliminate nested OR + cur_type = CurExpr::AND; + parent_type.push(CurExpr::AND); + parent_it.push(cur->op_begin()); + ret = true; + } + continue; + } + } + + ret = cur->is_inverted() ? 
!ret : ret; + if (false == parent_type.empty()) { + cur_type = parent_type.top(); + } + cur = cur->get_parent(); + } while (cur != nullptr); + + return ret; +} + +bool Output::evaluate_wildcard_filter( + FilterExpr* expr, + int32_t schema, + std::map>& extracted_values +) { + auto literal = expr->get_operand(); + auto* column = expr->get_column().get(); + Query* q = m_expr_clp_query[expr]; + std::unordered_set* matching_vars = m_expr_var_match_map[expr]; + auto op = expr->get_operation(); + for (int32_t column_id : m_wildcard_to_searched_clpstrings[column]) { + if (evaluate_clp_string_filter(op, q, column_id, literal, extracted_values)) { + return true; + } + } + + for (int32_t column_id : m_wildcard_to_searched_varstrings[column]) { + if (evaluate_var_string_filter(op, m_var_string_readers[column_id], matching_vars, literal)) + { + return true; + } + } + + for (int32_t column_id : m_wildcard_to_searched_datestrings[column]) { + if (evaluate_epoch_date_filter(op, m_datestring_readers[column_id], literal)) { + return true; + } + } + + for (int32_t column_id : m_wildcard_to_searched_floatdatestrings[column]) { + if (evaluate_float_date_filter(op, m_floatdatestring_readers[column_id], literal)) { + return true; + } + } + + m_maybe_number = expr->get_column()->matches_type(LiteralType::FloatT); + for (int32_t column_id : m_wildcard_to_searched_columns[column]) { + bool ret = false; + switch (node_to_literal_type(m_schema_tree->get_node(column_id)->get_type())) { + case LiteralType::IntegerT: + ret = evaluate_int_filter( + op, + std::get(extracted_values[column_id]), + literal + ); + break; + case LiteralType::FloatT: + ret = evaluate_float_filter( + op, + std::get(extracted_values[column_id]), + literal + ); + break; + case LiteralType::BooleanT: + ret = evaluate_bool_filter( + op, + std::get(extracted_values[column_id]), + literal + ); + break; + case LiteralType::ArrayT: + ret = evaluate_wildcard_array_filter( + op, + std::get(extracted_values[column_id]), + literal 
+ ); + break; + } + + if (ret) { + return true; + } + } + + return false; +} + +bool Output::evaluate_filter( + FilterExpr* expr, + int32_t schema, + std::map>& extracted_values +) { + auto column = expr->get_column().get(); + int32_t column_id = column->get_column_id(); + auto literal = expr->get_operand(); + Query* q = nullptr; + ClpStringColumnReader* clp_reader = nullptr; + VariableStringColumnReader* var_reader = nullptr; + std::unordered_set* matching_vars = nullptr; + switch (column->get_literal_type()) { + case LiteralType::IntegerT: + return evaluate_int_filter( + expr->get_operation(), + std::get(extracted_values[column_id]), + literal + ); + case LiteralType::FloatT: + return evaluate_float_filter( + expr->get_operation(), + std::get(extracted_values[column_id]), + literal + ); + case LiteralType::ClpStringT: + q = m_expr_clp_query[expr]; + clp_reader = m_clp_string_readers[column_id]; + return evaluate_clp_string_filter( + expr->get_operation(), + q, + column_id, + literal, + extracted_values + ); + case LiteralType::VarStringT: + var_reader = m_var_string_readers[column_id]; + matching_vars = m_expr_var_match_map.at(expr); + return evaluate_var_string_filter( + expr->get_operation(), + var_reader, + matching_vars, + literal + ); + case LiteralType::BooleanT: + return evaluate_bool_filter( + expr->get_operation(), + std::get(extracted_values[column_id]), + literal + ); + case LiteralType::ArrayT: + return evaluate_array_filter( + expr->get_operation(), + column->get_unresolved_tokens(), + std::get(extracted_values[column_id]), + literal + ); + case LiteralType::EpochDateT: + return evaluate_epoch_date_filter( + expr->get_operation(), + m_datestring_readers[column_id], + literal + ); + case LiteralType::FloatDateT: + return evaluate_float_date_filter( + expr->get_operation(), + m_floatdatestring_readers[column_id], + literal + ); + // case LiteralType::NullT: + // null checks are always turned into existence operators -- + // no need to evaluate here + 
default: + return false; + } +} + +bool Output::evaluate_int_filter( + FilterOperation op, + int64_t value, + std::shared_ptr const& operand +) { + if (FilterOperation::EXISTS == op || FilterOperation::NEXISTS == op) { + return true; + } + + int64_t op_value; + if (false == operand->as_int(op_value, op)) { + return false; + } + + switch (op) { + case FilterOperation::EQ: + return value == op_value; + case FilterOperation::NEQ: + return value != op_value; + case FilterOperation::LT: + return value < op_value; + case FilterOperation::GT: + return value > op_value; + case FilterOperation::LTE: + return value <= op_value; + case FilterOperation::GTE: + return value >= op_value; + default: + return false; + } +} + +bool Output::evaluate_float_filter( + FilterOperation op, + double value, + std::shared_ptr const& operand +) { + if (FilterOperation::EXISTS == op || FilterOperation::NEXISTS == op) { + return true; + } + + double op_value; + if (false == operand->as_float(op_value, op)) { + return false; + } + + switch (op) { + case FilterOperation::EQ: + return value == op_value; + case FilterOperation::NEQ: + return value != op_value; + case FilterOperation::LT: + return value < op_value; + case FilterOperation::GT: + return value > op_value; + case FilterOperation::LTE: + return value <= op_value; + case FilterOperation::GTE: + return value >= op_value; + default: + return false; + } +} + +bool Output::evaluate_clp_string_filter( + FilterOperation op, + Query* q, + int32_t column_id, + std::shared_ptr const& operand, + std::map>& extracted_values +) { + if (FilterOperation::EXISTS == op || FilterOperation::NEXISTS == op) { + return true; + } + + if (op != FilterOperation::EQ && op != FilterOperation::NEQ) { + return false; + } + + auto* reader = m_clp_string_readers[column_id]; + int64_t id = reader->get_encoded_id(m_cur_message); + bool matched = false; + + if (q->search_string_matches_all()) { + return op == FilterOperation::EQ; + } + + auto vars = 
reader->get_encoded_vars(m_cur_message); + for (auto const& subquery : q->get_sub_queries()) { + if (subquery.matches_logtype(id) && subquery.matches_vars(vars)) { + matched = true; + + if (subquery.wildcard_match_required()) { + std::string decompressed_message + = std::get(reader->extract_value(m_cur_message)); + matched = StringUtils::wildcard_match_unsafe( + decompressed_message, + q->get_search_string(), + !q->get_ignore_case() + ); + matched = (op == FilterOperation::EQ) == matched; + if (matched) { + extracted_values[column_id] = std::move(decompressed_message); + m_cached_string_columns.insert(column_id); + } + return matched; + } + + break; + } + } + + return (op == FilterOperation::EQ) == matched; +} + +bool Output::evaluate_var_string_filter( + FilterOperation op, + VariableStringColumnReader* reader, + std::unordered_set* matching_vars, + std::shared_ptr const& operand +) const { + if (FilterOperation::EXISTS == op || FilterOperation::NEXISTS == op) { + return true; + } + + int64_t id = reader->get_variable_id(m_cur_message); + bool matched = matching_vars->count(id); + switch (op) { + case FilterOperation::EQ: + return matched; + case FilterOperation::NEQ: + return !matched; + default: + return false; + } +} + +bool Output::evaluate_array_filter( + FilterOperation op, + DescriptorList const& unresolved_tokens, + std::string const& value, + std::shared_ptr const& operand +) const { + auto object = json::parse(value); + return evaluate_array_filter(object, op, unresolved_tokens, 0, operand, true); +} + +bool Output::evaluate_array_filter( + json& object, + FilterOperation op, + DescriptorList const& unresolved_tokens, + size_t cur_idx, + std::shared_ptr const& operand, + bool array_or_object +) const { + bool match = false; + if (cur_idx > unresolved_tokens.size()) { + return false; + } + + for (auto i = object.begin(); i != object.end(); ++i) { + auto& value = i.value(); + if (value.is_array()) { + match |= evaluate_array_filter(value, op, 
unresolved_tokens, cur_idx, operand, true); + } else if (value.is_object()) { + if (false == array_or_object && cur_idx < unresolved_tokens.size() + && i.key() == unresolved_tokens[cur_idx].get_token()) + { + match |= evaluate_array_filter( + value, + op, + unresolved_tokens, + cur_idx + 1, + operand, + false + ); + } else if (array_or_object) { + match |= evaluate_array_filter( + value, + op, + unresolved_tokens, + cur_idx, + operand, + false + ); + } + } else if (((array_or_object && cur_idx == unresolved_tokens.size()) + || (!array_or_object && cur_idx == unresolved_tokens.size() - 1 + && i.key() == unresolved_tokens[cur_idx].get_token()))) + { + std::string tmp_string; + int64_t tmp_int; + double tmp_float; + bool tmp_bool; + if (FilterOperation::EXISTS == op || FilterOperation::NEXISTS == op + || (value.is_number_integer() && operand->as_int(tmp_int, op) + && eval(op, value.get(), tmp_int)) + || (value.is_number_float() && operand->as_float(tmp_float, op) + && eval(op, value.get(), tmp_float)) + || (value.is_boolean() && operand->as_bool(tmp_bool, op) + && eval(op, value.get(), tmp_bool))) + { + match = true; + } else if (value.is_string() && (operand->as_var_string(tmp_string, op) || operand->as_clp_string(tmp_string, op))) + { + std::string s = value.get(); + match = wildcard_match(s, tmp_string) ? 
op == FilterOperation::EQ + : op == FilterOperation::NEQ; + } + } + + if (match) { + return true; + } + } + + return match; +} + +bool Output::evaluate_wildcard_array_filter( + FilterOperation op, + std::string& value, + std::shared_ptr const& operand +) { + if (value.capacity() < (value.size() + simdjson::SIMDJSON_PADDING)) { + value.reserve(value.size() + simdjson::SIMDJSON_PADDING); + } + auto obj = m_array_parser.iterate(value); + ondemand::array array = obj.get_array(); + + // pre-evaluate whether we can match strings or numbers to eliminate + // duplicate effort on every item + m_maybe_string = operand->as_var_string(m_array_search_string, op) + || operand->as_clp_string(m_array_search_string, op); + + return evaluate_wildcard_array_filter(array, op, operand); +} + +bool Output::evaluate_wildcard_array_filter( + ondemand::array& array, + FilterOperation op, + std::shared_ptr const& operand +) const { + bool match = false; + for (auto item : array) { + switch (item.type()) { + case ondemand::json_type::object: { + ondemand::object nested_object = item.get_object(); + if (evaluate_wildcard_array_filter(nested_object, op, operand)) { + match = true; + } + } break; + case ondemand::json_type::array: { + ondemand::array nested_array = item.get_array(); + if (evaluate_wildcard_array_filter(nested_array, op, operand)) { + match = true; + } + } break; + case ondemand::json_type::string: { + if (false == m_maybe_string) { + break; + } + if (wildcard_match(item.get_string().value(), m_array_search_string)) { + match |= op == FilterOperation::EQ; + } + break; + } break; + case ondemand::json_type::number: { + if (false == m_maybe_number) { + break; + } + ondemand::number number = item.get_number(); + if (number.is_double()) { + double tmp_double; + operand->as_float(tmp_double, op); + match |= eval(op, number.get_double(), tmp_double); + } else if (number.is_uint64()) { + int64_t tmp_int; + operand->as_int(tmp_int, op); + match |= eval(op, number.get_uint64(), tmp_int); 
+ } else { + int64_t tmp_int; + operand->as_int(tmp_int, op); + match |= eval(op, number.get_int64(), tmp_int); + } + } break; + case ondemand::json_type::boolean: { + bool tmp; + if (operand->as_bool(tmp, op) && eval(op, item.get_bool(), tmp)) { + match = true; + } + } break; + case ondemand::json_type::null: + if (operand->as_null(op)) { + match |= op == FilterOperation::EQ; + } + break; + } + + if (match) { + return true; + } + } + return false; +} + +bool Output::evaluate_wildcard_array_filter( + ondemand::object& object, + FilterOperation op, + std::shared_ptr const& operand +) const { + bool match = false; + for (auto field : object) { + ondemand::value item = field.value(); + switch (item.type()) { + case ondemand::json_type::object: { + ondemand::object nested_object = item.get_object(); + if (evaluate_wildcard_array_filter(nested_object, op, operand)) { + match = true; + } + } break; + case ondemand::json_type::array: { + ondemand::array nested_array = item.get_array(); + if (evaluate_wildcard_array_filter(nested_array, op, operand)) { + match = true; + } + } break; + case ondemand::json_type::string: { + if (false == m_maybe_string) { + break; + } + if (wildcard_match(item.get_string().value(), m_array_search_string)) { + match |= op == FilterOperation::EQ; + } + break; + } break; + case ondemand::json_type::number: { + if (false == m_maybe_number) { + break; + } + ondemand::number number = item.get_number(); + if (number.is_double()) { + double tmp_double; + operand->as_float(tmp_double, op); + match |= eval(op, number.get_double(), tmp_double); + } else if (number.is_uint64()) { + int64_t tmp_int; + operand->as_int(tmp_int, op); + match |= eval(op, number.get_uint64(), tmp_int); + } else { + int64_t tmp_int; + operand->as_int(tmp_int, op); + match |= eval(op, number.get_int64(), tmp_int); + } + } break; + case ondemand::json_type::boolean: { + bool tmp; + if (operand->as_bool(tmp, op) && eval(op, item.get_bool(), tmp)) { + match = true; + } + } break; + 
case ondemand::json_type::null: + if (operand->as_null(op)) { + match |= op == FilterOperation::EQ; + } + break; + } + + if (match) { + return true; + } + } + return false; +} + +bool Output::evaluate_bool_filter( + FilterOperation op, + bool value, + std::shared_ptr const& operand +) { + if (FilterOperation::EXISTS == op || FilterOperation::NEXISTS == op) { + return true; + } + + bool op_value; + if (false == operand->as_bool(op_value, op)) { + return false; + } + + switch (op) { + case FilterOperation::EQ: + return value == op_value; + case FilterOperation::NEQ: + return value != op_value; + default: + return false; + } +} + +void Output::populate_string_queries(std::shared_ptr const& expr) { + if (expr->has_only_expression_operands()) { + for (auto const& op : expr->get_op_list()) { + populate_string_queries(std::static_pointer_cast(op)); + } + return; + } + + auto filter = std::dynamic_pointer_cast(expr); + if (filter != nullptr + && !(filter->get_operation() == FilterOperation::EXISTS + || filter->get_operation() == FilterOperation::NEXISTS)) + { + if (filter->get_column()->matches_type(LiteralType::ClpStringT)) { + std::string query_string; + filter->get_operand()->as_clp_string(query_string, filter->get_operation()); + + if (m_string_query_map.count(query_string)) { + return; + } + + // search on log type dictionary + Query& q = m_string_query_map[query_string]; + if (query_string.find("*") != std::string::npos + || filter->get_column()->matches_type(LiteralType::VarStringT)) + { + // if it matches VarStringT then it contains no space, so we + // don't't add more wildcards. 
Likewise if it already contains some wildcards + // we do not add more + Grep::process_raw_query(m_log_dict, m_var_dict, query_string, false, q, false); + } else { + Grep::process_raw_query(m_log_dict, m_var_dict, query_string, false, q); + } + } + SubQuery sub_query; + if (filter->get_column()->matches_type(LiteralType::VarStringT)) { + std::string query_string; + filter->get_operand()->as_var_string(query_string, filter->get_operation()); + if (m_string_var_match_map.count(query_string)) { + return; + } + + std::unordered_set& matching_vars = m_string_var_match_map[query_string]; + if (query_string.find('*') == std::string::npos) { + auto entry = m_var_dict->get_entry_matching_value(query_string, false); + + if (entry != nullptr) { + matching_vars.insert(entry->get_id()); + } + } else if (EncodedVariableInterpreter:: + wildcard_search_dictionary_and_get_encoded_matches( + query_string, + *m_var_dict, + false, + sub_query + )) + { + for (auto& var : sub_query.get_vars()) { + if (var.is_precise_var()) { + auto entry = var.get_var_dict_entry(); + if (entry != nullptr) { + matching_vars.insert(entry->get_id()); + } + } else { + for (auto entry : var.get_possible_var_dict_entries()) { + matching_vars.insert(entry->get_id()); + } + } + } + } + } + } +} + +void Output::populate_searched_wildcard_columns(std::shared_ptr const& expr) { + if (expr->has_only_expression_operands()) { + for (auto const& op : expr->get_op_list()) { + populate_searched_wildcard_columns(std::static_pointer_cast(op)); + } + } else if (auto filter = std::dynamic_pointer_cast(expr)) { + auto col = filter->get_column().get(); + if (false == col->is_pure_wildcard()) { + return; + } + for (int32_t node : (*m_schemas)[m_schema]) { + auto tree_node_type = m_schema_tree->get_node(node)->get_type(); + if (col->matches_type(node_to_literal_type(tree_node_type))) { + if (tree_node_type == NodeType::CLPSTRING) { + m_wildcard_to_searched_clpstrings[col].push_back(node); + } else if (tree_node_type == 
NodeType::VARSTRING) { + m_wildcard_to_searched_varstrings[col].push_back(node); + } else if (tree_node_type == NodeType::DATESTRING) { + m_wildcard_to_searched_datestrings[col].push_back(node); + } else if (tree_node_type == NodeType::FLOATDATESTRING) { + m_wildcard_to_searched_floatdatestrings[col].push_back(node); + } else { + // Arrays and basic types + m_wildcard_to_searched_columns[col].push_back(node); + } + } + } + } +} + +void Output::add_wildcard_columns_to_searched_columns() { + for (auto& e : m_wildcard_to_searched_clpstrings) { + for (int32_t node : e.second) { + m_match.add_searched_column_to_schema(m_schema, node); + } + } + + for (auto& e : m_wildcard_to_searched_varstrings) { + for (int32_t node : e.second) { + m_match.add_searched_column_to_schema(m_schema, node); + } + } + + for (auto& e : m_wildcard_to_searched_datestrings) { + for (int32_t node : e.second) { + m_match.add_searched_column_to_schema(m_schema, node); + } + } + + for (auto& e : m_wildcard_to_searched_floatdatestrings) { + for (int32_t node : e.second) { + m_match.add_searched_column_to_schema(m_schema, node); + } + } + + for (auto& e : m_wildcard_to_searched_columns) { + for (int32_t node : e.second) { + m_match.add_searched_column_to_schema(m_schema, node); + } + } +} + +EvaluatedValue +Output::constant_propagate(std::shared_ptr const& expr, int32_t schema_id) { + if (std::dynamic_pointer_cast(expr)) { + bool any_unknown = false; + std::vector to_delete; + for (auto it = expr->op_begin(); it != expr->op_end(); it++) { + auto sub_expr = std::static_pointer_cast(*it); + EvaluatedValue ret = constant_propagate(sub_expr, schema_id); + if (ret == EvaluatedValue::True) { + return expr->is_inverted() ? 
EvaluatedValue::False : EvaluatedValue::True; + } else if (ret == EvaluatedValue::False) { + // no need to add this sub expression to used expression set + // but mark it for deletion + to_delete.push_back(it); + } else /*if (ret == EvaluatedValue::Unknown)*/ { + any_unknown = true; + } + } + + if (any_unknown) { + // some unknowns -- delete guaranteed false entries, and + // propagate unknown + for (OpList::iterator& it : to_delete) { + expr->get_op_list().erase(it); + } + return EvaluatedValue::Unknown; + } else { + // no unknowns, and didn't early exit, so before inversion the evaluated + // value must be False + return expr->is_inverted() ? EvaluatedValue::True : EvaluatedValue::False; + } + } else if (std::dynamic_pointer_cast(expr)) { + bool any_unknown = true; + std::vector to_delete; + for (auto it = expr->op_begin(); it != expr->op_end(); it++) { + auto subExpr = std::static_pointer_cast(*it); + + EvaluatedValue ret = constant_propagate(subExpr, schema_id); + + if (ret == EvaluatedValue::False) { + return expr->is_inverted() ? EvaluatedValue::True : EvaluatedValue::False; + } else if (ret == EvaluatedValue::True) { + // no need to add this sub expression to used expression set + // but mark it for deletion + to_delete.push_back(it); + } else /*if (ret == EvaluatedValue::Unknown)*/ { + any_unknown = true; + } + } + + if (any_unknown) { + // some unknowns -- delete guaranteed true entries, and + // propagate unknown + for (OpList::iterator& it : to_delete) { + expr->get_op_list().erase(it); + } + return EvaluatedValue::Unknown; + } else { + // no unknowns, and didn't early exit, so before inversion the evaluated + // value must be True + return expr->is_inverted() ? 
EvaluatedValue::False : EvaluatedValue::True; + } + return EvaluatedValue::Unknown; + } else if (auto filter = std::dynamic_pointer_cast(expr)) { + if ((filter->get_operation() == FilterOperation::EXISTS + || filter->get_operation() == FilterOperation::NEXISTS) + && (!filter->get_column()->has_unresolved_tokens() + || filter->get_column()->is_pure_wildcard() + || !filter->get_column()->matches_exactly(LiteralType::ArrayT))) + { + // semantics of previous passes means that EXISTS and NEXISTS are + // trivially matching + // FIXME: have an edgecase to handle with NEXISTS on pure wildcard columns + return EvaluatedValue::True; + } else if (filter->get_column()->is_pure_wildcard() && filter->get_column()->matches_any(LiteralType::ClpStringT | LiteralType::VarStringT)) + { + auto wildcard = filter->get_column().get(); + bool has_var_string = false; + bool matches_var_string = false; + bool has_clp_string = false; + bool matches_clp_string = false; + bool has_other = !m_wildcard_to_searched_columns[wildcard].empty() + || !m_wildcard_to_searched_datestrings[wildcard].empty() + || !m_wildcard_to_searched_floatdatestrings[wildcard].empty(); + std::string filter_string; + bool valid + = filter->get_operand()->as_var_string(filter_string, filter->get_operation()) + || filter->get_operand()->as_clp_string( + filter_string, + filter->get_operation() + ); + if (false == valid) { + // FIXME: throw + return EvaluatedValue::False; + } + if (filter->get_column()->matches_type(LiteralType::ClpStringT)) { + m_expr_clp_query[expr.get()] = &m_string_query_map.at(filter_string); + has_clp_string = !m_wildcard_to_searched_clpstrings[wildcard].empty(); + matches_clp_string + = !m_expr_clp_query.at(expr.get())->get_sub_queries().empty() + || m_expr_clp_query.at(expr.get())->search_string_matches_all(); + } + if (filter->get_column()->matches_type(LiteralType::VarStringT)) { + m_expr_var_match_map[expr.get()] = &m_string_var_match_map.at(filter_string); + has_var_string = 
!m_wildcard_to_searched_varstrings[wildcard].empty(); + matches_var_string = !m_expr_var_match_map.at(expr.get())->empty(); + } + + if (filter->get_operation() == FilterOperation::EQ) { + if (false == matches_clp_string) { + m_wildcard_to_searched_clpstrings[wildcard].clear(); + } + if (false == matches_var_string) { + m_wildcard_to_searched_varstrings[wildcard].clear(); + } + + if (has_other) { + return EvaluatedValue::Unknown; + } + + if (has_clp_string || has_var_string) { + if ((!has_clp_string || (has_clp_string && !matches_clp_string)) + && (!has_var_string || (has_var_string && !matches_var_string))) + { + return filter->is_inverted() ? EvaluatedValue::True : EvaluatedValue::False; + } + } + } else if (filter->get_operation() == FilterOperation::NEQ) { + if (has_clp_string && !matches_clp_string || has_var_string && !matches_var_string) + { + return filter->is_inverted() ? EvaluatedValue::False : EvaluatedValue::True; + } else if (false == has_clp_string && false == has_var_string && !has_other) { + return EvaluatedValue::False; + } + } else { + // FIXME: throw + return EvaluatedValue::False; + } + return EvaluatedValue::Unknown; + } else if (filter->get_column()->matches_type(LiteralType::ClpStringT)) { + std::string filter_string; + filter->get_operand()->as_clp_string(filter_string, filter->get_operation()); + + // set up string query for this filter + m_expr_clp_query[expr.get()] = &m_string_query_map.at(filter_string); + + // use string queries to potentially propagate known result + if (m_expr_clp_query.at(expr.get())->get_sub_queries().empty() + && !m_expr_clp_query.at(expr.get())->search_string_matches_all()) + { + // If filter can not match then return it's guaranteed value based on + // whether the filter is inverted and whether the operation was == or != + if (filter->get_operation() == FilterOperation::EQ) { + return filter->is_inverted() ? 
EvaluatedValue::True : EvaluatedValue::False; + } else if (filter->get_operation() == FilterOperation::NEQ) { + return filter->is_inverted() ? EvaluatedValue::False : EvaluatedValue::True; + } + // FIXME: throw + return EvaluatedValue::False; + } else { + return EvaluatedValue::Unknown; + } + } else if (filter->get_column()->matches_type(LiteralType::VarStringT)) { + std::string filter_string; + filter->get_operand()->as_var_string(filter_string, filter->get_operation()); + + // set up string query for this filter + m_expr_var_match_map[expr.get()] = &m_string_var_match_map.at(filter_string); + + // use string queries to potentially propagate known result + if (m_expr_var_match_map.at(expr.get())->empty()) { + // If filter can not match then return it's guaranteed value based on + // whether the filter is inverted and whether the operation was == or != + if (filter->get_operation() == FilterOperation::EQ) { + return filter->is_inverted() ? EvaluatedValue::True : EvaluatedValue::False; + } else if (filter->get_operation() == FilterOperation::NEQ) { + return filter->is_inverted() ? 
EvaluatedValue::False : EvaluatedValue::True; + } + // FIXME: throw + return EvaluatedValue::False; + } else { + return EvaluatedValue::Unknown; + } + } else { + return EvaluatedValue::Unknown; + } + } + + return EvaluatedValue::Unknown; +} + +bool Output::evaluate_epoch_date_filter( + FilterOperation op, + DateStringColumnReader* reader, + std::shared_ptr& operand +) { + return evaluate_int_filter(op, reader->get_encoded_time(m_cur_message), operand); +} + +bool Output::evaluate_float_date_filter( + FilterOperation op, + FloatDateStringColumnReader* reader, + std::shared_ptr& operand +) { + return evaluate_float_filter(op, reader->get_encoded_time(m_cur_message), operand); +} +} // namespace clp_s::search diff --git a/components/core/src/clp_s/search/Output.hpp b/components/core/src/clp_s/search/Output.hpp new file mode 100644 index 000000000..36afac9c9 --- /dev/null +++ b/components/core/src/clp_s/search/Output.hpp @@ -0,0 +1,338 @@ +#ifndef CLP_S_SEARCH_OUTPUT_HPP +#define CLP_S_SEARCH_OUTPUT_HPP + +#include +#include +#include +#include +#include + +#include + +#include "../SchemaReader.hpp" +#include "../Utils.hpp" +#include "clp_search/Query.hpp" +#include "Expression.hpp" +#include "Integral.hpp" +#include "SchemaMatch.hpp" +#include "StringLiteral.hpp" + +using namespace simdjson; +using nlohmann::json; +using namespace clp_s::search::clp_search; + +namespace clp_s::search { +class Output : public FilterClass { +public: + Output(std::shared_ptr tree, + std::shared_ptr schemas, + SchemaMatch& match, + std::shared_ptr expr, + std::string archives_dir, + std::shared_ptr timestamp_dict) + : m_schema_tree(std::move(tree)), + m_schemas(std::move(schemas)), + m_match(match), + m_expr(std::move(expr)), + m_archives_dir(std::move(archives_dir)), + m_timestamp_dict(std::move(timestamp_dict)) {} + + /** + * Filters messages from all archives + */ + void filter(); + +private: + SchemaMatch& m_match; + std::shared_ptr m_expr; + std::string m_archives_dir; + + // 
variables for the current schema being filtered + std::vector m_searched_columns; + std::vector m_other_columns; + std::set m_cached_string_columns; + + int32_t m_schema; + SchemaReader* m_reader; + + std::shared_ptr m_schema_tree; + std::shared_ptr m_var_dict; + std::shared_ptr m_log_dict; + std::shared_ptr m_array_dict; + std::shared_ptr m_timestamp_dict; + + std::shared_ptr m_schemas; + + std::map m_string_query_map; + std::map> m_string_var_match_map; + std::unordered_map m_expr_clp_query; + std::unordered_map*> m_expr_var_match_map; + std::unordered_map m_clp_string_readers; + std::unordered_map m_var_string_readers; + std::unordered_map m_datestring_readers; + std::unordered_map m_floatdatestring_readers; + uint64_t m_cur_message; + EvaluatedValue m_expression_value; + + std::map> m_wildcard_to_searched_clpstrings; + std::map> m_wildcard_to_searched_varstrings; + std::map> m_wildcard_to_searched_datestrings; + std::map> m_wildcard_to_searched_floatdatestrings; + std::map> m_wildcard_to_searched_columns; + + simdjson::ondemand::parser m_array_parser; + std::string m_array_search_string; + bool m_maybe_string, m_maybe_number; + + /** + * Initializes the variables. 
It is init is called once for each schema after which filter + * is called once for every message in the schema + * @param reader + * @param schema_id + * @param columns + */ + void init( + SchemaReader* reader, + int32_t schema_id, + std::unordered_map& columns + ) override; + + /** + * Evaluates an expression + * @param expr + * @param schema + * @param extracted_values + * @return true if the expression evaluates to true, false otherwise + */ + bool evaluate( + Expression* expr, + int32_t schema, + std::map>& extracted_values + ); + + /** + * Evaluates a filter expression + * @param expr + * @param schema + * @param extracted_values + * @return true if the expression evaluates to true, false otherwise + */ + bool evaluate_filter( + FilterExpr* expr, + int32_t schema, + std::map>& extracted_values + ); + + /** + * Evaluates a wildcard filter expression + * @param expr + * @param schema + * @param extracted_values + * @return true if the expression evaluates to true, false otherwise + */ + bool evaluate_wildcard_filter( + FilterExpr* expr, + int32_t schema, + std::map>& extracted_values + ); + + /** + * Evaluates a int filter expression + * @param op + * @param value + * @param operand + * @return true if the expression evaluates to true, false otherwise + */ + static bool + evaluate_int_filter(FilterOperation op, int64_t value, std::shared_ptr const& operand); + + /** + * Evaluates a float filter expression + * @param op + * @param value + * @param operand + * @return true if the expression evaluates to true, false otherwise + */ + static bool evaluate_float_filter( + FilterOperation op, + double value, + std::shared_ptr const& operand + ); + + /** + * Evaluates a clp string filter expression + * @param op + * @param q + * @param column_id + * @param operand + * @param extracted_values + * @return true if the expression evaluates to true, false otherwise + */ + bool evaluate_clp_string_filter( + FilterOperation op, + Query* q, + int32_t column_id, + 
std::shared_ptr const& operand, + std::map>& extracted_values + ); + + /** + * Evaluates a var string filter expression + * @param op + * @param reader + * @param matching_vars + * @param operand + * @return true if the expression evaluates to true, false otherwise + */ + bool evaluate_var_string_filter( + FilterOperation op, + VariableStringColumnReader* reader, + std::unordered_set* matching_vars, + std::shared_ptr const& operand + ) const; + + /** + * Evaluates a epoch date string filter expression + * @param op + * @param reader + * @param operand + * @return true if the expression evaluates to true, false otherwise + */ + bool evaluate_epoch_date_filter( + FilterOperation op, + DateStringColumnReader* reader, + std::shared_ptr& operand + ); + + /** + * Evaluates a float date string filter expression + * @param op + * @param reader + * @param operand + * @return true if the expression evaluates to true, false otherwise + */ + bool evaluate_float_date_filter( + FilterOperation op, + FloatDateStringColumnReader* reader, + std::shared_ptr& operand + ); + + /** + * Evaluates an array filter expression + * @param op + * @param unresolved_tokens + * @param value + * @param operand + * @return true if the expression evaluates to true, false otherwise + */ + bool evaluate_array_filter( + FilterOperation op, + DescriptorList const& unresolved_tokens, + std::string const& value, + std::shared_ptr const& operand + ) const; + + /** + * The implementation of evaluate_array_filter + * @param object + * @param op + * @param unresolved_tokens + * @param cur_idx + * @param value + * @param operand + * @param array_or_object if true, we are traversing an array + * @return true if the expression evaluates to true, false otherwise + */ + bool evaluate_array_filter( + json& object, + FilterOperation op, + DescriptorList const& unresolved_tokens, + size_t cur_idx, + std::shared_ptr const& operand, + bool array_or_object + ) const; + + /** + * Evaluates a wildcard array filter 
expression + * @param op + * @param value + * @param operand + * @return true if the expression evaluates to true, false otherwise + */ + bool evaluate_wildcard_array_filter( + FilterOperation op, + std::string& value, + std::shared_ptr const& operand + ); + + /** + * The implementation of evaluate_wildcard_array_filter + * @param array + * @param op + * @param operand + * @return true if the expression evaluates to true, false otherwise + */ + bool evaluate_wildcard_array_filter( + ondemand::array& array, + FilterOperation op, + std::shared_ptr const& operand + ) const; + + /** + * The implementation of evaluate_wildcard_array_filter + * @param object + * @param op + * @param operand + * @return true if the expression evaluates to true, false otherwise + */ + bool evaluate_wildcard_array_filter( + ondemand::object& object, + FilterOperation op, + std::shared_ptr const& operand + ) const; + + /** + * Evaluates a bool filter expression + * @param op + * @param value + * @param operand + * @return true if the expression evaluates to true, false otherwise + */ + static bool + evaluate_bool_filter(FilterOperation op, bool value, std::shared_ptr const& operand); + + /** + * Populates the string queries + * @param expr + */ + void populate_string_queries(std::shared_ptr const& expr); + + /** + * Constant propagates an expression + * @param expr + * @param schema_id + * @return EvaluatedValue::True if the expression evaluates to true, EvaluatedValue::False + * if the expression evaluates to false, EvaluatedValue::Unknown otherwise + */ + EvaluatedValue constant_propagate(std::shared_ptr const& expr, int32_t schema_id); + + /** + * Populates searched wildcard columns + * @param expr + */ + void populate_searched_wildcard_columns(std::shared_ptr const& expr); + + /** + * Adds wildcard columns to searched columns + */ + void add_wildcard_columns_to_searched_columns(); + + // Methods inherited from FilterClass + bool filter( + uint64_t cur_message, + std::map>& 
extracted_values + ) override; +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_OUTPUT_HPP diff --git a/components/core/src/clp_s/search/SchemaMatch.cpp b/components/core/src/clp_s/search/SchemaMatch.cpp new file mode 100644 index 000000000..83ef44b6a --- /dev/null +++ b/components/core/src/clp_s/search/SchemaMatch.cpp @@ -0,0 +1,452 @@ +#include "SchemaMatch.hpp" + +#include +#include + +#include "AndExpr.hpp" +#include "ConstantProp.hpp" +#include "EmptyExpr.hpp" +#include "OrExpr.hpp" +#include "OrOfAndForm.hpp" +#include "SearchUtils.hpp" + +namespace clp_s::search { +// TODO: write proper iterators on the AST to make this code less awful. +// In particular schema intersection needs AST iterators and a proper refactor +SchemaMatch::SchemaMatch( + std::shared_ptr tree, + std::shared_ptr schemas +) + : m_tree(std::move(tree)), + m_schemas(std::move(schemas)) {} + +std::shared_ptr SchemaMatch::run(std::shared_ptr& expr) { + ConstantProp propagate_empty; + expr = populate_column_mapping(expr); + expr = propagate_empty.run(expr); + if (std::dynamic_pointer_cast(expr)) { + return expr; + } + + // if we had ambiguous column descriptors containing regex which were + // resolved we need to restandardize the expression + if (false == m_unresolved_descriptor_to_descriptor.empty()) { + m_column_to_descriptor.clear(); + m_unresolved_descriptor_to_descriptor.clear(); + + // restandardize the form, and rerun column mapping + OrOfAndForm standard_form; + expr = standard_form.run(expr); + expr = populate_column_mapping(expr); + } + + populate_schema_mapping(); + + expr = intersect_schemas(expr); + expr = propagate_empty.run(expr); + + if (std::dynamic_pointer_cast(expr)) { + return expr; + } + + split_expression_by_schema(expr, m_schema_to_query, m_matched_schema_ids); + + return expr; +} + +std::shared_ptr SchemaMatch::populate_column_mapping(std::shared_ptr cur) { + for (auto it = cur->op_begin(); it != cur->op_end(); it++) { + if (auto child = 
std::dynamic_pointer_cast(*it)) { + auto new_child = populate_column_mapping(child); + if (new_child != child) { + new_child->copy_replace(cur.get(), it); + } + } else if (auto column = dynamic_cast((*it).get())) { + if (false == populate_column_mapping(column)) { + // no matching columns -- replace this expression with empty; + return EmptyExpr::create(); + } else if (column->is_unresolved_descriptor() && false == column->is_pure_wildcard()) { + auto possibilities = OrExpr::create(); + + // TODO: will have to decide how we wan't to handle multi-column expressions + // with unresolved descriptors + for (int32_t node_id : m_unresolved_descriptor_to_descriptor[column]) { + auto node = m_tree->get_node(node_id); + auto literal_type = node_to_literal_type(node->get_type()); + DescriptorList descriptors; + while (node->get_id() != m_tree->get_root_node_id()) { + // may have to explicitly mark non-regex + descriptors.emplace_back(node->get_key_name()); + node = m_tree->get_node(node->get_parent_id()); + } + std::reverse(descriptors.begin(), descriptors.end()); + auto resolved_column = ColumnDescriptor::create(descriptors); + resolved_column->set_matching_type(literal_type); + *it = resolved_column; + cur->copy_append(possibilities.get()); + } + return possibilities; + } + } + } + return cur; +} + +bool SchemaMatch::populate_column_mapping(ColumnDescriptor* column) { + bool matched = false; + if (column->is_pure_wildcard()) { + for (auto& node : m_tree->get_nodes()) { + if (column->matches_type(node_to_literal_type(node->get_type()))) { + // column_to_descriptor_[node->get_id()].insert(column); + // At least some node matches; break + // Don't use column_to_descriptor_ for pure wildcard columns anyway, so + // no need to waste memory + matched = true; + break; + } + } + + return matched; + } + + auto root = m_tree->get_node(m_tree->get_root_node_id()); + for (int32_t child_node_id : root->get_children_ids()) { + matched |= populate_column_mapping(column, 
column->descriptor_begin(), child_node_id); + } + + return matched; +} + +bool SchemaMatch::populate_column_mapping( + ColumnDescriptor* column, + DescriptorList::iterator it, + int32_t node_id, + bool wildcard_special_flag +) { + if (it == column->descriptor_end()) { + return false; + } + + bool matched = false; + bool accepted = false, wildcard_accepted = false; + auto cur_node = m_tree->get_node(node_id); + DescriptorToken const& token = *it; + auto next = it; + next++; + + // accept current token + if (token.wildcard()) { + accepted = true; + wildcard_accepted = true; + } else if (cur_node->get_key_name() == token.get_token()) { + accepted = true; + } + + if (accepted) { + // For array search, users need to specify the full path + if (cur_node->get_type() == NodeType::ARRAY && !column->is_unresolved_descriptor()) { + matched = true; + column->add_unresolved_tokens(next); + m_column_to_descriptor[node_id].insert(column); + } else if ((next == column->descriptor_end() + && column->matches_type(node_to_literal_type(cur_node->get_type())))) + { + // potentially match current node if accepted its token + matched = true; + if (false == column->is_unresolved_descriptor()) { + m_column_to_descriptor[node_id].insert(column); + } else { + m_unresolved_descriptor_to_descriptor[column].insert(node_id); + } + } + } else { + return matched; + } + + // handle wildcard match 0 case + bool wildcard_special_continue = (wildcard_special_flag || !wildcard_accepted) + && next != column->descriptor_end() && next->wildcard(); + if (wildcard_special_continue) { + // have to allow matching current node again to honour + // 0 or more matches. 
Set the wildcard special flag to avoid matching + // the following case erroneously + // tok.*.tok + matched |= populate_column_mapping(column, next, node_id, true); + } else if (false == wildcard_special_flag && wildcard_accepted) { + matched |= populate_column_mapping(column, next, node_id); + } + + // match against children + for (int32_t child_node_id : cur_node->get_children_ids()) { + if (wildcard_accepted && !wildcard_special_continue) { + matched |= populate_column_mapping(column, next, child_node_id); + matched |= populate_column_mapping(column, it, child_node_id); + } else if (false == wildcard_accepted) { + matched |= populate_column_mapping(column, next, child_node_id); + } + } + + return matched; +} + +void SchemaMatch::populate_schema_mapping() { + // TODO: consider refactoring this now that schemas are std::set s + for (auto& it : *m_schemas) { + int32_t schema_id = it.first; + for (int32_t column_id : it.second) { + if (m_tree->get_node(column_id)->get_type() == NodeType::ARRAY) { + m_array_schema_ids.insert(schema_id); + } + if (false == m_column_to_descriptor.count(column_id)) { + continue; + } + for (auto descriptor : m_column_to_descriptor[column_id]) { + if (false == descriptor->is_pure_wildcard()) { + m_descriptor_to_schema[descriptor][schema_id] = column_id; + } + } + } + } +} + +std::shared_ptr SchemaMatch::intersect_schemas(std::shared_ptr cur) { + if (std::dynamic_pointer_cast(cur) || std::dynamic_pointer_cast(cur)) { + std::set common_schema; + std::set columns; + intersect_and_sub_expr(cur, common_schema, columns, true); + + if (common_schema.empty()) { + return EmptyExpr::create(cur->get_parent()); + } + + for (int32_t schema_id : common_schema) { + m_expression_to_schemas[cur.get()].insert(schema_id); + } + + for (auto column : columns) { + if (column->is_pure_wildcard()) { + continue; + } + + LiteralTypeBitmask types = 0; + for (int32_t schema : common_schema) { + if (m_descriptor_to_schema[column].count(schema)) { + types |= 
node_to_literal_type( + m_tree->get_node(m_descriptor_to_schema[column][schema])->get_type() + ); + } + } + column->set_matching_types(types); + } + + for (int32_t schema : common_schema) { + m_matched_schema_ids.insert(schema); + + for (auto column : columns) { + if (false == column->is_pure_wildcard()) { + m_schema_to_searched_columns[schema].insert( + get_column_id_for_descriptor(column, schema) + ); + } + } + } + } else if (cur->has_only_expression_operands()) { + for (auto it = cur->op_begin(); it != cur->op_end(); it++) { + auto sub_expr = std::static_pointer_cast(*it); + auto new_expr = intersect_schemas(sub_expr); + + if (new_expr != sub_expr) { + *it = new_expr; + } + } + } + return cur; +} + +bool SchemaMatch::intersect_and_sub_expr( + std::shared_ptr const& cur, + std::set& common_schema, + std::set& columns, + bool first +) { + // Note: EmptyExpr are already constant propogated out of the ands, so don't + // need to check for them here + for (auto it = cur->op_begin(); it != cur->op_end(); it++) { + if (auto sub_expr = std::dynamic_pointer_cast(*it)) { + first &= intersect_and_sub_expr(sub_expr, common_schema, columns, first); + if (false == first && common_schema.empty()) { + break; + } + } else if (auto column = std::dynamic_pointer_cast(*it)) { + FilterOperation op = std::static_pointer_cast(cur)->get_operation(); + if ((op != FilterOperation::EXISTS && op != FilterOperation::NEXISTS) + || column->has_unresolved_tokens()) + { + columns.insert(column.get()); + } + + if (column->is_pure_wildcard()) { + // TODO: consider handling `*:null` NEXISTS edgecase here instead of during + // output + if (first) { + for (auto schema_it : *m_schemas) { + common_schema.insert(schema_it.first); + } + } + return false; + } else if (first && op != FilterOperation::NEXISTS) { + for (auto schema_it : m_descriptor_to_schema[column.get()]) { + common_schema.insert(schema_it.first); + } + return false; + } else if (first /*&& op == FilterOperation::NEXISTS */) { + for 
(auto& schema : *m_schemas) { + if (0 == m_descriptor_to_schema[column.get()].count(schema.first)) { + common_schema.insert(schema.first); + } + } + return false; + } else if (op == FilterOperation::NEXISTS) { + std::set intersection; + auto const& cur_schemas = m_descriptor_to_schema[column.get()]; + for (int32_t schema : common_schema) { + if (0 == cur_schemas.count(schema)) { + intersection.insert(schema); + } + } + common_schema = intersection; + } else { + std::set intersection; + auto const& cur_schemas = m_descriptor_to_schema[column.get()]; + for (int32_t schema : common_schema) { + if (cur_schemas.count(schema)) { + intersection.insert(schema); + } + } + common_schema = intersection; + } + } + } + return first; +} + +void SchemaMatch::split_expression_by_schema( + std::shared_ptr const& expr, + std::map>& queries, + std::unordered_set const& relevant_schemas +) { + if (auto filter = std::dynamic_pointer_cast(expr)) { + for (int32_t schema_id : relevant_schemas) { + auto new_filter = filter->copy(); + auto descriptor = std::static_pointer_cast(new_filter)->get_column().get(); + auto old_descriptor = filter->get_column().get(); + + if (false == descriptor->is_pure_wildcard()) { + descriptor->set_column_id(get_column_id_for_descriptor(old_descriptor, schema_id)); + auto literal_type = get_literal_type_for_column(old_descriptor, schema_id); + if (literal_type == LiteralType::ArrayT) { + m_array_search_schema_ids.insert(schema_id); + } + descriptor->set_matching_type(literal_type); + } else if ((descriptor->is_pure_wildcard() + && descriptor->matches_type(LiteralType::ArrayT) + && 0 == m_array_search_schema_ids.count(schema_id))) + { + for (auto column_id : (*m_schemas)[schema_id]) { + if (m_tree->get_node(column_id)->get_type() == NodeType::ARRAY) { + m_array_search_schema_ids.insert(schema_id); + break; + } + } + } + queries[schema_id] = new_filter; + } + } else if (std::dynamic_pointer_cast(expr)) { + std::map> sub_expressions; + for (auto const& op : 
expr->get_op_list()) { + auto sub_expr = std::static_pointer_cast(op); + split_expression_by_schema(sub_expr, sub_expressions, relevant_schemas); + + for (auto const& it : sub_expressions) { + if (queries.count(it.first)) { + it.second->copy_append(queries[it.first].get()); + } else { + auto parent_expr = AndExpr::create(expr->is_inverted()); + it.second->copy_append(parent_expr.get()); + queries[it.first] = parent_expr; + } + } + + sub_expressions.clear(); + } + } else if (std::dynamic_pointer_cast(expr)) { + std::map> sub_expressions; + for (auto const& op : expr->get_op_list()) { + auto sub_expr = std::static_pointer_cast(op); + split_expression_by_schema( + sub_expr, + sub_expressions, + m_expression_to_schemas.at(sub_expr.get()) + ); + + for (auto const& it : sub_expressions) { + if (queries.count(it.first)) { + auto& cur_subexpr = queries[it.first]; + if (std::dynamic_pointer_cast(cur_subexpr)) { + it.second->copy_append(cur_subexpr.get()); + } else { + auto parent_expr = OrExpr::create(); + cur_subexpr->copy_append(parent_expr.get()); + it.second->copy_append(parent_expr.get()); + queries[it.first] = parent_expr; + } + } else { + queries[it.first] = it.second; + } + } + + sub_expressions.clear(); + } + + if (expr->is_inverted()) { + for (auto const& it : queries) { + it.second->invert(); + } + } + } +} + +int32_t SchemaMatch::get_column_id_for_descriptor(ColumnDescriptor* column, int32_t schema) { + return m_descriptor_to_schema[column][schema]; +} + +bool SchemaMatch::schema_matched(int32_t schema_id) { + return m_matched_schema_ids.count(schema_id); +} + +bool SchemaMatch::schema_searches_against_column(int32_t schema, int32_t column_id) { + return m_schema_to_searched_columns[schema].count(column_id); +} + +void SchemaMatch::add_searched_column_to_schema(int32_t schema, int32_t column) { + m_schema_to_searched_columns[schema].insert(column); +} + +bool SchemaMatch::has_array(int32_t schema_id) { + return m_array_schema_ids.count(schema_id); +} + +bool 
SchemaMatch::has_array_search(int32_t schema_id) { + return m_array_search_schema_ids.count(schema_id); +} + +LiteralType SchemaMatch::get_literal_type_for_column(ColumnDescriptor* column, int32_t schema) { + return node_to_literal_type( + m_tree->get_node(get_column_id_for_descriptor(column, schema))->get_type() + ); +} + +std::shared_ptr SchemaMatch::get_query_for_schema(int32_t schema) { + return m_schema_to_query.at(schema); +} +} // namespace clp_s::search diff --git a/components/core/src/clp_s/search/SchemaMatch.hpp b/components/core/src/clp_s/search/SchemaMatch.hpp new file mode 100644 index 000000000..abee8628b --- /dev/null +++ b/components/core/src/clp_s/search/SchemaMatch.hpp @@ -0,0 +1,172 @@ +#ifndef CLP_S_SEARCH_SCHEMAMATCH_HPP +#define CLP_S_SEARCH_SCHEMAMATCH_HPP + +#include +#include +#include +#include + +#include "../ReaderUtils.hpp" +#include "ColumnDescriptor.hpp" +#include "Expression.hpp" +#include "FilterExpr.hpp" +#include "Literal.hpp" +#include "Transformation.hpp" + +namespace clp_s::search { +class SchemaMatch : public Transformation { +public: + // Constructor + SchemaMatch(std::shared_ptr tree, std::shared_ptr schemas); + + /** + * Runs the transformation on an expression + * @param expr + * @return The transformed expression + */ + std::shared_ptr run(std::shared_ptr& expr) override; + + /** + * @param schema + * @return The query for a given schema + */ + std::shared_ptr get_query_for_schema(int32_t schema); + + /** + * Checks if a schema has been matched + * @param schema_id + * @return true if the schema has been matched, false otherwise + */ + bool schema_matched(int32_t schema_id); + + /** + * Checks if the column + * @param schema + * @param column_id + * @return true if the column has been matched, false otherwise + */ + bool schema_searches_against_column(int32_t schema, int32_t column_id); + + /** + * Adds a searched column to the schema. 
only used for pure wildcard + * @param schema + * @param column + */ + void add_searched_column_to_schema(int32_t schema, int32_t column); + + /** + * Checks if the schema has an array field + * @param schema_id + * @return true if the schema has, false otherwise + */ + bool has_array(int32_t schema_id); + + /** + * Checks if the schema has an array field to be searched against + * @param schema_id + * @return true if the schema has, false otherwise + */ + bool has_array_search(int32_t schema_id); + +private: + std::unordered_map> m_column_to_descriptor; + // TODO: The value in the map can be a set of k:v pairs with a hash & comparison + // that only considers the key since each column descriptor only has one matching + // column id per schema + std::unordered_map> m_descriptor_to_schema; + std::map> m_unresolved_descriptor_to_descriptor; + std::unordered_map> m_expression_to_schemas; + std::unordered_set m_matched_schema_ids; + std::unordered_set m_array_schema_ids; + std::unordered_set m_array_search_schema_ids; + std::map> m_schema_to_query; + + std::unordered_map> m_schema_to_searched_columns; + std::shared_ptr m_tree; + std::shared_ptr m_schemas; + + /** + * Populates the column mapping for a given column + * @param column + * @param it + * @param node_id + * @param wildcard_special_flag + * @return true if matching is successful, false otherwise + */ + bool populate_column_mapping( + ColumnDescriptor* column, + DescriptorList::iterator it, + int32_t node_id, + bool wildcard_special_flag = false + ); + + /** + * Populates the column mapping for a given column + * @param column + * @return + */ + bool populate_column_mapping(ColumnDescriptor* column); + + /** + * Populates the column mapping for a given expression + * @param cur + * @return The transformed expression + */ + std::shared_ptr populate_column_mapping(std::shared_ptr cur); + + /** + * Populates the schema mapping + */ + void populate_schema_mapping(); + + /** + * Finds common schemas and relevant 
columns across filters and stores the mapping + * @param cur + * @return The transformed expression + */ + std::shared_ptr intersect_schemas(std::shared_ptr cur); + + /** + * Finds common schemas and relevant columns across filters + * @param cur + * @param common_schema + * @param columns + * @param first + * @return true before firstly processing common schemas, false otherwise + */ + bool intersect_and_sub_expr( + std::shared_ptr const& cur, + std::set& common_schema, + std::set& columns, + bool first + ); + + /** + * Splits an expression into sub-expressions based on the schemas it searches against + * @param expr + * @param queries a map from schema id to expression + * @param relevant_schemas + */ + void split_expression_by_schema( + std::shared_ptr const& expr, + std::map>& queries, + std::unordered_set const& relevant_schemas + ); + + /** + * @param column + * @param schema + * @return The column id for a given column descriptor + */ + int32_t get_column_id_for_descriptor(ColumnDescriptor* column, int32_t schema); + + /** + * @param column + * @param schema + * @return The literal type for a given column descriptor + */ + LiteralType get_literal_type_for_column(ColumnDescriptor* column, int32_t schema); +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_SCHEMAMATCH_HPP diff --git a/components/core/src/clp_s/search/SearchUtils.cpp b/components/core/src/clp_s/search/SearchUtils.cpp new file mode 100644 index 000000000..c255c1f38 --- /dev/null +++ b/components/core/src/clp_s/search/SearchUtils.cpp @@ -0,0 +1,87 @@ +#include "SearchUtils.hpp" + +#include + +namespace clp_s::search { +void splice_into( + std::shared_ptr const& parent, + std::shared_ptr const& child, + OpList::iterator location +) { + for (auto it = child->op_begin(); it != child->op_end(); it++) { + auto sub_expr = std::static_pointer_cast(*it); + sub_expr->set_parent(parent.get()); + } + parent->get_op_list().splice(location, child->get_op_list()); +} + +// TODO: make sure to handle 
Object types correctly
+/**
+ * Maps a schema-tree node type onto the literal type used by the search AST.
+ * @param type
+ * @return The corresponding literal type, or LiteralType::UnknownT for any node type that is not
+ * listed below
+ */
+LiteralType node_to_literal_type(NodeType type) {
+    switch (type) {
+        case NodeType::INTEGER:
+            return LiteralType::IntegerT;
+        case NodeType::FLOAT:
+            return LiteralType::FloatT;
+        case NodeType::CLPSTRING:
+            return LiteralType::ClpStringT;
+        case NodeType::VARSTRING:
+            return LiteralType::VarStringT;
+        case NodeType::BOOLEAN:
+            return LiteralType::BooleanT;
+        case NodeType::ARRAY:
+            return LiteralType::ArrayT;
+        case NodeType::NULLVALUE:
+            return LiteralType::NullT;
+        case NodeType::DATESTRING:
+            return LiteralType::EpochDateT;
+        case NodeType::FLOATDATESTRING:
+            return LiteralType::FloatDateT;
+        default:
+            return LiteralType::UnknownT;
+    }
+}
+
+/**
+ * Converts a floating-point operand into the integer operand that keeps a comparison against an
+ * integer value equivalent:
+ *   - EQ:       truncates, and reports failure when `in` is not a whole number
+ *   - LT, GTE:  rounds up   (x <  2.5 <=> x <  3,  x >= 2.5 <=> x >= 3)
+ *   - GT, LTE:  rounds down (x >  2.5 <=> x >  2,  x <= 2.5 <=> x <= 2)
+ *   - others:   truncates toward zero
+ * @param in
+ * @param op
+ * @param out
+ * @return false if under FilterOperation::EQ the cast double is not equal to int64_t out, true
+ * otherwise
+ */
+bool double_as_int(double in, FilterOperation op, int64_t& out) {
+    switch (op) {
+        case FilterOperation::EQ:
+            out = static_cast<int64_t>(in);
+            return in == static_cast<double>(out);
+        case FilterOperation::LT:
+        case FilterOperation::GTE:
+            out = static_cast<int64_t>(std::ceil(in));
+            // Without this break the rounded value is overwritten by the cases below.
+            break;
+        case FilterOperation::GT:
+        case FilterOperation::LTE:
+            out = static_cast<int64_t>(std::floor(in));
+            break;
+        default:
+            out = static_cast<int64_t>(in);
+            break;
+    }
+    return true;
+}
+
+/**
+ * Matches `s` against the pattern `p`, where `?` in the pattern stands for exactly one character
+ * and `*` for any run of characters (including an empty one). There is no escape syntax.
+ * @param s the string to match
+ * @param p the pattern to match against
+ * @return true if the whole of s matches the whole of p, false otherwise
+ */
+bool wildcard_match(std::string_view s, std::string_view p) {
+    constexpr size_t cNoStar = std::string_view::npos;
+    size_t i = 0;  // next unmatched position in s
+    size_t j = 0;  // next unmatched position in p
+    size_t star = cNoStar;  // position in p of the most recent '*'
+    size_t last = 0;  // position in s up to which that '*' currently extends
+
+    while (i < s.length()) {
+        if (j < p.length() && p[j] == '*') {
+            // Must be tested before the literal comparison: if s[i] is itself '*', comparing
+            // first would consume the wildcard as an ordinary character and lose the ability
+            // to backtrack to it.
+            star = j++;
+            last = i;
+        } else if (j < p.length() && (s[i] == p[j] || p[j] == '?')) {
+            ++i;
+            ++j;
+        } else if (star != cNoStar) {
+            // Mismatch after a '*': let the wildcard absorb one more character and retry.
+            i = ++last;
+            j = star + 1;
+        } else {
+            return false;
+        }
+    }
+
+    // Only trailing '*' may remain unmatched in the pattern.
+    while (j < p.length() && p[j] == '*') {
+        ++j;
+    }
+
+    return j == p.length();
+}
+}  // namespace clp_s::search
diff --git a/components/core/src/clp_s/search/SearchUtils.hpp b/components/core/src/clp_s/search/SearchUtils.hpp
new file mode 100644
index 000000000..1ba8719e3
--- /dev/null
+++ b/components/core/src/clp_s/search/SearchUtils.hpp
@@ -0,0 +1,48 @@
+#ifndef CLP_S_SEARCH_SEARCHUTILS_HPP
+#define CLP_S_SEARCH_SEARCHUTILS_HPP
+
+#include "../SchemaTree.hpp"
+#include "Expression.hpp"
+#include "Literal.hpp"
+
+namespace
clp_s::search {
+
+/**
+ * Splice a child expression into a parent expression at a given location
+ * @param parent
+ * @param child
+ * @param location
+ */
+void splice_into(
+        std::shared_ptr const& parent,
+        std::shared_ptr const& child,
+        OpList::iterator location
+);
+
+/**
+ * Converts a node type to a literal type
+ * @param type
+ * @return A literal type
+ */
+LiteralType node_to_literal_type(NodeType type);
+
+/**
+ * Casts a double to an int64_t, rounding up or down depending on the filter operation
+ * @param in
+ * @param op
+ * @param out
+ * @return false if under FilterOperation::EQ the cast double is not equal to int64_t out, true
+ * otherwise
+ */
+bool double_as_int(double in, FilterOperation op, int64_t& out);
+
+/**
+ * Performs a wildcard match of a string against a pattern.
+ * In the pattern, `?` stands for any single character and `*` for any sequence of characters.
+ * @param s the string to match
+ * @param p the pattern to match against
+ * @return true if s matches p, false otherwise
+ */
+bool wildcard_match(std::string_view s, std::string_view p);
+}  // namespace clp_s::search
+
+#endif  // CLP_S_SEARCH_SEARCHUTILS_HPP
diff --git a/components/core/src/clp_s/search/StringLiteral.cpp b/components/core/src/clp_s/search/StringLiteral.cpp
new file mode 100644
index 000000000..63600ee7f
--- /dev/null
+++ b/components/core/src/clp_s/search/StringLiteral.cpp
@@ -0,0 +1,95 @@
+#include "StringLiteral.hpp"
+
+#include
+
+#include "SearchUtils.hpp"
+
+namespace clp_s::search {
+// Factory for StringLiteral: the constructor is private, so instances are obtained through here.
+std::shared_ptr StringLiteral::create(std::string const& v) {
+    return std::shared_ptr(static_cast(new StringLiteral(v)));
+}
+
+// Writes the literal to the print stream, wrapped in double quotes.
+void StringLiteral::print() {
+    get_print_stream() << "\"" << m_v << "\"";
+}
+
+std::string& StringLiteral::get() {
+    return m_v;
+}
+
+// Yields the raw text as a clp-style string. Fails for the ordering comparisons
+// (LT, GT, LTE, GTE) and when this literal was not classified as a possible clp-style string.
+bool StringLiteral::as_clp_string(std::string& ret, FilterOperation op) {
+    if (op == FilterOperation::LT || op == FilterOperation::GT || op == FilterOperation::LTE
+        || op == FilterOperation::GTE)
+    {
+        return false;
+    }
+
+    if (false == matches_type(LiteralType::ClpStringT)) {
+        return false;
+    }
+
+    ret = m_v;
+    return true;
+}
+
+// Yields the raw text as a variable-style string. Fails for the ordering comparisons
+// (LT, GT, LTE, GTE) and when this literal was not classified as a possible variable-style string.
+bool StringLiteral::as_var_string(std::string& ret, FilterOperation op) {
+    if (op == FilterOperation::LT || op == FilterOperation::GT || op == FilterOperation::LTE
+        || op == FilterOperation::GTE)
+    {
+        return false;
+    }
+
+    if (false == matches_type(LiteralType::VarStringT)) {
+        return false;
+    }
+
+    ret = m_v;
+    return true;
+}
+
+// Parses the whole literal as a double; `op` is not consulted.
+bool StringLiteral::as_float(double& ret, FilterOperation op) {
+    std::istringstream ss(m_v);
+    // noskipws: leading whitespace is not skipped before the number is read.
+    ss >> std::noskipws >> ret;
+    // eof() must be set as well, so a number followed by extra characters is rejected.
+    return !ss.fail() && ss.eof();
+}
+
+// Parses the literal as an integer. If the text is not entirely an integer, it is re-read as a
+// double and handed to double_as_int together with `op`.
+bool StringLiteral::as_int(int64_t& ret, FilterOperation op) {
+    std::istringstream ss(m_v);
+    ss >> std::noskipws >> ret;
+    if (false == ss.fail() && ss.eof()) {
+        return true;
+    } else {
+        double tmp;
+        // Start over with a fresh stream: the failed integer read may have consumed characters.
+        ss = std::istringstream(m_v);
+        ss >> std::noskipws >> tmp;
+        if (false == ss.fail() && ss.eof()) {
+            return double_as_int(tmp, op, ret);
+        }
+    }
+    return false;
+}
+
+// Accepts exactly the strings "true" and "false"; the ordering comparisons
+// (LT, GT, LTE, GTE) are rejected.
+bool StringLiteral::as_bool(bool& ret, FilterOperation op) {
+    if (op == FilterOperation::LT || op == FilterOperation::GT || op == FilterOperation::LTE
+        || op == FilterOperation::GTE)
+    {
+        return false;
+    }
+    if (m_v == "true") {
+        ret = true;
+        return true;
+    } else if (m_v == "false") {
+        ret = false;
+        return true;
+    }
+    return false;
+}
+
+// True only for the exact string "null" under EQ or NEQ.
+bool StringLiteral::as_null(FilterOperation op) {
+    return (op == FilterOperation::EQ || op == FilterOperation::NEQ) && m_v == "null";
+}
+
+// True only for the exact string "*" under EQ or NEQ.
+bool StringLiteral::as_any(FilterOperation op) {
+    return (op == FilterOperation::EQ || op == FilterOperation::NEQ) && m_v == "*";
+}
+}  // namespace clp_s::search
diff --git a/components/core/src/clp_s/search/StringLiteral.hpp b/components/core/src/clp_s/search/StringLiteral.hpp
new file mode 100644
index 000000000..cd05f59fe
--- /dev/null
+++ b/components/core/src/clp_s/search/StringLiteral.hpp
@@ -0,0 +1,78 @@
+#ifndef CLP_S_SEARCH_STRINGLITERAL_HPP
+#define CLP_S_SEARCH_STRINGLITERAL_HPP
+
+#include
+#include
+
+#include "Literal.hpp"
+
+namespace clp_s::search {
+/**
+ * Class for String literals in the search AST
+ *
+ * StringLiteral will automatically classify itself as possibly matching
+ * a clp style (containing spaces) and/or variable style (not containing spaces)
+ * string at creation time.
+ */
+class StringLiteral : public Literal {
+public:
+    // Deleted copy
+    StringLiteral(StringLiteral const&) = delete;
+    StringLiteral& operator=(StringLiteral const&) = delete;
+
+    /**
+     * Create a StringLiteral from a string
+     * @param v
+     * @return A new StringLiteral
+     */
+    static std::shared_ptr create(std::string const& v);
+
+    /**
+     * @return Reference to underlying string
+     */
+    std::string& get();
+
+    // Methods inherited from Value
+    void print() override;
+
+    // Methods inherited from Literal
+    // The three matches_* checks compare against the classification computed in the constructor.
+    bool matches_type(LiteralType type) override { return type & m_string_type; }
+
+    bool matches_any(LiteralTypeBitmask mask) override { return mask & m_string_type; }
+
+    bool matches_exactly(LiteralTypeBitmask mask) override { return mask == m_string_type; }
+
+    bool as_clp_string(std::string& ret, FilterOperation op) override;
+
+    bool as_var_string(std::string& ret, FilterOperation op) override;
+
+    bool as_float(double& ret, FilterOperation op) override;
+
+    bool as_int(int64_t& ret, FilterOperation op) override;
+
+    bool as_bool(bool& ret, FilterOperation op) override;
+
+    bool as_null(FilterOperation op) override;
+
+    bool as_any(FilterOperation op) override;
+
+private:
+    // The literal's text
+    std::string m_v;
+    // Bitmask of the string literal types (ClpStringT and/or VarStringT) this text may match
+    LiteralTypeBitmask m_string_type;
+
+    // Constructor
+    explicit StringLiteral(std::string v) : m_v(std::move(v)), m_string_type(0) {
+        // Text containing a space is classified as clp style, anything else as variable style.
+        if (m_v.find(' ') != std::string::npos) {
+            m_string_type = LiteralType::ClpStringT;
+        } else {
+            m_string_type = LiteralType::VarStringT;
+        }
+
+        // Text containing '*' is additionally classified as clp style.
+        // NOTE(review): presumably because a wildcard can stand for text with spaces -- confirm.
+        if (m_v.find('*') != std::string::npos) {
+            m_string_type |= LiteralType::ClpStringT;
+        }
+    }
+};
+}  // namespace clp_s::search
+
+#endif  // CLP_S_SEARCH_STRINGLITERAL_HPP
diff --git a/components/core/src/clp_s/search/Transformation.hpp
b/components/core/src/clp_s/search/Transformation.hpp new file mode 100644 index 000000000..a200f9ed5 --- /dev/null +++ b/components/core/src/clp_s/search/Transformation.hpp @@ -0,0 +1,21 @@ +#ifndef CLP_S_SEARCH_TRANSFORMATION_HPP +#define CLP_S_SEARCH_TRANSFORMATION_HPP + +#include "Expression.hpp" + +namespace clp_s::search { +/** + * Generic class representing a transformation on some expression. + */ +class Transformation { +public: + /** + * Runs the pass. The expression passed as input may be mutated by the pass. + * @param expr the expression that the pass will run on + * @return a new expression; may be the same as the input expression or different + */ + virtual std::shared_ptr run(std::shared_ptr& expr) = 0; +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_TRANSFORMATION_HPP diff --git a/components/core/src/clp_s/search/Value.hpp b/components/core/src/clp_s/search/Value.hpp new file mode 100644 index 000000000..a7e812294 --- /dev/null +++ b/components/core/src/clp_s/search/Value.hpp @@ -0,0 +1,33 @@ +#ifndef CLP_S_SEARCH_VALUE_HPP +#define CLP_S_SEARCH_VALUE_HPP + +#include + +namespace clp_s::search { +/** + * Class representing a generic value in the AST. Key subclasses are Literal and Expression. + */ +class Value { +public: + /** + * @return The number of operands this value has + */ + virtual unsigned get_num_operands() = 0; + + /** + * Print a string representation of the value to standard error. + * Useful for debugging in gdb. 
+ */ + virtual void print() = 0; + + virtual ~Value() = default; + +protected: + /** + * @return The stream to print to + */ + static std::ostream& get_print_stream() { return std::cerr; } +}; +} // namespace clp_s::search + +#endif // CLP_S_SEARCH_VALUE_HPP diff --git a/components/core/src/clp_s/search/clp_search/EncodedVariableInterpreter.cpp b/components/core/src/clp_s/search/clp_search/EncodedVariableInterpreter.cpp new file mode 100644 index 000000000..241f3dde7 --- /dev/null +++ b/components/core/src/clp_s/search/clp_search/EncodedVariableInterpreter.cpp @@ -0,0 +1,75 @@ +// Code from CLP + +#include "EncodedVariableInterpreter.hpp" + +#include +#include + +#include + +#include "../../VariableEncoder.hpp" + +using std::string; +using std::unordered_set; +using std::vector; + +namespace clp_s::search::clp_search { +bool EncodedVariableInterpreter::encode_and_search_dictionary( + string const& var_str, + VariableDictionaryReader const& var_dict, + bool ignore_case, + string& logtype, + SubQuery& sub_query +) { + size_t length = var_str.length(); + if (0 == length) { + throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__); + } + + encoded_variable_t encoded_var; + if (VariableEncoder::convert_string_to_representable_integer_var(var_str, encoded_var)) { + LogTypeDictionaryEntry::add_non_double_var(logtype); + sub_query.add_non_dict_var(encoded_var); + } else if (VariableEncoder::convert_string_to_representable_double_var(var_str, encoded_var)) { + LogTypeDictionaryEntry::add_double_var(logtype); + sub_query.add_non_dict_var(encoded_var); + } else { + auto entry = var_dict.get_entry_matching_value(var_str, ignore_case); + if (nullptr == entry) { + // Not in dictionary + return false; + } + encoded_var = VariableEncoder::encode_var_dict_id(entry->get_id()); + + LogTypeDictionaryEntry::add_non_double_var(logtype); + sub_query.add_dict_var(encoded_var, entry); + } + + return true; +} + +bool 
EncodedVariableInterpreter::wildcard_search_dictionary_and_get_encoded_matches( + std::string const& var_wildcard_str, + VariableDictionaryReader const& var_dict, + bool ignore_case, + SubQuery& sub_query +) { + // Find matches + unordered_set var_dict_entries; + var_dict.get_entries_matching_wildcard_string(var_wildcard_str, ignore_case, var_dict_entries); + if (var_dict_entries.empty()) { + // Not in dictionary + return false; + } + + // Encode matches + unordered_set encoded_vars; + for (auto const* entry : var_dict_entries) { + encoded_vars.insert(VariableEncoder::encode_var_dict_id(entry->get_id())); + } + + sub_query.add_imprecise_dict_var(encoded_vars, var_dict_entries); + + return true; +} +} // namespace clp_s::search::clp_search diff --git a/components/core/src/clp_s/search/clp_search/EncodedVariableInterpreter.hpp b/components/core/src/clp_s/search/clp_search/EncodedVariableInterpreter.hpp new file mode 100644 index 000000000..92e0907a2 --- /dev/null +++ b/components/core/src/clp_s/search/clp_search/EncodedVariableInterpreter.hpp @@ -0,0 +1,84 @@ +// Code from CLP + +#ifndef CLP_S_SEARCH_CLP_SEARCH_ENCODEDVARIABLEINTERPRETER_HPP +#define CLP_S_SEARCH_CLP_SEARCH_ENCODEDVARIABLEINTERPRETER_HPP + +#include +#include + +#include "../../DictionaryReader.hpp" +#include "../../DictionaryWriter.hpp" +#include "../../TraceableException.hpp" +#include "Query.hpp" + +namespace clp_s::search::clp_search { +/** + * Class to parse and encode strings into encoded variables and to interpret encoded variables + * back into strings. An encoded variable is one of: i) a variable dictionary ID, referring to + * an entry in the variable dictionary, or ii) a value, representing an integer variable + * exactly as it appears in the original log message, or iii) a value, representing a base-10, + * 16-digit number with a decimal point, where at least one digit is after the decimal point, + * encoded with a custom format. 
+ * + * To decode an encoded variable, the logtype specifies whether the variable is either: + * - i/ii, or + * - iii + * This class differentiates between i & ii by using a certain range of values for variable + * dictionary IDs, and the rest for non-dictionary variables. + * + * We collectively refer to ii & iii as non-dictionary variables. + */ +class EncodedVariableInterpreter { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; + + /** + * Encodes a string-form variable, and if it is dictionary variable, searches for its ID in + * the given variable dictionary + * @param var_str + * @param var_dict + * @param ignore_case + * @param logtype + * @param sub_query + * @return true if variable is a non-dictionary variable or was found in the given variable + * dictionary, false otherwise + */ + static bool encode_and_search_dictionary( + std::string const& var_str, + VariableDictionaryReader const& var_dict, + bool ignore_case, + std::string& logtype, + SubQuery& sub_query + ); + /** + * Search for the given string-form variable in the variable dictionary, encode any matches, + * and add them to the given sub-query + * @param var_wildcard_str + * @param var_dict + * @param ignore_case + * @param sub_query + * @return true if any match found, false otherwise + */ + static bool wildcard_search_dictionary_and_get_encoded_matches( + std::string const& var_wildcard_str, + VariableDictionaryReader const& var_dict, + bool ignore_case, + SubQuery& sub_query + ); + +private: + // Variables + // The beginning of the range used for encoding variable dictionary IDs + static constexpr encoded_variable_t cVarDictIdRangeBegin = 1LL << 62; + // The end (exclusive) of the range used for encoding variable dictionary IDs + static constexpr encoded_variable_t cVarDictIdRangeEnd = (1ULL 
<< 63) - 1; +}; +} // namespace clp_s::search::clp_search + +#endif // CLP_S_SEARCH_CLP_SEARCH_ENCODEDVARIABLEINTERPRETER_HPP diff --git a/components/core/src/clp_s/search/clp_search/Grep.cpp b/components/core/src/clp_s/search/clp_search/Grep.cpp new file mode 100644 index 000000000..54031446a --- /dev/null +++ b/components/core/src/clp_s/search/clp_search/Grep.cpp @@ -0,0 +1,639 @@ +// Code from CLP + +#include "Grep.hpp" + +#include + +#include "../../VariableEncoder.hpp" +#include "EncodedVariableInterpreter.hpp" + +using std::string; +using std::vector; + +namespace clp_s::search::clp_search { +// Local types +enum class SubQueryMatchabilityResult { + MayMatch, // The subquery might match a message + WontMatch, // The subquery has no chance of matching a message + SupercedesAllSubQueries // The subquery will cause all messages to be matched +}; + +// Class representing a token in a query. It is used to interpret a token in user's search +// string. +class QueryToken { +public: + // Constructors + QueryToken(string const& query_string, size_t begin_pos, size_t end_pos, bool is_var); + + // Methods + bool cannot_convert_to_non_dict_var() const; + bool contains_wildcards() const; + bool has_greedy_wildcard_in_middle() const; + bool has_prefix_greedy_wildcard() const; + bool has_suffix_greedy_wildcard() const; + bool is_ambiguous_token() const; + bool is_double_var() const; + bool is_var() const; + bool is_wildcard() const; + + size_t get_begin_pos() const; + size_t get_end_pos() const; + string const& get_value() const; + + bool change_to_next_possible_type(); + +private: + // Types + // Type for the purpose of generating different subqueries. E.g., if a token is of type + // DictOrIntVar, it would generate a different subquery than if it was of type Logtype. 
+    enum class Type {
+        Wildcard,
+        // Ambiguous indicates the token can be more than one of the types listed below
+        Ambiguous,
+        Logtype,
+        DictOrIntVar,
+        DoubleVar
+    };
+
+    // Variables
+    bool m_cannot_convert_to_non_dict_var;
+    bool m_contains_wildcards;
+    bool m_has_greedy_wildcard_in_middle;
+    bool m_has_prefix_greedy_wildcard;
+    bool m_has_suffix_greedy_wildcard;
+
+    size_t m_begin_pos;
+    size_t m_end_pos;
+    string m_value;
+
+    // Type if variable has unambiguous type
+    Type m_type;
+    // Types if variable type is ambiguous
+    vector<Type> m_possible_types;
+    // Index of the current possible type selected for generating a subquery
+    size_t m_current_possible_type_ix;
+};
+
+// Builds a token from query_string[begin_pos, end_pos) and classifies it.
+// - "*" alone is a Wildcard token.
+// - A non-variable token is a Logtype if it has no wildcards, otherwise it is Ambiguous
+//   (Logtype, DictOrIntVar or DoubleVar).
+// - A variable token is a DictOrIntVar if, with its leading/trailing '*' removed, it can't be
+//   encoded as an integer or double variable; otherwise it is Ambiguous (DictOrIntVar or
+//   DoubleVar).
+QueryToken::QueryToken(
+        string const& query_string,
+        size_t const begin_pos,
+        size_t const end_pos,
+        bool const is_var
+)
+        // m_cannot_convert_to_non_dict_var is only assigned on the is_var path below, but it is
+        // read for any token whose current type is a variable. Default to false so that such
+        // tokens never prune results based on an indeterminate value.
+        : m_cannot_convert_to_non_dict_var(false),
+          m_current_possible_type_ix(0) {
+    m_begin_pos = begin_pos;
+    m_end_pos = end_pos;
+    m_value.assign(query_string, m_begin_pos, m_end_pos - m_begin_pos);
+
+    // Set wildcard booleans and determine type
+    if ("*" == m_value) {
+        m_has_prefix_greedy_wildcard = true;
+        m_has_suffix_greedy_wildcard = false;
+        m_has_greedy_wildcard_in_middle = false;
+        m_contains_wildcards = true;
+        m_type = Type::Wildcard;
+    } else {
+        // The emptiness checks keep an empty token from indexing/looping with length() - 1
+        m_has_prefix_greedy_wildcard = (false == m_value.empty() && '*' == m_value.front());
+        m_has_suffix_greedy_wildcard = (false == m_value.empty() && '*' == m_value.back());
+
+        m_has_greedy_wildcard_in_middle = false;
+        for (size_t i = 1; i + 1 < m_value.length(); ++i) {
+            if ('*' == m_value[i]) {
+                m_has_greedy_wildcard_in_middle = true;
+                break;
+            }
+        }
+
+        m_contains_wildcards
+                = (m_has_prefix_greedy_wildcard || m_has_suffix_greedy_wildcard
+                   || m_has_greedy_wildcard_in_middle);
+
+        if (false == is_var) {
+            if (false == m_contains_wildcards) {
+                m_type = Type::Logtype;
+            } else {
+                m_type = Type::Ambiguous;
+                m_possible_types.push_back(Type::Logtype);
+                m_possible_types.push_back(Type::DictOrIntVar);
+                m_possible_types.push_back(Type::DoubleVar);
+            }
+        } else {
+            string value_without_wildcards = m_value;
+            if (m_has_prefix_greedy_wildcard) {
+                value_without_wildcards = value_without_wildcards.substr(1);
+            }
+            if (m_has_suffix_greedy_wildcard) {
+                value_without_wildcards.resize(value_without_wildcards.length() - 1);
+            }
+
+            encoded_variable_t encoded_var;
+            bool converts_to_non_dict_var = false;
+            if (VariableEncoder::convert_string_to_representable_integer_var(
+                        value_without_wildcards,
+                        encoded_var
+                )
+                || VariableEncoder::convert_string_to_representable_double_var(
+                        value_without_wildcards,
+                        encoded_var
+                ))
+            {
+                converts_to_non_dict_var = true;
+            }
+
+            if (false == converts_to_non_dict_var) {
+                // Dictionary variable
+                m_type = Type::DictOrIntVar;
+                m_cannot_convert_to_non_dict_var = true;
+            } else {
+                m_type = Type::Ambiguous;
+                m_possible_types.push_back(Type::DictOrIntVar);
+                m_possible_types.push_back(Type::DoubleVar);
+                m_cannot_convert_to_non_dict_var = false;
+            }
+        }
+    }
+}
+
+bool QueryToken::cannot_convert_to_non_dict_var() const {
+    return m_cannot_convert_to_non_dict_var;
+}
+
+bool QueryToken::contains_wildcards() const {
+    return m_contains_wildcards;
+}
+
+bool QueryToken::has_greedy_wildcard_in_middle() const {
+    return m_has_greedy_wildcard_in_middle;
+}
+
+bool QueryToken::has_prefix_greedy_wildcard() const {
+    return m_has_prefix_greedy_wildcard;
+}
+
+bool QueryToken::has_suffix_greedy_wildcard() const {
+    return m_has_suffix_greedy_wildcard;
+}
+
+bool QueryToken::is_ambiguous_token() const {
+    return Type::Ambiguous == m_type;
+}
+
+// For an ambiguous token, the answer reflects the currently selected possible type
+bool QueryToken::is_double_var() const {
+    Type type;
+    if (Type::Ambiguous == m_type) {
+        type = m_possible_types[m_current_possible_type_ix];
+    } else {
+        type = m_type;
+    }
+    return Type::DoubleVar == type;
+}
+
+// For an ambiguous token, the answer reflects the currently selected possible type
+bool QueryToken::is_var() const {
+    Type type;
+    if (Type::Ambiguous == m_type) {
+        type = m_possible_types[m_current_possible_type_ix];
+    } else {
+        type = m_type;
+    }
+    return (Type::DictOrIntVar == type || Type::DoubleVar == type);
+}
+
+bool QueryToken::is_wildcard() const {
+
return Type::Wildcard == m_type;
+}
+
+size_t QueryToken::get_begin_pos() const {
+    return m_begin_pos;
+}
+
+size_t QueryToken::get_end_pos() const {
+    return m_end_pos;
+}
+
+string const& QueryToken::get_value() const {
+    return m_value;
+}
+
+// Advances to the next entry of m_possible_types.
+// Returns true if there was a next entry; otherwise wraps the index back to 0 and returns
+// false. Written as `ix + 1 < size()` so an empty list can't underflow `size() - 1`.
+bool QueryToken::change_to_next_possible_type() {
+    if (m_current_possible_type_ix + 1 < m_possible_types.size()) {
+        ++m_current_possible_type_ix;
+        return true;
+    } else {
+        m_current_possible_type_ix = 0;
+        return false;
+    }
+}
+
+// Local prototypes
+/**
+ * Process a QueryToken that is definitely a variable
+ * @param query_token
+ * @param var_dict
+ * @param ignore_case
+ * @param sub_query
+ * @param logtype
+ * @return true if this token might match a message, false otherwise
+ */
+static bool process_var_token(
+        QueryToken const& query_token,
+        std::shared_ptr<VariableDictionaryReader> var_dict, /*const Archive& archive,*/
+        bool ignore_case,
+        SubQuery& sub_query,
+        string& logtype
+);
+/**
+ * Finds a message matching the given query
+ * @param query
+ * @param archive
+ * @param matching_sub_query
+ * @param compressed_file
+ * @param compressed_msg
+ * @return true on success, false otherwise
+ */
+// static bool find_matching_message (const Query& query, Archive& archive, const SubQuery*&
+// matching_sub_query, File& compressed_file, Message& compressed_msg);
+/**
+ * Generates logtypes and variables for subquery
+ * @param log_dict
+ * @param var_dict
+ * @param processed_search_string
+ * @param query_tokens
+ * @param ignore_case
+ * @param sub_query
+ * @return SubQueryMatchabilityResult::SupercedesAllSubQueries
+ * @return SubQueryMatchabilityResult::WontMatch
+ * @return SubQueryMatchabilityResult::MayMatch
+ */
+static SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery(
+        std::shared_ptr<LogTypeDictionaryReader> log_dict,
+        std::shared_ptr<VariableDictionaryReader> var_dict, /*const Archive& archive,*/
+        string& processed_search_string,
+        vector<QueryToken>& query_tokens,
+        bool ignore_case,
+        SubQuery& sub_query
+);
+
+static bool process_var_token(
+        QueryToken const& query_token,
+
std::shared_ptr var_dict, /*const Archive& archive,*/ + bool ignore_case, + SubQuery& sub_query, + string& logtype +) { + // Even though we may have a precise variable, we still fallback to decompressing to ensure + // that it is in the right place in the message + sub_query.mark_wildcard_match_required(); + + // Create QueryVar corresponding to token + if (false == query_token.contains_wildcards()) { + if (EncodedVariableInterpreter::encode_and_search_dictionary( + query_token.get_value(), + *var_dict, + ignore_case, + logtype, + sub_query + ) + == false) + { + // Variable doesn't exist in dictionary + return false; + } + } else { + if (query_token.has_prefix_greedy_wildcard()) { + logtype += '*'; + } + + if (query_token.is_double_var()) { + LogTypeDictionaryEntry::add_double_var(logtype); + } else { + LogTypeDictionaryEntry::add_non_double_var(logtype); + + if (query_token.cannot_convert_to_non_dict_var()) { + // Must be a dictionary variable, so search variable dictionary + if (!EncodedVariableInterpreter::wildcard_search_dictionary_and_get_encoded_matches( + query_token.get_value(), + *var_dict, + ignore_case, + sub_query + )) + { + // Variable doesn't exist in dictionary + return false; + } + } + } + + if (query_token.has_suffix_greedy_wildcard()) { + logtype += '*'; + } + } + + return true; +} + +SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery( + std::shared_ptr log_dict, + std::shared_ptr var_dict, /*const Archive& archive,*/ + string& processed_search_string, + vector& query_tokens, + bool ignore_case, + SubQuery& sub_query +) { + size_t last_token_end_pos = 0; + string logtype; + for (auto const& query_token : query_tokens) { + // Append from end of last token to beginning of this token, to logtype + logtype.append( + processed_search_string, + last_token_end_pos, + query_token.get_begin_pos() - last_token_end_pos + ); + last_token_end_pos = query_token.get_end_pos(); + + if (query_token.is_wildcard()) { + logtype += '*'; + } else if 
(query_token.has_greedy_wildcard_in_middle()) { + // Fallback to decompression + wildcard matching for now to avoid handling queries + // where the pieces of the token on either side of each wildcard need to be + // processed as ambiguous tokens + sub_query.mark_wildcard_match_required(); + if (false == query_token.is_var()) { + logtype += '*'; + } else { + logtype += '*'; + LogTypeDictionaryEntry::add_non_double_var(logtype); + logtype += '*'; + } + } else { + if (false == query_token.is_var()) { + logtype += query_token.get_value(); + } else if (false == process_var_token(query_token, var_dict, ignore_case, sub_query, logtype)) + { + return SubQueryMatchabilityResult::WontMatch; + } + } + } + + if (last_token_end_pos < processed_search_string.length()) { + // Append from end of last token to end + logtype.append(processed_search_string, last_token_end_pos, string::npos); + last_token_end_pos = processed_search_string.length(); + } + + if ("*" == logtype) { + // Logtype will match all messages + return SubQueryMatchabilityResult::SupercedesAllSubQueries; + } + + // Find matching logtypes + std::unordered_set possible_logtype_entries; + log_dict->get_entries_matching_wildcard_string(logtype, ignore_case, possible_logtype_entries); + if (possible_logtype_entries.empty()) { + return SubQueryMatchabilityResult::WontMatch; + } + sub_query.set_possible_logtypes(possible_logtype_entries); + + // Calculate the IDs of the segments that may contain results for the sub-query now that + // we've calculated the matching logtypes and variables + // TODO: double check that this can be safely ignored for CLJ + // sub_query.calculate_ids_of_matching_segments(); + + return SubQueryMatchabilityResult::MayMatch; +} + +bool Grep::process_raw_query( + std::shared_ptr log_dict, + std::shared_ptr var_dict, /*const Archive& archive,*/ + string const& search_string, /*epochtime_t search_begin_ts, epochtime_t search_end_ts,*/ + bool ignore_case, + Query& query, /* 
compressor_frontend::lexers::ByteLexer& forward_lexer, + compressor_frontend::lexers::ByteLexer& reverse_lexer,*/ + bool add_wildcards, + bool use_heuristic +) { + // Set properties which require no processing + // query.set_search_begin_timestamp(search_begin_ts); + // query.set_search_end_timestamp(search_end_ts); + query.set_ignore_case(ignore_case); + + // Add prefix and suffix '*' to make the search a sub-string match + string processed_search_string; + if (add_wildcards) { + processed_search_string = "*"; + processed_search_string += search_string; + processed_search_string += '*'; + } else { + processed_search_string = search_string; + } + + // Clean-up search string + processed_search_string = StringUtils::clean_up_wildcard_search_string(processed_search_string); + query.set_search_string(processed_search_string); + + // Replace non-greedy wildcards with greedy wildcards since we currently have no support for + // searching compressed files with non-greedy wildcards + std::replace(processed_search_string.begin(), processed_search_string.end(), '?', '*'); + // Clean-up in case any instances of "?*" or "*?" were changed into "**" + processed_search_string = StringUtils::clean_up_wildcard_search_string(processed_search_string); + + // Split search_string into tokens with wildcards + vector query_tokens; + size_t begin_pos = 0; + size_t end_pos = 0; + bool is_var; + // FIXME: may want to use non-heuristic method of tokenizing query + // if (use_heuristic) { + while (get_bounds_of_next_potential_var(processed_search_string, begin_pos, end_pos, is_var)) { + query_tokens.emplace_back(processed_search_string, begin_pos, end_pos, is_var); + } + /*} else { + while (get_bounds_of_next_potential_var(processed_search_string, begin_pos, end_pos, + is_var, forward_lexer, reverse_lexer)) { query_tokens.emplace_back(processed_search_string, + begin_pos, end_pos, is_var); + } + }*/ + + // Get pointers to all ambiguous tokens. 
Exclude tokens with wildcards in the middle since + // we fall-back to decompression + wildcard matching for those. + vector ambiguous_tokens; + for (auto& query_token : query_tokens) { + if (false == query_token.has_greedy_wildcard_in_middle() + && query_token.is_ambiguous_token()) + { + ambiguous_tokens.push_back(&query_token); + } + } + + // Generate a sub-query for each combination of ambiguous tokens + // E.g., if there are two ambiguous tokens each of which could be a logtype or variable, we + // need to create: + // - (token1 as logtype) (token2 as logtype) + // - (token1 as logtype) (token2 as var) + // - (token1 as var) (token2 as logtype) + // - (token1 as var) (token2 as var) + SubQuery sub_query; + string logtype; + bool type_of_one_token_changed = true; + while (type_of_one_token_changed) { + sub_query.clear(); + + // Compute logtypes and variables for query + auto matchability = generate_logtypes_and_vars_for_subquery( + log_dict, + var_dict, + processed_search_string, + query_tokens, + query.get_ignore_case(), + sub_query + ); + switch (matchability) { + case SubQueryMatchabilityResult::SupercedesAllSubQueries: + // Clear all sub-queries since they will be superceded by this sub-query + query.clear_sub_queries(); + + // Since other sub-queries will be superceded by this one, we can stop + // processing now + return true; + case SubQueryMatchabilityResult::MayMatch: + query.add_sub_query(sub_query); + break; + case SubQueryMatchabilityResult::WontMatch: + default: + // Do nothing + break; + } + + // Update combination of ambiguous tokens + type_of_one_token_changed = false; + for (auto* ambiguous_token : ambiguous_tokens) { + if (ambiguous_token->change_to_next_possible_type()) { + type_of_one_token_changed = true; + break; + } + } + } + + return query.contains_sub_queries(); +} + +bool Grep::get_bounds_of_next_potential_var( + string const& value, + size_t& begin_pos, + size_t& end_pos, + bool& is_var +) { + auto const value_length = value.length(); 
+    if (end_pos >= value_length) {
+        return false;
+    }
+
+    is_var = false;
+    bool contains_wildcard = false;
+    // Keep scanning tokens until one is a definite variable or contains a wildcard
+    while (false == is_var && false == contains_wildcard && begin_pos < value_length) {
+        // Start search at end of last token
+        begin_pos = end_pos;
+
+        // Find next wildcard or non-delimiter
+        bool is_escaped = false;
+        for (; begin_pos < value_length; ++begin_pos) {
+            char c = value[begin_pos];
+
+            if (is_escaped) {
+                is_escaped = false;
+
+                // An escaped delimiter is still a delimiter and is skipped; only an escaped
+                // non-delimiter starts a token
+                if (false == StringUtils::is_delim(c)) {
+                    // Found escaped non-delimiter, so reverse the index to retain the escape
+                    // character
+                    --begin_pos;
+                    break;
+                }
+            } else if ('\\' == c) {
+                // Escape character
+                is_escaped = true;
+            } else {
+                if (StringUtils::is_wildcard(c)) {
+                    contains_wildcard = true;
+                    break;
+                }
+                if (false == StringUtils::is_delim(c)) {
+                    break;
+                }
+            }
+        }
+
+        bool contains_decimal_digit = false;
+        bool contains_alphabet = false;
+
+        // Find next delimiter
+        is_escaped = false;
+        end_pos = begin_pos;
+        for (; end_pos < value_length; ++end_pos) {
+            char c = value[end_pos];
+
+            if (is_escaped) {
+                is_escaped = false;
+
+                if (StringUtils::is_delim(c)) {
+                    // Found escaped delimiter, so reverse the index to retain the escape
+                    // character
+                    --end_pos;
+                    break;
+                }
+            } else if ('\\' == c) {
+                // Escape character
+                is_escaped = true;
+            } else {
+                if (StringUtils::is_wildcard(c)) {
+                    contains_wildcard = true;
+                } else if (StringUtils::is_delim(c)) {
+                    // Found delimiter that's not also a wildcard
+                    break;
+                }
+            }
+
+            if (StringUtils::is_decimal_digit(c)) {
+                contains_decimal_digit = true;
+            } else if (StringUtils::is_alphabet(c)) {
+                contains_alphabet = true;
+            }
+        }
+
+        // Treat token as a definite variable if:
+        // - it contains a decimal digit, or
+        // - it could be a multi-digit hex value, or
+        // - it's directly preceded by an equals sign and contains an alphabet without a
+        //   wildcard between the equals sign and the first alphabet of the token
+        if (contains_decimal_digit
+            ||
StringUtils::could_be_multi_digit_hex_value(value, begin_pos, end_pos)) + { + is_var = true; + } else if (begin_pos > 0 && '=' == value[begin_pos - 1] && contains_alphabet) { + // Find first alphabet or wildcard in token + is_escaped = false; + bool found_wildcard_before_alphabet = false; + for (auto i = begin_pos; i < end_pos; ++i) { + auto c = value[i]; + + if (is_escaped) { + is_escaped = false; + + if (StringUtils::is_alphabet(c)) { + break; + } + } else if ('\\' == c) { + // Escape character + is_escaped = true; + } else if (StringUtils::is_wildcard(c)) { + found_wildcard_before_alphabet = true; + break; + } + } + + if (false == found_wildcard_before_alphabet) { + is_var = true; + } + } + } + + return (value_length != begin_pos); +} +} // namespace clp_s::search::clp_search diff --git a/components/core/src/clp_s/search/clp_search/Grep.hpp b/components/core/src/clp_s/search/clp_search/Grep.hpp new file mode 100644 index 000000000..baf5bfcd2 --- /dev/null +++ b/components/core/src/clp_s/search/clp_search/Grep.hpp @@ -0,0 +1,54 @@ +// Code from CLP + +#ifndef CLP_S_SEARCH_CLP_SEARCH_GREP_HPP +#define CLP_S_SEARCH_CLP_SEARCH_GREP_HPP + +#include + +#include "../../Defs.hpp" +#include "../../DictionaryReader.hpp" +#include "Query.hpp" + +namespace clp_s::search::clp_search { +class Grep { +public: + // Methods + /** + * Processes a raw user query into a Query + * @param archive + * @param search_string + * @param search_begin_ts + * @param search_end_ts + * @param ignore_case + * @param query + * @return true if query may match messages, false otherwise + */ + static bool process_raw_query( + std::shared_ptr log_dict, + std::shared_ptr var_dict, + std::string const& search_string, + bool ignore_case, + Query& query, + bool add_wildcards = true, + bool use_heuristic = true + ); + + /** + * Returns bounds of next potential variable (either a definite variable or a token with + * wildcards) + * @param value String containing token + * @param begin_pos Begin position 
of last token, changes to begin position of next token + * @param end_pos End position of last token, changes to end position of next token + * @param is_var Whether the token is definitely a variable + * @return true if another potential variable was found, false otherwise + */ + static bool get_bounds_of_next_potential_var( + std::string const& value, + size_t& begin_pos, + size_t& end_pos, + bool& is_var + ); +}; +} // namespace clp_s::search::clp_search + +#endif // CLP_S_SEARCH_CLP_SEARCH_GREP_HPP diff --git a/components/core/src/clp_s/search/clp_search/Query.cpp b/components/core/src/clp_s/search/clp_search/Query.cpp new file mode 100644 index 000000000..507d7a0da --- /dev/null +++ b/components/core/src/clp_s/search/clp_search/Query.cpp @@ -0,0 +1,150 @@ +// Code from CLP + +#include "Query.hpp" + +using std::set; +using std::string; +using std::unordered_set; + +namespace clp_s::search::clp_search { +// Local function prototypes +/** + * Performs a set intersection of a & b, storing the result in b + * @tparam SetType + * @param a + * @param b + */ +template +static void inplace_set_intersection(SetType const& a, SetType& b); + +template +static void inplace_set_intersection(SetType const& a, SetType& b) { + for (auto ix = b.cbegin(); ix != b.cend();) { + if (a.count(*ix) == 0) { + ix = b.erase(ix); + } else { + ++ix; + } + } +} + +QueryVar::QueryVar(encoded_variable_t precise_non_dict_var) { + m_precise_var = precise_non_dict_var; + m_is_precise_var = true; + m_is_dict_var = false; + m_var_dict_entry = nullptr; +} + +QueryVar::QueryVar( + encoded_variable_t precise_dict_var, + VariableDictionaryEntry const* var_dict_entry +) { + m_precise_var = precise_dict_var; + m_is_precise_var = true; + m_is_dict_var = true; + m_var_dict_entry = var_dict_entry; +} + +QueryVar::QueryVar( + unordered_set const& possible_dict_vars, + unordered_set const& possible_var_dict_entries +) { + m_is_dict_var = true; + if (possible_dict_vars.size() == 1) { + // A single possible 
variable is the same as a precise variable + m_precise_var = *possible_dict_vars.cbegin(); + m_is_precise_var = true; + m_var_dict_entry = *possible_var_dict_entries.cbegin(); + } else { + m_possible_dict_vars = possible_dict_vars; + m_is_precise_var = false; + m_possible_var_dict_entries = possible_var_dict_entries; + } +} + +bool QueryVar::matches(encoded_variable_t var) const { + return (m_is_precise_var && m_precise_var == var) + || (false == m_is_precise_var && m_possible_dict_vars.count(var) > 0); +} + +void SubQuery::add_non_dict_var(encoded_variable_t precise_non_dict_var) { + m_vars.emplace_back(precise_non_dict_var); +} + +void SubQuery::add_dict_var( + encoded_variable_t precise_dict_var, + VariableDictionaryEntry const* var_dict_entry +) { + m_vars.emplace_back(precise_dict_var, var_dict_entry); +} + +void SubQuery::add_imprecise_dict_var( + unordered_set const& possible_dict_vars, + unordered_set const& possible_var_dict_entries +) { + m_vars.emplace_back(possible_dict_vars, possible_var_dict_entries); +} + +void SubQuery::set_possible_logtypes( + unordered_set const& logtype_entries +) { + m_possible_logtype_ids.clear(); + + for (auto const* entry : logtype_entries) { + m_possible_logtype_ids.insert(entry->get_id()); + } + m_possible_logtype_entries = logtype_entries; +} + +void SubQuery::mark_wildcard_match_required() { + m_wildcard_match_required = true; +} + +// Resets the subquery for reuse. set_possible_logtypes() fills the logtype IDs and the logtype +// entries together, so both are dropped here to keep them in sync. +void SubQuery::clear() { + m_vars.clear(); + m_possible_logtype_ids.clear(); + m_possible_logtype_entries.clear(); + m_wildcard_match_required = false; +} + +bool SubQuery::matches_logtype(logtype_dictionary_id_t const logtype) const { + return m_possible_logtype_ids.count(logtype) > 0; +} + +bool SubQuery::matches_vars(Span vars) const { + if (vars.size() < m_vars.size()) { + // Not enough variables to satisfy query + return false; + } + + // Try to find m_vars in vars, in order, but not necessarily contiguously + size_t possible_vars_ix = 0; + size_t const num_possible_vars = m_vars.size(); + size_t vars_ix = 0; + size_t const
num_vars = vars.size(); + while (possible_vars_ix < num_possible_vars && vars_ix < num_vars) { + QueryVar const& possible_var = m_vars[possible_vars_ix]; + + if (possible_var.matches(vars[vars_ix])) { + // Matched + ++possible_vars_ix; + ++vars_ix; + } else { + ++vars_ix; + } + } + return (num_possible_vars == possible_vars_ix); +} + +void Query::set_search_string(string const& search_string) { + m_search_string = search_string; + m_search_string_matches_all = (m_search_string.empty() || "*" == m_search_string); +} + +void Query::add_sub_query(SubQuery const& sub_query) { + m_sub_queries.push_back(sub_query); +} + +void Query::clear_sub_queries() { + m_sub_queries.clear(); +} +} // namespace clp_s::search::clp_search diff --git a/components/core/src/clp_s/search/clp_search/Query.hpp b/components/core/src/clp_s/search/clp_search/Query.hpp new file mode 100644 index 000000000..daba27dcc --- /dev/null +++ b/components/core/src/clp_s/search/clp_search/Query.hpp @@ -0,0 +1,192 @@ +// Code from CLP + +#ifndef CLP_S_SEARCH_CLP_SEARCH_QUERY_HPP +#define CLP_S_SEARCH_CLP_SEARCH_QUERY_HPP + +#include +#include +#include +#include + +#include "../../Defs.hpp" +#include "../../DictionaryEntry.hpp" +#include "../../Utils.hpp" + +namespace clp_s::search::clp_search { +/** + * Class representing a variable in a subquery. 
It can represent a precise encoded variable or + * an imprecise dictionary variable (i.e., a set of possible encoded dictionary variable IDs) + */ +class QueryVar { +public: + // Constructors + explicit QueryVar(encoded_variable_t precise_non_dict_var); + QueryVar(encoded_variable_t precise_dict_var, VariableDictionaryEntry const* var_dict_entry); + QueryVar( + std::unordered_set const& possible_dict_vars, + std::unordered_set const& possible_var_dict_entries + ); + + // Methods + /** + * Checks if the given encoded variable matches this QueryVar + * @param var + * @return true if matched, false otherwise + */ + bool matches(encoded_variable_t var) const; + + bool is_precise_var() const { return m_is_precise_var; } + + bool is_dict_var() const { return m_is_dict_var; } + + VariableDictionaryEntry const* get_var_dict_entry() const { return m_var_dict_entry; } + + std::unordered_set const& get_possible_var_dict_entries( + ) const { + return m_possible_var_dict_entries; + } + +private: + // Variables + bool m_is_precise_var; + bool m_is_dict_var; + + encoded_variable_t m_precise_var; + // Only used if the precise variable is a dictionary variable + VariableDictionaryEntry const* m_var_dict_entry; + + // Only used if the variable is an imprecise dictionary variable + std::unordered_set m_possible_dict_vars; + std::unordered_set m_possible_var_dict_entries; +}; + +/** + * Class representing a subquery (or informally, an interpretation) of a user query. It contains + * a series of possible logtypes, a set of QueryVars, and whether the query still requires + * wildcard matching after it matches an encoded message. 
+ */ +class SubQuery { +public: + // Methods + /** + * Adds a precise non-dictionary variable to the subquery + * @param precise_non_dict_var + */ + void add_non_dict_var(encoded_variable_t precise_non_dict_var); + /** + * Adds a precise dictionary variable to the subquery + * @param precise_dict_var + * @param var_dict_entry + */ + void add_dict_var( + encoded_variable_t precise_dict_var, + VariableDictionaryEntry const* var_dict_entry + ); + /** + * Adds an imprecise dictionary variable (i.e., a set of possible precise dictionary + * variables) to the subquery + * @param possible_dict_vars + * @param possible_var_dict_entries + */ + void add_imprecise_dict_var( + std::unordered_set const& possible_dict_vars, + std::unordered_set const& possible_var_dict_entries + ); + /** + * Add a set of possible logtypes to the subquery + * @param logtype_entries + */ + void set_possible_logtypes( + std::unordered_set const& logtype_entries + ); + void mark_wildcard_match_required(); + + /** + * Calculates the segment IDs that should contain a match for the subquery's current + * logtypes and QueryVars + */ + // void calculate_ids_of_matching_segments (); + + void clear(); + + bool wildcard_match_required() const { return m_wildcard_match_required; } + + size_t get_num_possible_logtypes() const { return m_possible_logtype_ids.size(); } + + std::unordered_set const& get_possible_logtype_entries() const { + return m_possible_logtype_entries; + } + + size_t get_num_possible_vars() const { return m_vars.size(); } + + std::vector const& get_vars() const { return m_vars; } + + std::set const& get_ids_of_matching_segments() const { + return m_ids_of_matching_segments; + } + + /** + * Whether the given logtype ID matches one of the possible logtypes in this subquery + * @param logtype + * @return true if matched, false otherwise + */ + bool matches_logtype(logtype_dictionary_id_t logtype) const; + /** + * Whether the given variables contain the subquery's variables in order (but not + 
* necessarily contiguously) + * @param vars + * @return true if matched, false otherwise + */ + bool matches_vars(Span vars) const; + +private: + // Variables + std::unordered_set m_possible_logtype_entries; + std::unordered_set m_possible_logtype_ids; + std::set m_ids_of_matching_segments; + std::vector m_vars; + bool m_wildcard_match_required; +}; + +/** + * Class representing a user query with potentially multiple sub-queries. + */ +class Query { +public: + // Constructors + Query() : m_ignore_case(false), m_search_string_matches_all(true) {} + + void set_ignore_case(bool ignore_case) { m_ignore_case = ignore_case; } + + void set_search_string(std::string const& search_string); + + void add_sub_query(SubQuery const& sub_query); + + void clear_sub_queries(); + + bool get_ignore_case() const { return m_ignore_case; } + + std::string const& get_search_string() const { return m_search_string; } + + /** + * Checks if the search string will match all messages (i.e., it's "" or "*") + * @return true if the search string will match all messages + * @return false otherwise + */ + bool search_string_matches_all() const { return m_search_string_matches_all; } + + std::vector const& get_sub_queries() const { return m_sub_queries; } + + bool contains_sub_queries() const { return m_sub_queries.empty() == false; } + +private: + // Variables + bool m_ignore_case; + std::string m_search_string; + std::vector m_sub_queries; + std::vector m_relevant_sub_queries; + bool m_search_string_matches_all; +}; +} // namespace clp_s::search::clp_search + +#endif // CLP_S_SEARCH_CLP_SEARCH_QUERY_HPP diff --git a/components/core/src/clp_s/search/kql/CMakeLists.txt b/components/core/src/clp_s/search/kql/CMakeLists.txt new file mode 100644 index 000000000..385bd6571 --- /dev/null +++ b/components/core/src/clp_s/search/kql/CMakeLists.txt @@ -0,0 +1,28 @@ +ANTLR_TARGET( + KqlParser + Kql.g4 + LEXER PARSER VISITOR + PACKAGE kql +) + +add_library( + kql + ../../Utils.hpp + ../AndExpr.hpp + 
../BooleanLiteral.hpp + ../ColumnDescriptor.hpp + ../DateLiteral.hpp + ../EmptyExpr.hpp + ../Expression.hpp + ../FilterExpr.hpp + ../Integral.hpp + ../NullLiteral.hpp + ../OrExpr.hpp + ../StringLiteral.hpp + ${ANTLR_KqlParser_CXX_OUTPUTS} + kql.cpp + kql.hpp +) +target_compile_features(kql PRIVATE cxx_std_17) +target_include_directories(kql PRIVATE ${ANTLR_KqlParser_OUTPUT_DIR}) +target_link_libraries(kql PRIVATE antlr4_static Boost::filesystem) diff --git a/components/core/src/clp_s/search/kql/Kql.g4 b/components/core/src/clp_s/search/kql/Kql.g4 new file mode 100644 index 000000000..2649754fa --- /dev/null +++ b/components/core/src/clp_s/search/kql/Kql.g4 @@ -0,0 +1,107 @@ +grammar Kql; + +start: query EOF ; + +query + : col=column ':' '{' q=query '}' #NestedQuery + | '(' q=query ')' #SubQuery + | NOT q=query #NotQuery + | lhs=query op=(OR | AND) rhs=query #OrAndQuery + | expression #Expr + ; + +expression + : column_range_expression + | column_value_expression + | value_expression + ; + +column_range_expression + : col=column RANGE_OPERATOR ( date_lit=DATE_LITERAL | lit=LITERAL ) + ; + +column_value_expression + : col=column ':' ( list=list_of_values | date_lit=DATE_LITERAL | lit=LITERAL ) + ; + +column: + LITERAL + ; + +value_expression + : LITERAL + ; + +list_of_values + : '(' condition=(AND | OR | NOT)? 
(literals+=LITERAL)* ')' + ; + +AND: [Aa] [Nn] [Dd] ; +OR: [Oo] [Rr] ; +NOT: [Nn] [Oo] [Tt] ; + +DATE_LITERAL: 'date(' (('"' QUOTED_CHARACTER+ '"') | QUOTED_CHARACTER+) ')' ; + +LITERAL: QUOTED_STRING | UNQUOTED_LITERAL ; + +QUOTED_STRING: '"' QUOTED_CHARACTER* '"' ; + +UNQUOTED_LITERAL: UNQUOTED_CHARACTER+ ; + +// TODO handle unicode +fragment QUOTED_CHARACTER + : ESCAPED_SPACE + | '\\"' + | ~'"' + ; + +// TODO: handle unicode +fragment UNQUOTED_CHARACTER + : ESCAPED_SPACE + | ESCAPED_SPECIAL_CHARACTER + | ESCAPED_KEYWORD + | WILDCARD + | ~[\\():<>"{} \r\n\t] + ; + +fragment WILDCARD: '*'; + +// TODO: unescape keywords +fragment ESCAPED_KEYWORD + : '\\' KEYWORD + ; + +fragment KEYWORD + : AND + | OR + | NOT + ; + + +RANGE_OPERATOR + : '<=' + | '>=' + | '<' + | '>' + ; + +fragment ESCAPED_SPECIAL_CHARACTER + : '\\' SPECIAL_CHARACTER + ; + +fragment ESCAPED_SPACE + : '\\t' + | '\\r' + | '\\n' + ; + +fragment SPECIAL_CHARACTER + : [\\():<>"*{}] + ; + + +// For unicode hex +//UNICODE: 'u' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT ; +//fragment HEXDIGIT: [0-9a-fA-F]+ ; + +SPACE: [ \t\r\n] -> skip ; diff --git a/components/core/src/clp_s/search/kql/kql.cpp b/components/core/src/clp_s/search/kql/kql.cpp new file mode 100644 index 000000000..52dc4603b --- /dev/null +++ b/components/core/src/clp_s/search/kql/kql.cpp @@ -0,0 +1,248 @@ +#include +#include +#include + +#include + +#include "KqlBaseVisitor.h" +#include "KqlLexer.h" +#include "KqlParser.h" +// If redlining may want to add ${workspaceFolder}/build/** +// to include path for vscode C/C++ utils + +#include "../../Utils.hpp" +#include "../AndExpr.hpp" +#include "../BooleanLiteral.hpp" +#include "../ColumnDescriptor.hpp" +#include "../DateLiteral.hpp" +#include "../EmptyExpr.hpp" +#include "../FilterExpr.hpp" +#include "../Integral.hpp" +#include "../NullLiteral.hpp" +#include "../OrExpr.hpp" +#include "../StringLiteral.hpp" + +using namespace antlr4; +using namespace kql; + +namespace clp_s::search::kql { +class 
ErrorListener : public BaseErrorListener { +private: + bool m_error = false; + +public: + void syntaxError( + Recognizer* recognizer, + Token* offending_symbol, + size_t line, + size_t char_position_in_line, + std::string const& msg, + std::exception_ptr e + ) override { + m_error = true; + } + + bool error() const { return m_error; } +}; + +class ParseTreeVisitor : public KqlBaseVisitor { +private: + static void + prepend_column(std::shared_ptr const& desc, DescriptorList const& prefix) { + desc->get_descriptor_list().insert(desc->descriptor_begin(), prefix.begin(), prefix.end()); + } + + void prepend_column(std::shared_ptr const& expr, DescriptorList const& prefix) { + for (auto const& op : expr->get_op_list()) { + if (auto col = std::dynamic_pointer_cast(op)) { + prepend_column(col, prefix); + } else if (auto subexpr = std::dynamic_pointer_cast(op)) { + prepend_column(subexpr, prefix); + } + } + } + +public: + static std::string unquote_string(std::string const& text) { + if (text.at(0) == '"') { + return text.substr(1, text.length() - 2); + } else { + return text; + } + } + + static std::string unquote_date_string(std::string const& text) { + // date(...) 
+ // 012345 + return unquote_string(text.substr(5, text.size() - 6)); + } + + static std::shared_ptr unquote_literal(std::string const& text) { + std::string token = unquote_string(text); + + if (auto ret = Integral::create_from_string(token)) { + return ret; + } else if (auto ret = BooleanLiteral::create_from_string(token)) { + return ret; + } else if (auto ret = NullLiteral::create_from_string(token)) { + return ret; + } else { + return StringLiteral::create(StringUtils::clean_up_wildcard_search_string(token)); + } + } + + static std::shared_ptr unquote_date_literal(std::string const& text) { + std::string token = unquote_date_string(text); + + return DateLiteral::create_from_string(token); + } + + std::any visitStart(KqlParser::StartContext* ctx) override { + // only go through first child (query) and avoid default + // behaviour of returning result from last child (EOF in this case) + return ctx->children[0]->accept(this); + } + + std::any visitColumn(KqlParser::ColumnContext* ctx) override { + std::string column = unquote_string(ctx->LITERAL()->getText()); + + std::vector descriptor_tokens; + StringUtils::tokenize_column_descriptor(column, descriptor_tokens); + + return ColumnDescriptor::create(descriptor_tokens); + } + + std::any visitNestedQuery(KqlParser::NestedQueryContext* ctx) override { + auto descriptor = std::any_cast>(ctx->col->accept(this)); + DescriptorList prefix = descriptor->get_descriptor_list(); + + auto nested_expr = std::any_cast>(ctx->q->accept(this)); + prepend_column(nested_expr, prefix); + + return nested_expr; + } + + std::any visitOrAndQuery(KqlParser::OrAndQueryContext* ctx) override { + auto lhs = std::any_cast>(ctx->lhs->accept(this)); + auto rhs = std::any_cast>(ctx->rhs->accept(this)); + if (ctx->op->getType() == KqlParser::AND) { + return AndExpr::create(lhs, rhs); + } else { + return OrExpr::create(lhs, rhs); + } + } + + std::any visitNotQuery(KqlParser::NotQueryContext* ctx) override { + auto q = 
std::any_cast>(ctx->q->accept(this)); + q->invert(); + return q; + } + + std::any visitSubQuery(KqlParser::SubQueryContext* ctx) override { + return ctx->q->accept(this); + } + + std::any visitColumn_value_expression(KqlParser::Column_value_expressionContext* ctx) override { + auto descriptor = std::any_cast>(ctx->col->accept(this)); + + if (ctx->lit) { + auto lit = unquote_literal(ctx->lit->getText()); + return FilterExpr::create(descriptor, FilterOperation::EQ, lit); + } else if (ctx->date_lit) { + auto lit = unquote_date_literal(ctx->date_lit->getText()); + return FilterExpr::create(descriptor, FilterOperation::EQ, lit); + } else /*if (ctx->list) */ { + auto list_expr = std::any_cast>(ctx->list->accept(this)); + DescriptorList prefix = descriptor->get_descriptor_list(); + prepend_column(list_expr, prefix); + return list_expr; + } + } + + std::any visitColumn_range_expression(KqlParser::Column_range_expressionContext* ctx) override { + auto descriptor = std::any_cast>(ctx->col->accept(this)); + std::shared_ptr lit; + if (ctx->lit) { + lit = unquote_literal(ctx->lit->getText()); + } else /*if (ctx->date_lit)*/ { + lit = unquote_date_literal(ctx->date_lit->getText()); + } + std::string range_op = ctx->RANGE_OPERATOR()->getText(); + + FilterOperation op = FilterOperation::EQ; + if (range_op == "<=") { + op = FilterOperation::LTE; + } else if (range_op == ">=") { + op = FilterOperation::GTE; + } else if (range_op == "<") { + op = FilterOperation::LT; + } else if (range_op == ">") { + op = FilterOperation::GT; + } + + return FilterExpr::create(descriptor, op, lit); + } + + std::any visitValue_expression(KqlParser::Value_expressionContext* ctx) override { + auto lit = unquote_literal(ctx->LITERAL()->getText()); + auto descriptor = ColumnDescriptor::create("*"); + return FilterExpr::create(descriptor, FilterOperation::EQ, lit); + } + + std::any visitList_of_values(KqlParser::List_of_valuesContext* ctx) override { + std::shared_ptr base(nullptr); + bool 
invert_each_filter = false; + if (ctx->condition) { + if (ctx->AND()) { + base = AndExpr::create(); + } else if (ctx->OR()) { + base = OrExpr::create(); + } else if (ctx->NOT()) { + invert_each_filter = true; + base = AndExpr::create(); + } + } else { + base = OrExpr::create(); + } + + auto empty_descriptor = ColumnDescriptor::create(DescriptorList()); + for (auto token : ctx->literals) { + auto literal = unquote_literal(token->getText()); + auto expr = FilterExpr::create( + empty_descriptor, + FilterOperation::EQ, + literal, + invert_each_filter + ); + base->add_operand(expr); + } + return base; + } +}; + +std::shared_ptr parse_kql_expression(std::istream& in) { + std::shared_ptr expr = EmptyExpr::create(); + ErrorListener lexer_error_listener; + ErrorListener parser_error_listener; + + ANTLRInputStream input(in); + KqlLexer lexer(&input); + lexer.addErrorListener(&lexer_error_listener); + CommonTokenStream tokens(&lexer); + KqlParser parser(&tokens); + parser.addErrorListener(&parser_error_listener); + KqlParser::StartContext* tree = parser.start(); + + if (lexer_error_listener.error()) { + std::cout << "Lexer Error" << std::endl; + return expr; + } else if (parser_error_listener.error()) { + std::cout << "Parser Error" << std::endl; + return expr; + } + + ParseTreeVisitor visitor; + expr = std::any_cast>(visitor.visitStart(tree)); + return expr; +} +} // namespace clp_s::search::kql diff --git a/components/core/src/clp_s/search/kql/kql.hpp b/components/core/src/clp_s/search/kql/kql.hpp new file mode 100644 index 000000000..ce74157fb --- /dev/null +++ b/components/core/src/clp_s/search/kql/kql.hpp @@ -0,0 +1,17 @@ +#ifndef CLP_S_SEARCH_KQL_KQL_HPP +#define CLP_S_SEARCH_KQL_KQL_HPP + +#include + +#include "../Expression.hpp" + +namespace clp_s::search::kql { +/** + * Generate a search AST from a Kibana expression in an input stream + * @param in input stream containing a Kibana expression followed by EOF + * @return a search AST + */ +std::shared_ptr 
parse_kql_expression(std::istream& in); +} // namespace clp_s::search::kql + +#endif // CLP_S_SEARCH_KQL_KQL_HPP diff --git a/components/core/submodules/abseil-cpp b/components/core/submodules/abseil-cpp new file mode 160000 index 000000000..fb3621f4f --- /dev/null +++ b/components/core/submodules/abseil-cpp @@ -0,0 +1 @@ +Subproject commit fb3621f4f897824c0dbe0615fa94543df6192f30 diff --git a/components/core/submodules/simdjson b/components/core/submodules/simdjson new file mode 160000 index 000000000..6060be2fd --- /dev/null +++ b/components/core/submodules/simdjson @@ -0,0 +1 @@ +Subproject commit 6060be2fdf62edf4a8f51a8b0883d57d09397b30 diff --git a/components/core/tools/scripts/deps-download/abseil-cpp.json b/components/core/tools/scripts/deps-download/abseil-cpp.json new file mode 100644 index 000000000..e38bf8bdb --- /dev/null +++ b/components/core/tools/scripts/deps-download/abseil-cpp.json @@ -0,0 +1,10 @@ +{ + "url": "https://github.com/abseil/abseil-cpp/archive/refs/tags/20230802.1.zip", + "unzip": true, + "targets": [ + { + "source": "abseil-cpp-20230802.1", + "destination": "submodules/abseil-cpp" + } + ] +} diff --git a/components/core/tools/scripts/deps-download/antlr4.json b/components/core/tools/scripts/deps-download/antlr4.json new file mode 100644 index 000000000..ff0d4d871 --- /dev/null +++ b/components/core/tools/scripts/deps-download/antlr4.json @@ -0,0 +1,14 @@ +{ + "url": "https://www.antlr.org/download/antlr-4.13.1-complete.jar", + "unzip": false, + "hash": { + "algo": "sha3_256", + "digest": "292ba55b3be8443777737e94841cff7a343e7067747c2cb6f58830797b20be65" + }, + "targets": [ + { + "source": "antlr-4.13.1-complete.jar", + "destination": "third-party/antlr/antlr-4.13.1-complete.jar" + } + ] +} diff --git a/components/core/tools/scripts/deps-download/download-all.sh b/components/core/tools/scripts/deps-download/download-all.sh index 3afd60536..ded2b2612 100755 --- a/components/core/tools/scripts/deps-download/download-all.sh +++ 
b/components/core/tools/scripts/deps-download/download-all.sh @@ -13,14 +13,17 @@ mkdir -p submodules # We don't use a git submodule for sqlite3 since that would require building the # sqlite amalgamation python3 "${script_dir}/download-dep.py" "${script_dir}/sqlite3.json" +python3 "${script_dir}/download-dep.py" "${script_dir}/antlr4.json" if [ -e "$project_root_dir/.git" ] ; then git submodule update --init --recursive else + python3 "${script_dir}/download-dep.py" "${script_dir}/abseil-cpp.json" python3 "${script_dir}/download-dep.py" "${script_dir}/boost-outcome.json" python3 "${script_dir}/download-dep.py" "${script_dir}/Catch2.json" python3 "${script_dir}/download-dep.py" "${script_dir}/date.json" python3 "${script_dir}/download-dep.py" "${script_dir}/json.json" python3 "${script_dir}/download-dep.py" "${script_dir}/log-surgeon.json" + python3 "${script_dir}/download-dep.py" "${script_dir}/simdjson.json" python3 "${script_dir}/download-dep.py" "${script_dir}/yaml-cpp.json" fi diff --git a/components/core/tools/scripts/deps-download/simdjson.json b/components/core/tools/scripts/deps-download/simdjson.json new file mode 100644 index 000000000..8b9999961 --- /dev/null +++ b/components/core/tools/scripts/deps-download/simdjson.json @@ -0,0 +1,11 @@ +{ + "url": "https://github.com/simdjson/simdjson/archive/refs/tags/v3.6.3.zip", + "unzip": true, + "targets": [ + { + "source": "simdjson-3.6.3", + "destination": "submodules/simdjson" + } + ] +} + diff --git a/components/core/tools/scripts/lib_install/centos7.4/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/centos7.4/install-prebuilt-packages.sh index 53ce6dc94..e9398083b 100755 --- a/components/core/tools/scripts/lib_install/centos7.4/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/centos7.4/install-prebuilt-packages.sh @@ -3,6 +3,7 @@ yum install -y \ bzip2 \ centos-release-scl \ + java-11-openjdk \ make \ openssl-devel \ openssl-static \ diff --git 
a/components/core/tools/scripts/lib_install/macos-12/install-all.sh b/components/core/tools/scripts/lib_install/macos-12/install-all.sh index d49b6ee7a..7bac11b54 100755 --- a/components/core/tools/scripts/lib_install/macos-12/install-all.sh +++ b/components/core/tools/scripts/lib_install/macos-12/install-all.sh @@ -6,6 +6,7 @@ brew install \ cmake \ fmt \ gcc \ + java11 \ libarchive \ lz4 \ mariadb-connector-c \ diff --git a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh index 01bc0d321..1fab1ccd9 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh @@ -15,6 +15,7 @@ DEBIAN_FRONTEND=noninteractive apt-get install -y \ libboost-program-options-dev \ libmariadb-dev \ libssl-dev \ + openjdk-11-jdk \ pkg-config \ python3 \ python3-pip \ diff --git a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh index 9a6125a8b..ab8382fdc 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh @@ -13,6 +13,7 @@ DEBIAN_FRONTEND=noninteractive apt-get install -y \ libboost-program-options-dev \ libmariadb-dev \ libssl-dev \ + openjdk-11-jdk \ pkg-config \ python3 \ python3-pip \ diff --git a/docs/core/clp-structured.md b/docs/core/clp-structured.md new file mode 100644 index 000000000..4c919d7f1 --- /dev/null +++ b/docs/core/clp-structured.md @@ -0,0 +1,125 @@ +# Using CLP for semi-structured logs + +For semi-structured logs (e.g., JSON), you can compress, decompress, and search them using the +`clp-s` binary described below. 
+ + ## Contents + + * [Compression](#compression) + * [Decompression](#decompression) + * [Search](#search) + * [Current limitations](#current-limitations) + + ## Compression + + Usage: + + ```shell + ./clp-s c [<options>] <archives-dir> <input-path> [<input-path> ...] + ``` + + * `archives-dir` is the directory that archives should be written to. + * `input-path` is any new-line-delimited JSON (ndjson) log file or directory containing such files. + * `options` allow you to specify things like which field should be considered as the log event's + timestamp (`--timestamp-key <field-path>`). + * For a complete list, run `./clp-s c --help` + + ### Examples + + **Compress `/mnt/logs/log1.json` and output archives to `/mnt/data/archives1`:** + + ```bash + ./clp-s c /mnt/data/archives1 /mnt/logs/log1.json + ``` + + **Treat the field `{"d": {"@timestamp": "..."}}` as each log event's timestamp:** + + ```bash + ./clp-s c --timestamp-key 'd.@timestamp' /mnt/data/archives1 /mnt/logs/log1.json + ``` + + > [!TIP] + > Specifying the timestamp-key will create a range-index for the timestamp column which can increase + > compression ratio and search performance. + + **Set the target encoded size to 1 GiB and the compression level to 6 (3 by default):** + + ```bash + ./clp-s c \ + --target-encoded-size 1073741824 \ + --compression-level 6 \ + /mnt/data/archives1 \ + /mnt/logs/log1.json + ``` + + ## Decompression + + Usage: + + ```bash + ./clp-s x <archives-dir> <output-dir> + ``` + + * `archives-dir` is a directory containing archives. + * `output-dir` is the directory that decompressed logs should be written to. + + ### Examples + + **Decompress all logs from `/mnt/data/archives1` into `/mnt/data/archives1-decomp`:** + + ```bash + ./clp-s x /mnt/data/archives1 /mnt/data/archives1-decomp + ``` + + ## Search + + Usage: + + ```bash + ./clp-s s <archives-dir> <kql-query> + ``` + + * `archives-dir` is a directory containing archives. + * `kql-query` is a [KQL][1] query.
+ + ### Examples + + **Find all log events within a time range:** + + ```bash + ./clp-s s /mnt/data/archives1 'ts >= 1649923037 AND ts <= 1649923038' + ``` + or + ```bash + ./clp-s s /mnt/data/archives1 \ + 'ts >= date("2022-04-14T07:57:17") AND ts <= date("2022-04-14T07:57:18")' + ``` + + **Find log events with a given key-value pair:** + + ```bash + ./clp-s s /mnt/data/archives1 'id: 22149' + ``` + + **Find ERROR log events containing a substring:** + + ```bash + ./clp-s s /mnt/data/archives1 'level: ERROR AND message: "job*"' + ``` + + **Find both FATAL and ERROR log events:** + + ```bash + ./clp-s s /mnt/data/archives1 'level: FATAL OR level: ERROR' + ``` + + ## Current limitations + + * `clp-s` currently only supports *valid* ndjson logs; it does not handle ndjson logs with trailing + commas or other JSON syntax errors. + * Time zone information is not preserved. + * The order of log events is not preserved. + * The input directory structure is not preserved and during decompression all files are written to + the same file. + + [1]: https://www.elastic.co/guide/en/kibana/current/kuery-query.html diff --git a/docs/core/clp-unstructured.md b/docs/core/clp-unstructured.md new file mode 100644 index 000000000..56613e799 --- /dev/null +++ b/docs/core/clp-unstructured.md @@ -0,0 +1,157 @@ +# Using CLP for unstructured logs + +For unstructured (plain text) logs, you can compress, decompress, and search them using the `clp` +and `clg` binaries described below. + +## Contents + +* [Compression](#compression) +* [Decompression](#decompression) +* [Search](#search) +* [Parallel compression](#parallel-compression) +* [Utilities](#utilities) + * [`make-dictionaries-readable`](#make-dictionaries-readable) + +## Compression + +### `clp` + +Usage: + +```shell +./clp c [<options>] <archives-dir> <input-path> [<input-path> ...] +``` + +* `archives-dir` is the directory that archives should be written to. + * `clp` will create a number of files and directories within, so it's best if this directory is + empty.
+ * You can use the same directory repeatedly and `clp` will add to the compressed logs within. +* `input-path` is any plain-text log file or directory containing such files. +* `options` allow you to specify things like a path to a custom + [schema](../../components/core/README-Schema.md) file (`--schema-path <file-path>`). + * For a complete list, run `./clp c --help` + +### Examples + +**Compress `/mnt/logs/log1.log` and output archives to `/mnt/data/archives1`:** + +```shell +./clp c /mnt/data/archives1 /mnt/logs/log1.log +``` + +**Compress `/mnt/logs/log1.log` using a custom schema specified in `/mnt/conf/schemas.txt`:** + +```shell +./clp c --schema-path /mnt/conf/schemas.txt /mnt/data/archives1 /mnt/logs/log1.log +``` + +## Decompression + +Usage: + +```shell +./clp x [<options>] <archives-dir> <output-dir> [<file-path>] +``` + +* `archives-dir` is a directory containing archives. +* `output-dir` is the directory that decompressed logs should be written to. +* `file-path` is the optional path of a specific file to decompress. + +### Examples + +**Decompress all logs from `/mnt/data/archives1` into `/mnt/data/archives1-decomp`:** + +```shell +./clp x /mnt/data/archives1 /mnt/data/archives1-decomp +``` + +**Decompress just `/mnt/logs/file1.log`:** + +```shell +./clp x /mnt/data/archives1 /mnt/data/archives1 /mnt/logs/file1.log +``` + +## Search + +Usage: + +> [!NOTE] +> Search uses a different executable (`clg`) than compression (`clp`). + +```shell +./clg [<options>] <archives-dir> <wildcard-query> [<file-path>] +``` + +* `archives-dir` is a directory containing archives. +* `wildcard-query` is a wildcard query where: + * the `*` wildcard matches 0 or more characters; + * the `?` wildcard matches any single character. +* `options` allow you to specify things like a time-range filter.
+ * For a complete list, run `./clg --help` + +### Examples + +**Search `/mnt/data/archives1` for specific ERROR logs:** + +```shell +./clg /mnt/data/archives1 " ERROR * container " +``` + +**Search for logs in a time range:** + +```shell +./clg /mnt/data/archives1 --tge 1546344654321 --tle 1546344912345 " user1 " +``` + +> [!NOTE] +> Currently, timestamps must be specified as milliseconds since the UNIX epoch. + +**Search a single file:** + +```shell +./clg /mnt/data/archives1 " session closed " /mnt/logs/file1 +``` + +## Parallel compression + +By default, `clp` uses an embedded SQLite database, so each directory containing archives can only +be accessed by a single `clp` instance. + +To enable parallel compression to the same archives directory, `clp`/`clg` can be configured to use +a MySQL-type database (e.g., MariaDB) as follows: + +* Install and configure MariaDB using the instructions for your platform. +* Create a user that has privileges to create databases, create tables, insert records, and delete + records. +* Copy and change `config/metadata-db.yml`, setting the type to `mysql` and uncommenting the MySQL + parameters. +* Install the MariaDB and PyYAML Python packages: `pip3 install mariadb PyYAML` + * This is necessary to run the database initialization script. If you prefer, you can run the SQL + statements in `tools/scripts/db/init-db.py` directly. +* Run `tools/scripts/db/init-db.py` with the updated config file. This will initialize the database + CLP requires. +* Run `clp` or `clg` as before, with the addition of the `--db-config-file` option pointing at the + updated config file. +* To compress in parallel, simply run another instance of `clp` concurrently. + +Note that currently, decompression (`clp x`) and search (`clg`) can only be run with a single +instance. We are in the process of open-sourcing parallelized versions of these as well. + +## Utilities + +Below are utilities for working with CLP archives.
+ +### `make-dictionaries-readable` + +To convert the dictionaries of an individual archive into a human-readable form, you can use +`make-dictionaries-readable`. + +```shell +./make-dictionaries-readable archive-path +``` + +* `archive-path` is a path to a specific archive (inside `archives-dir`). + +See the `make-dictionaries-readable` +[README](../../components/core/src/clp/make_dictionaries_readable/README.md) for details on the +output format.