From d81b7cc00a9de1094e0424e3d95c4c5749ebb284 Mon Sep 17 00:00:00 2001 From: syncpark Date: Tue, 6 Sep 2022 11:07:03 +0900 Subject: [PATCH] Change cli program to see debug log of REconverge 0.31.0 --- CHANGELOG.md | 57 +- Cargo.lock | 972 +++----------------------------- Cargo.toml | 24 +- src/bin/labeler.rs | 113 ---- src/bin/labtune.rs | 519 ----------------- src/cluster.rs | 561 +++++++------------ src/clustermap.rs | 20 - src/config.rs | 149 +++++ src/database.rs | 827 ---------------------------- src/dict.rs | 102 ---- src/events.rs | 539 +++--------------- src/labelmap.rs | 734 ------------------------ src/labels.rs | 603 ++++---------------- src/lib.rs | 370 ++++--------- src/{bin/cli.rs => main.rs} | 134 ++--- src/matcher.rs | 1040 ++++++----------------------------- src/parser.rs | 155 ++++++ src/threat_description.rs | 242 -------- src/tidb.rs | 135 +++++ src/weblog_parser.rs | 461 ---------------- 20 files changed, 1257 insertions(+), 6500 deletions(-) delete mode 100644 src/bin/labeler.rs delete mode 100644 src/bin/labtune.rs delete mode 100644 src/clustermap.rs create mode 100644 src/config.rs delete mode 100644 src/database.rs delete mode 100644 src/dict.rs delete mode 100644 src/labelmap.rs rename src/{bin/cli.rs => main.rs} (80%) create mode 100644 src/parser.rs delete mode 100644 src/threat_description.rs create mode 100644 src/tidb.rs delete mode 100644 src/weblog_parser.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index f4a6a58..8accf14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,59 +6,10 @@ project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] -### Added +### [Removed] +- labeler, labtune program is removed -## [0.1.2] - 2020-12-22 +### [Changed] -### Added -- labeler - - set tokens enable/disable automatically by it's benign/suspicious cluster matching ratio - - all(enabled+disabled) patterns are matched and saved it's usage statistics - - only enabled patterns are calculated as score -- labtune - - Added - import/export labels from/to threat description database(.tdb). - - Added command - `/export`, `/import [force] <.tdb file>` - -### Changed - - labtune - load/export for each thread description file in '.json' format. - - -## [0.1.1] - 2020-12-10 - -### Added -- labtune - - Added command - `/add keyword|signature`, `/remove keyword|signature`, `/remove all` -- labeler - - labeling by keyword and signature matching result. - - weight for each type of keyword, signature, token and scoring. - - default weight: 1.0 for token, 10.0 for keyword, signature -- labtune, cli: score and probability - - label scoring and probability based on score ranage - - probability: VeryLoW, Log, Medium, High, VeryHigh - -### Changed -- labtune - - `/status` command shows the keywords and signature usage too. - - `/status`, `/label` commands show the type of pattern in usage statistics too. -- database schema changed to store pattern id for each token/signature/keywords. - -## [0.1.0] - 2020-11-30 - -### Added -- cli: Added command - `/filter diff`, `/set [all]` -- cli: Added command - `/set tokens on|off` -- labtune: Added command - `/filter token` - -### Changed - -- labtune: Changed `/tokens` command to `/status` -- labtune: Show matched clusters in the `#` -- labeler: The unknown clusters can be qualified by the ipaddrs from previous suspicious and benign clusters. -- cli, labtune, labeler: Requires reconverge 0.25, review 0.10 or later -- cli, labeler: Changed web log parser and tokenizer -- cli: `/save` command will save all updated qualifiers to `cluster` table - -### Removed -- cli: `/r`, `/reverse` command removed -- cli: `/filter auto ` command removed \ No newline at end of file +- This cli program is modified to see the result of REconverge clustering diff --git a/Cargo.lock b/Cargo.lock index 5cf0e4e..a9fa29a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + [[package]] name = "aho-corasick" version = "0.7.18" @@ -44,17 +50,6 @@ dependencies = [ "nodrop", ] -[[package]] -name = "async-trait" -version = "0.1.51" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44318e776df68115a881de9a8fd1b9e53368d7a4a5ce4cc48517da3393233a5e" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "atty" version = "0.2.14" @@ -73,20 +68,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" [[package]] -name = "base64" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" - -[[package]] -name = "bigdecimal" -version = "0.2.2" +name = "bincode" +version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1e50562e37200edf7c6c43e54a08e64a5553bfb59d9c297d5572512aa517256" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" dependencies = [ - "num-bigint 0.3.3", - "num-integer", - "num-traits", + "serde", ] [[package]] @@ -95,39 +82,18 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" -[[package]] -name = "block-buffer" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" -dependencies = [ - "generic-array", -] - -[[package]] -name = "byteorder" -version = "1.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" - -[[package]] -name = "bytes" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" - -[[package]] -name = "cargo-emit" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d1d6b8077d27443822a547d1ef816eadd2dc4a75de9105aff614192729cf6d3" - [[package]] name = "cc" version = "1.0.70" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d26a6ce4b6a484fa3edb70f7efa6fc430fd2b87285fe8b84304fd0936faa0dc0" +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + [[package]] name = "cfg-if" version = "1.0.0" @@ -163,114 +129,21 @@ dependencies = [ ] [[package]] -name = "clipboard-win" -version = "4.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e4ea1881992efc993e4dc50a324cdbd03216e41bdc8385720ff47efc9bd2ca8" -dependencies = [ - "error-code", - "str-buf", - "winapi", -] - -[[package]] -name = "convert_case" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" - -[[package]] -name = "cpufeatures" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469" -dependencies = [ - "libc", -] - -[[package]] -name = "crossbeam-channel" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" -dependencies = [ - "cfg-if", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-deque" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" -dependencies = [ - "cfg-if", - "crossbeam-epoch", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" -dependencies = [ - "cfg-if", - "crossbeam-utils", - "lazy_static", - "memoffset", - "scopeguard", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" -dependencies = [ - "cfg-if", - "lazy_static", -] - -[[package]] -name = "crypto-mac" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bff07008ec701e8028e2ceb8f83f0e4274ee62bd2dbdc4fefff2e9a91824081a" -dependencies = [ - "generic-array", - "subtle", -] - -[[package]] -name = "derive_more" -version = "0.99.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40eebddd2156ce1bb37b20bbe5151340a31828b1f2d22ba4141f3531710e38df" -dependencies = [ - "convert_case", - "proc-macro2", - "quote", - "rustc_version 0.3.3", - "syn", -] - -[[package]] -name = "digest" -version = "0.9.0" +name = "crc32fast" +version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" dependencies = [ - "generic-array", + "cfg-if 1.0.0", ] [[package]] name = "dirs-next" -version = "2.0.0" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" +checksum = "cf36e65a80337bea855cd4ef9b8401ffce06a7baedf2e85ec467b1ac3f6e82b6" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "dirs-sys-next", ] @@ -291,17 +164,11 @@ version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" -[[package]] -name = "endian-type" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" - [[package]] name = "env_logger" -version = "0.8.4" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3" +checksum = "0b2cf0344971ee6c64c31be0d530793fba457d322dfec2810c453d0ef228f9c3" dependencies = [ "atty", "humantime", @@ -311,160 +178,13 @@ dependencies = [ ] [[package]] -name = "error-code" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5115567ac25674e0043e472be13d14e537f37ea8aa4bdc4aef0c89add1db1ff" -dependencies = [ - "libc", - "str-buf", -] - -[[package]] -name = "fallible-iterator" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" - -[[package]] -name = "fd-lock" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0010f02effd88c702318c5dde0463206be67495d0b4d906ba7c0a8f166cc7f06" -dependencies = [ - "libc", - "winapi", -] - -[[package]] -name = "foreign-types" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965" -dependencies = [ - "foreign-types-macros", - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-macros" -version = "0.2.1" +name = "flate2" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63f713f8b2aa9e24fec85b0e290c56caee12e3b6ae0aeeda238a75b28251afd6" +checksum = "f82b0f4c27ad9f8bfd1f3208d882da2b09c301bc1c828fd3a00d0216d2fbbff6" dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "foreign-types-shared" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7684cf33bb7f28497939e8c7cf17e3e4e3b8d9a0080ffa4f8ae2f515442ee855" - -[[package]] -name = "futures" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a12aa0eb539080d55c3f2d45a67c3b58b6b0773c1a3ca2dfec66d58c97fd66ca" -dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-channel" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5da6ba8c3bb3c165d3c7319fc1cc8304facf1fb8db99c5de877183c08a273888" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88d1c26957f23603395cd326b0ffe64124b818f4449552f960d815cfba83a53d" - -[[package]] -name = "futures-executor" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45025be030969d763025784f7f355043dc6bc74093e4ecc5000ca4dc50d8745c" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-io" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "522de2a0fe3e380f1bc577ba0474108faf3f6b18321dbf60b3b9c39a75073377" - -[[package]] -name = "futures-macro" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18e4a4b95cea4b4ccbcf1c5675ca7c4ee4e9e75eb79944d07defde18068f79bb" -dependencies = [ - "autocfg", - "proc-macro-hack", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "futures-sink" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36ea153c13024fe480590b3e3d4cad89a0cfacecc24577b68f86c6ced9c2bc11" - -[[package]] -name = "futures-task" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d3d00f4eddb73e498a54394f228cd55853bdf059259e8e7bc6e69d408892e99" - -[[package]] -name = "futures-util" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36568465210a3a6ee45e1f165136d68671471a501e632e9a98d96872222b5481" -dependencies = [ - "autocfg", - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite", - "pin-utils", - "proc-macro-hack", - "proc-macro-nested", - "slab", -] - -[[package]] -name = "generic-array" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "501466ecc8a30d1d3b7fc9229b122b2ce8ed6e9d9223f1138d4babb253e51817" -dependencies = [ - "typenum", - "version_check", + "crc32fast", + "miniz_oxide", ] [[package]] @@ -473,7 +193,7 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" dependencies = [ - "cfg-if", + "cfg-if 1.0.0", "libc", "wasi", ] @@ -493,6 +213,12 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "heck" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" + [[package]] name = "hermit-abi" version = "0.1.19" @@ -502,62 +228,12 @@ dependencies = [ "libc", ] -[[package]] -name = "hmac" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1441c6b1e930e2817404b5046f1f989899143a12bf92de603b69f4e0aee1e15" -dependencies = [ - "crypto-mac", - "digest", -] - [[package]] name = "humantime" version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" -[[package]] -name = "hyperscan" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbac3bd0a6a6f0154a2f4bb14e6975261d8fc4a94955085eb3dbd241582447f5" -dependencies = [ - "anyhow", - "bitflags", - "cfg-if", - "derive_more", - "foreign-types", - "hyperscan-sys", - "libc", - "malloc_buf", - "rustc_version 0.4.0", - "semver 1.0.4", - "thiserror", -] - -[[package]] -name = "hyperscan-sys" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d8874e95864fae528574040a4c46f45026454967de52d150a23e5ff179733bc" -dependencies = [ - "anyhow", - "cargo-emit", - "libc", - "pkg-config", -] - -[[package]] -name = "instant" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bee0328b1209d157ef001c94dd85b4f8f64139adb0eac2659f4b08382b2f474d" -dependencies = [ - "cfg-if", -] - [[package]] name = "itertools" version = "0.10.1" @@ -575,15 +251,15 @@ checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" [[package]] name = "labeler" -version = "0.1.2" +version = "0.1.4" dependencies = [ "ansi_term 0.12.1", "anyhow", - "bigdecimal", + "bincode", "chrono", "env_logger", + "flate2", "glob", - "hyperscan", "itertools", "log", "num", @@ -592,14 +268,13 @@ dependencies = [ "num-traits", "num_cpus", "percent-encoding", - "postgres", - "rayon", "regex", "rustyline", "rustyline-derive", "serde", "serde_json", "structopt", + "strum", "threadpool", ] @@ -615,42 +290,13 @@ version = "0.2.101" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3cb00336871be5ed2c8ed44b60ae9959dc5b9f08539422ed43f09e34ecaeba21" -[[package]] -name = "lock_api" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109" -dependencies = [ - "scopeguard", -] - [[package]] name = "log" version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" dependencies = [ - "cfg-if", -] - -[[package]] -name = "malloc_buf" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f32c8c0575eee8637bf462087c00098fe16d6cb621f1abb6ebab4da414d57fd" -dependencies = [ - "libc", -] - -[[package]] -name = "md-5" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b5a279bb9607f9f53c22d496eade00d138d1bdcccd07d74650387cf94942a15" -dependencies = [ - "block-buffer", - "digest", - "opaque-debug", + "cfg-if 1.0.0", ] [[package]] @@ -660,56 +306,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" [[package]] -name = "memoffset" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" -dependencies = [ - "autocfg", -] - -[[package]] -name = "mio" -version = "0.7.13" +name = "miniz_oxide" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c2bdb6314ec10835cd3293dd268473a835c02b7b352e788be788b3c6ca6bb16" +checksum = "96590ba8f175222643a85693f33d26e9c8a015f599c216509b1a6894af675d34" dependencies = [ - "libc", - "log", - "miow", - "ntapi", - "winapi", -] - -[[package]] -name = "miow" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" -dependencies = [ - "winapi", -] - -[[package]] -name = "nibble_vec" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43" -dependencies = [ - "smallvec", + "adler", ] [[package]] name = "nix" -version = "0.20.1" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df8e5e343312e7fbeb2a52139114e9e702991ef9c2aea6817ff2440b35647d56" +checksum = "83450fe6a6142ddd95fb064b746083fc4ef1705fe81f64a64e1d4b39f54a1055" dependencies = [ "bitflags", "cc", - "cfg-if", + "cfg-if 0.1.10", "libc", - "memoffset", ] [[package]] @@ -718,22 +332,13 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" -[[package]] -name = "ntapi" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" -dependencies = [ - "winapi", -] - [[package]] name = "num" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606" dependencies = [ - "num-bigint 0.4.2", + "num-bigint", "num-complex", "num-integer", "num-iter", @@ -741,17 +346,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "num-bigint" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f6f7833f2cbf2360a6cfd58cd41a53aa7a90bd4c202f5b1c7dd2ed73c57b2c3" -dependencies = [ - "autocfg", - "num-integer", - "num-traits", -] - [[package]] name = "num-bigint" version = "0.4.2" @@ -821,7 +415,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d41702bd167c2df5520b384281bc111a4b5efcf7fbc4c9c222c815b07e0a6a6a" dependencies = [ "autocfg", - "num-bigint 0.4.2", + "num-bigint", "num-integer", "num-traits", ] @@ -845,137 +439,12 @@ dependencies = [ "libc", ] -[[package]] -name = "opaque-debug" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" - -[[package]] -name = "parking_lot" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" -dependencies = [ - "instant", - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" -dependencies = [ - "cfg-if", - "instant", - "libc", - "redox_syscall", - "smallvec", - "winapi", -] - [[package]] name = "percent-encoding" version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" -[[package]] -name = "pest" -version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" -dependencies = [ - "ucd-trie", -] - -[[package]] -name = "phf" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" -dependencies = [ - "phf_shared", -] - -[[package]] -name = "phf_shared" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" -dependencies = [ - "siphasher", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "pkg-config" -version = "0.3.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" - -[[package]] -name = "postgres" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7871ee579860d8183f542e387b176a25f2656b9fb5211e045397f745a68d1c2" -dependencies = [ - "bytes", - "fallible-iterator", - "futures", - "log", - "tokio", - "tokio-postgres", -] - -[[package]] -name = "postgres-protocol" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff3e0f70d32e20923cabf2df02913be7c1842d4c772db8065c00fcfdd1d1bff3" -dependencies = [ - "base64", - "byteorder", - "bytes", - "fallible-iterator", - "hmac", - "md-5", - "memchr", - "rand", - "sha2", - "stringprep", -] - -[[package]] -name = "postgres-types" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "430f4131e1b7657b0cd9a2b0c3408d77c9a43a042d300b8c77f981dffcc43a2f" -dependencies = [ - "bytes", - "fallible-iterator", - "postgres-protocol", -] - -[[package]] -name = "ppv-lite86" -version = "0.2.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" - [[package]] name = "proc-macro-error" version = "1.0.4" @@ -1000,18 +469,6 @@ dependencies = [ "version_check", ] -[[package]] -name = "proc-macro-hack" -version = "0.5.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" - -[[package]] -name = "proc-macro-nested" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086" - [[package]] name = "proc-macro2" version = "1.0.29" @@ -1030,81 +487,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "radix_trie" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd" -dependencies = [ - "endian-type", - "nibble_vec", -] - -[[package]] -name = "rand" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", - "rand_hc", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" -dependencies = [ - "getrandom", -] - -[[package]] -name = "rand_hc" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" -dependencies = [ - "rand_core", -] - -[[package]] -name = "rayon" -version = "1.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" -dependencies = [ - "autocfg", - "crossbeam-deque", - "either", - "rayon-core", -] - -[[package]] -name = "rayon-core" -version = "1.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" -dependencies = [ - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-utils", - "lazy_static", - "num_cpus", -] - [[package]] name = "redox_syscall" version = "0.2.10" @@ -1142,41 +524,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" [[package]] -name = "rustc_version" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee" -dependencies = [ - "semver 0.11.0", -] - -[[package]] -name = "rustc_version" -version = "0.4.0" +name = "rustversion" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" -dependencies = [ - "semver 1.0.4", -] +checksum = "97477e48b4cf8603ad5f7aaf897467cf42ab4218a38ef76fb14c2d6773a6d6a8" [[package]] name = "rustyline" -version = "8.2.0" +version = "6.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbd4eaf7a7738f76c98e4f0395253ae853be3eb018f7b0bb57fe1b6c17e31874" +checksum = "6f0d5e7b0219a3eadd5439498525d4765c59b7c993ef0c12244865cd2d988413" dependencies = [ - "bitflags", - "cfg-if", - "clipboard-win", + "cfg-if 0.1.10", "dirs-next", - "fd-lock", "libc", "log", "memchr", "nix", - "radix_trie", "scopeguard", - "smallvec", "unicode-segmentation", "unicode-width", "utf8parse", @@ -1185,9 +550,9 @@ dependencies = [ [[package]] name = "rustyline-derive" -version = "0.4.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db9dfbf470021de34cfaf6983067f460ea19164934a7c2d4b92eec0968eb95f1" +checksum = "54a50e29610a5be68d4a586a5cce3bfb572ed2c2a74227e4168444b7bf4e5235" dependencies = [ "quote", "syn", @@ -1205,30 +570,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" -[[package]] -name = "semver" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" -dependencies = [ - "semver-parser", -] - -[[package]] -name = "semver" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "568a8e6258aa33c13358f81fd834adb854c6f7c9468520910a9b1e8fac068012" - -[[package]] -name = "semver-parser" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7" -dependencies = [ - "pest", -] - [[package]] name = "serde" version = "1.0.130" @@ -1260,63 +601,6 @@ dependencies = [ "serde", ] -[[package]] -name = "sha2" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b69f9a4c9740d74c5baa3fd2e547f9525fa8088a8a958e0ca2409a514e33f5fa" -dependencies = [ - "block-buffer", - "cfg-if", - "cpufeatures", - "digest", - "opaque-debug", -] - -[[package]] -name = "siphasher" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "533494a8f9b724d33625ab53c6c4800f7cc445895924a8ef649222dcb76e938b" - -[[package]] -name = "slab" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c307a32c1c5c437f38c7fd45d753050587732ba8628319fbdf12a7e289ccc590" - -[[package]] -name = "smallvec" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" - -[[package]] -name = "socket2" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "765f090f0e423d2b55843402a07915add955e7d60657db13707a159727326cad" -dependencies = [ - "libc", - "winapi", -] - -[[package]] -name = "str-buf" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d44a3643b4ff9caf57abcee9c2c621d6c03d9135e0d8b589bd9afb5992cb176a" - -[[package]] -name = "stringprep" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ee348cb74b87454fff4b551cbf727025810a004f88aeacae7f85b87f4e9a1c1" -dependencies = [ - "unicode-bidi", - "unicode-normalization", -] - [[package]] name = "strsim" version = "0.8.0" @@ -1340,7 +624,7 @@ version = "0.4.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "134d838a2c9943ac3125cf6df165eda53493451b719f3255b2a26b85f772d0ba" dependencies = [ - "heck", + "heck 0.3.3", "proc-macro-error", "proc-macro2", "quote", @@ -1348,10 +632,26 @@ dependencies = [ ] [[package]] -name = "subtle" -version = "2.4.1" +name = "strum" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" +checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.24.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" +dependencies = [ + "heck 0.4.0", + "proc-macro2", + "quote", + "rustversion", + "syn", +] [[package]] name = "syn" @@ -1382,26 +682,6 @@ dependencies = [ "unicode-width", ] -[[package]] -name = "thiserror" -version = "1.0.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "602eca064b2d83369e2b2f34b09c70b605402801927c65c11071ac911d299b88" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bad553cc2c78e8de258400763a647e80e6d1b31ee237275d756f6836d204494c" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "threadpool" version = "1.8.1" @@ -1421,100 +701,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "tinyvec" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "848a1e1181b9f6753b5e96a092749e29b11d19ede67dfbbd6c7dc7e0f49b5338" -dependencies = [ - "tinyvec_macros", -] - -[[package]] -name = "tinyvec_macros" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" - -[[package]] -name = "tokio" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4efe6fc2395938c8155973d7be49fe8d03a843726e285e100a8a383cc0154ce" -dependencies = [ - "autocfg", - "bytes", - "libc", - "memchr", - "mio", - "pin-project-lite", - "winapi", -] - -[[package]] -name = "tokio-postgres" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d2b1383c7e4fb9a09e292c7c6afb7da54418d53b045f1c1fac7a911411a2b8b" -dependencies = [ - "async-trait", - "byteorder", - "bytes", - "fallible-iterator", - "futures", - "log", - "parking_lot", - "percent-encoding", - "phf", - "pin-project-lite", - "postgres-protocol", - "postgres-types", - "socket2", - "tokio", - "tokio-util", -] - -[[package]] -name = "tokio-util" -version = "0.6.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d3725d3efa29485e87311c5b699de63cde14b00ed4d256b8318aa30ca452cd" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "log", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "typenum" -version = "1.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b63708a265f51345575b27fe43f9500ad611579e764c79edbc2037b1121959ec" - -[[package]] -name = "ucd-trie" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" - -[[package]] -name = "unicode-bidi" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "246f4c42e67e7a4e3c6106ff716a5d067d4132a642840b242e357e468a2a0085" - -[[package]] -name = "unicode-normalization" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" -dependencies = [ - "tinyvec", -] - [[package]] name = "unicode-segmentation" version = "1.8.0" diff --git a/Cargo.toml b/Cargo.toml index 57d067d..71a1b4a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,36 +1,30 @@ [package] name = "labeler" -version = "0.1.2" +version = "0.1.4" authors = ["syncpark "] edition = "2018" -[features] -default = ["hyperscan"] - [dependencies] ansi_term = "0.12" anyhow = "1.0" -bigdecimal = "0.2" +bincode = "1.3" chrono = "0.4" -env_logger = "0.8" +env_logger = "0.9" +flate2 = "1.0" glob = "0.3" itertools = "0.10" log = "0.4" +regex = "1.3" num = "0.4" num_cpus = "1" num-derive = "0.3" num-format = "0.4" num-traits = "0.2" percent-encoding = "2.1" -postgres = "0.19" -rayon = "1.5" -regex = "1.5" -rustyline = "8" -rustyline-derive = "0.4" -serde = { version = "1.0", features = ["derive"] } +rustyline = "6.2" +rustyline-derive = "0.3" serde_json = "1.0" +serde = { version = "1.0", features = ["derive"] } structopt = "0.3" +strum = { version = "0.24", features = ["derive"] } threadpool = "1.8" - -[target.'cfg(target_arch = "x86_64")'.dependencies] -hyperscan = { version = "0.2", optional = true } diff --git a/src/bin/labeler.rs b/src/bin/labeler.rs deleted file mode 100644 index de0f361..0000000 --- a/src/bin/labeler.rs +++ /dev/null @@ -1,113 +0,0 @@ -use anyhow::Result; -use labeler::matcher::TitleMatch; -use structopt::StructOpt; - -#[derive(Debug, StructOpt)] -struct Opt { - #[structopt(short, long)] - force: bool, - - #[structopt(short, long)] - model: String, - - #[structopt( - short, - long, - default_value = "host=localhost user=postgres password=postgres" - )] - dburl: String, -} - -fn main() { - env_logger::init(); - let opt = Opt::from_args(); - if let Err(e) = run(&opt.dburl, opt.force, &opt.model) { - log::error!("{:?}", e); - } -} - -/// # Errors -/// -/// Will return `Err` if database connection failed or autodb_* tables are not exist in database. -pub fn run(dburl: &str, force: bool, model_name: &str) -> Result<()> { - let mut titles = TitleMatch::init(dburl, force, model_name)?; - - // Scenario - // - // Round #1. initial step. qualify clusters by http status, cluster size - // TODO: estimate clusters by cluster size - // TODO: change the initial step to start from the outlier events. - let status = titles.qualify_unknowns_by_httpstatus(); - log::info!( - "Round #1: {} clusters are qualified by status.", - status.qualified_count() - ); - log::debug!("{}", status); - if status.qualified_count() > 0 { - titles.add_layer(status); - } - - // Round #2: extend benign and suspicious by tracing ipaddr - // TODO: referer, user-agent, time - loop { - let unknowns = titles.unknown_clusters(); - let (newly_qualified, candidates) = if unknowns == 0 { - log::info!("All clusters are qualified."); - break; - } else { - let v = titles.extend_qualification_by_httpipaddr(); - (v.qualified_count(), v) - }; - - if newly_qualified > 0 { - log::info!( - "Round #{}: {} clusters are qualified by ipaddr.", - titles.layers_depth() + 1, - newly_qualified, - ); - log::debug!("{}", candidates); - titles.add_layer(candidates); - } else { - log::info!("Auto qualification done."); - break; - } - } - - // Round #3: Labeling by the label matching - let labeled = titles.clusters_vs_labels(); - log::info!("{} clusters are labeled.", labeled.qualified_count()); - log::debug!("{}", labeled); - titles.add_layer(labeled); - - // Round #4: merge all qualification and labeling - log::info!("merging all qualification and label matching result."); - let merged = titles.merge_all_qualified(); - - // Round #5: calculate label patterns usage - // - required: cluster vs label matching result - // - TODO: make recommendation for label tokens usage => AUTO processing - // - TODO: collect ipaddr, referer, user-agent - // - TODO: calculate token, ipaddr, ... for suspicious, benign, ... clusters - log::info!("collecting label usage statistics."); - let usage = titles.collect_patterns_usage(&merged); - - // Round #6: relocate patterns - // make recommendations and relocate patterns useful flag. - // - required: qualification result, patterns usage - let updated = titles.auto_enable_disable_patterns(&usage); - if updated > 0 { - log::info!("{} label patterns are relocated.", updated); - } - - // Round #7: calculate label probability - // Calculate probability for each label - // - required: qualification result, patterns usage, relocated patterns. - titles.calculate_label_probability(); - - // Round #8: save it all - if let Err(e) = titles.labeler_save(dburl, &merged, &usage, force) { - log::error!("failed to save: {:?}", e); - } - - Ok(()) -} diff --git a/src/bin/labtune.rs b/src/bin/labtune.rs deleted file mode 100644 index c358571..0000000 --- a/src/bin/labtune.rs +++ /dev/null @@ -1,519 +0,0 @@ -use ansi_term::Style; -use anyhow::Result; -use labeler::labelmap::TopLabels; -use labeler::PatternType; -use log::info; -use rustyline::{config::Configurer, error::ReadlineError}; -use rustyline_derive::{Helper, Highlighter, Hinter, Validator}; -use std::collections::LinkedList; -use std::mem; -use structopt::StructOpt; - -#[derive(Debug, StructOpt)] -struct Opt { - #[structopt(short, long)] - model: Option, - - #[structopt( - short, - long, - default_value = "host=localhost user=postgres password=postgres" - )] - dburl: String, -} - -fn main() { - env_logger::init(); - let opt = Opt::from_args(); - if let Err(e) = run(&opt.dburl, opt.model.as_deref()) { - log::error!("{:#}", e); - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum CliCmd { - Add(PatternType), - Dict, - Disable, - Enable, - Exit, - Export(bool), - Filter(PatternType), - GoNext, - GoPrev, - Help, - Jump, - Label, - Load(bool), - Import(bool), - TryMatch, - QuitProgram, - Remove(PatternType), - RemoveLabel(bool), - Reverse, - Save(bool), - Status, - Undefined, -} - -#[derive(Helper, Hinter, Highlighter, Validator)] -struct CmdCompleter { - commands: Vec<&'static str>, -} -const CMDLIST: &[&str] = &[ - "/add keywords", - "/add signature", - "/disable", - "/enable", - "/export", - "/filter token", - "/help", - "/label", - "/load", - "/load force", - "/import", - "/import force", - "/quit", - "/remove all", - "/remove label", - "/remove keywords", - "/remove signature", - "/reverse", - "/save", - "/save force", - "/status", - "/x", -]; - -impl rustyline::completion::Completer for CmdCompleter { - type Candidate = String; - fn complete( - &self, - line: &str, - _pos: usize, - _ctx: &rustyline::Context<'_>, - ) -> rustyline::Result<(usize, Vec)> { - let out = self - .commands - .iter() - .filter_map(|cmd| { - if cmd.starts_with(line) { - Some((*cmd).to_string()) - } else { - None - } - }) - .collect(); - Ok((0, out)) - } -} - -const COMMAND_HISTORY_FILE: &str = ".labtune_history.txt"; -const PAGE_SIZE: usize = 30; - -/// # Errors -/// -/// Will return `Err` if database connection failed or labeldb_* tables are not exist in database. -#[allow(clippy::too_many_lines)] -fn run(dburl: &str, model_name: Option<&str>) -> Result<()> { - let mut toplabs = TopLabels::init(dburl, model_name)?; - - let mut rl = rustyline::Editor::::new(); - let completer = CmdCompleter { - commands: CMDLIST.to_vec(), - }; - rl.set_helper(Some(completer)); - rl.set_completion_type(rustyline::CompletionType::List); - mem::drop(rl.load_history(COMMAND_HISTORY_FILE)); - - let mut max_page: usize = toplabs.len() / PAGE_SIZE; - let labels_in_a_page = PAGE_SIZE; - let style = Style::new().reverse(); - let mut reverse: bool = false; - let mut title: String = String::from("Labels"); - let mut tag: String; - let mut curpage: Option = None; - let mut layer = 0_i32; - let mut prompt: LinkedList<(String, Option, usize)> = LinkedList::new(); - - loop { - tag = if layer > 0 { - format!("\n{} (Label #{}) ", style.paint(&title), layer) - } else if curpage.is_none() { - format!("\n{} [{}]# ", style.paint(&title), max_page + 1) - } else { - format!( - "\n{} [{}/{}]# ", - style.paint(&title), - curpage.unwrap_or(0) + 1, - max_page + 1 - ) - }; - let (cmdtype, opt) = get_user_input(&mut rl, &tag); - info!( - "Command: {:?}, option: {:?}, layer: {}", - cmdtype, opt, layer - ); - match cmdtype { - CliCmd::Add(x) => { - if layer > 0 { - if let Some(s) = opt { - if !s.trim().is_empty() { - match toplabs.update_pattern(layer, x, &s, true) { - Ok(_) => println!("{:?} \"{}\" added.", x, s), - Err(e) => println!("{:?}", e), - } - } - } - } - continue; - } - CliCmd::Dict => { - toplabs.show_dict(); - continue; - } - CliCmd::Disable => { - if let Some(s) = opt { - let v: Vec<_> = s.split(' ').collect(); - toplabs.enable_disable_tokens(layer, &v, PatternType::Token, false); - if layer > 0 { - println!("disable tokens of label #{}: {:?}", layer, v); - } else { - println!("disable tokens of all labels: {:?}", v); - } - } - continue; - } - CliCmd::Enable => { - if let Some(s) = opt { - let v: Vec<_> = s.split(' ').collect(); - toplabs.enable_disable_tokens(layer, &v, PatternType::Token, true); - if layer > 0 { - println!("enable tokens of label #{}: {:?}", layer, v); - } else { - println!("enable tokens of all labels: {:?}", v); - } - } - continue; - } - CliCmd::Exit => { - if layer == 0 { - if !prompt.is_empty() && toplabs.remove_filter().is_ok() { - let t = prompt.pop_back().unwrap(); - title = t.0; - curpage = t.1; - max_page = t.2; - } - } else { - layer = 0; - } - continue; - } - CliCmd::Export(x) => { - if x { - toplabs.export_tdb(); - } else { - let target = if let Some(s) = opt { - let v: Vec<_> = s.split(' ').collect(); - v.iter().filter_map(|i| i.parse::().ok()).collect() - } else if layer > 0 { - vec![layer] - } else { - vec![] - }; - toplabs.export_labels(&target); - } - continue; - } - CliCmd::Filter(x) => { - if let Some(s) = opt { - let len = toplabs.filter_by(x, &s); - if len == 0 { - println!("No matched labels.\n"); - } else { - prompt.push_back((title.to_string(), curpage, max_page)); - title = format!("{}({:?} = {})", title, x, s); - max_page = len / PAGE_SIZE; - curpage = None; - println!("{} matched labels.\n", len); - } - continue; - } - } - CliCmd::Help => { - show_help(); - continue; - } - CliCmd::Import(x) => { - if let Some(s) = opt { - if let Err(e) = toplabs.import(x, dburl, &s) { - println!("{:#}", e); - } else { - max_page = toplabs.len() / PAGE_SIZE; - } - } - continue; - } - CliCmd::Jump => { - if layer == 0 { - if let Some(s) = opt { - if let Ok(i) = s.parse::() { - if i > 0 { - curpage = Some(i - 1); - } - } - } - } - } - CliCmd::GoNext | CliCmd::GoPrev => { - if layer == 0 { - curpage = Some(do_goto(cmdtype, curpage, reverse)); - } - } - CliCmd::Load(x) => { - if let Some(s) = opt { - if let Err(e) = toplabs.load_labels_from_files(x, dburl, &s) { - println!("{:#}", e); - } else { - max_page = toplabs.len() / PAGE_SIZE; - } - } - continue; - } - CliCmd::Label => { - if let Some(s) = opt { - if layer == 0 { - if let Some(lab_id) = toplabs.show_labelid(&s) { - layer = lab_id; - } else { - println!("Label #{} not found.", s); - } - } - continue; - } - } - CliCmd::QuitProgram => break, - CliCmd::Remove(x) => { - if layer > 0 { - if let Some(s) = opt { - if s.parse::().is_ok() { - match toplabs.update_pattern(layer, x, &s, false) { - Ok(_) => println!("{:?} #{} removed.", x, s), - Err(e) => println!("{:?}", e), - } - } - } - } else { - println!("this command is allowed only in a label.\n"); - } - continue; - } - CliCmd::RemoveLabel(x) => { - if x { - let n = toplabs.remove_all_labels(); - println!("{} labels will be removed.", n); - } else if let Some(s) = opt { - let v: Vec<_> = s.split(' ').collect(); - for id in &v { - if let Ok(v) = id.parse::() { - if toplabs.remove_labels(v) { - println!("Will be removed #{}", v); - } else { - println!("Label #{} not found", v); - } - } - } - } - continue; - } - CliCmd::Reverse => { - reverse = !reverse; - println!("reverse = {}\n", reverse); - continue; - } - CliCmd::Save(x) => { - if let Err(e) = toplabs.save_labels(dburl, x) { - println!("{:#}", e); - } - if let Err(e) = toplabs.save_dict(dburl) { - println!("{:#}", e); - } - max_page = toplabs.len() / PAGE_SIZE; - continue; - } - CliCmd::Status => { - toplabs.show_status(); - continue; - } - CliCmd::TryMatch => { - if let Some(s) = opt { - if let Ok(_v) = s.parse::() { - //labels.try_match(&model, v); - } - } - continue; - } - CliCmd::Undefined => { - println!("Undefined command!\n"); - continue; - } - } - - if let Some(v) = curpage { - if v >= max_page { - curpage = Some(max_page); - } - } else { - curpage = Some(0); - } - - if layer == 0 { - if let Some(v) = curpage { - toplabs.list_labels(v, labels_in_a_page); - } - } else { - let s = format!("#{}", layer); - let _ = toplabs.show_labelid(&s); - } - } - - rl.save_history(COMMAND_HISTORY_FILE)?; - Ok(()) -} - -fn do_goto(cmd: CliCmd, pageno: Option, reverse: bool) -> usize { - pageno.map_or(0, |v| { - if (cmd == CliCmd::GoNext && !reverse) || (cmd == CliCmd::GoPrev && reverse) { - v + 1 - } else if v == 0 { - 0 - } else { - v - 1 - } - }) -} - -fn get_user_input(rl: &mut rustyline::Editor, tag: &str) -> (CliCmd, Option) { - let input = rl.readline(tag); - let line = match input { - Ok(l) => { - rl.add_history_entry(l.as_str()); - l - } - Err(ReadlineError::Interrupted | ReadlineError::Eof) => { - return (CliCmd::QuitProgram, None); - } - Err(_) => return (CliCmd::Undefined, None), - }; - - let line = line.trim(); - if line.trim().is_empty() { - return (CliCmd::GoNext, None); - } - - if line.len() == 1 { - match line { - "b" | "p" => return (CliCmd::GoPrev, None), - "h" | "?" => return (CliCmd::Help, None), - _ => {} - } - } - - if line.parse::().is_ok() { - return (CliCmd::Jump, Some(line.to_string())); - } else if line.starts_with('#') { - if let Some(s) = line.get(1..) { - if s.parse::().is_ok() { - return (CliCmd::Label, Some((*line).to_string())); - } - } - } - - let mut ls: Vec<&str> = line.split_whitespace().collect(); - let pattern: String; - if ls.len() > 3 { - pattern = ls[2..].join(" "); - ls.resize(2, " "); - ls.push(&pattern); - } - match &ls[..] { - ["/h" | "/help" | "/?"] => return (CliCmd::Help, None), - ["/dict"] => return (CliCmd::Dict, None), - ["/add", "keywords", x] => { - return (CliCmd::Add(PatternType::Keywords), Some((*x).to_string())) - } - ["/add", "signature", x] => { - return (CliCmd::Add(PatternType::Signature), Some((*x).to_string())) - } - ["/disable", x] => return (CliCmd::Disable, Some((*x).to_string())), - ["/disable", x, y] => return (CliCmd::Disable, Some(format!("{} {}", x, y))), - ["/enable", x] => return (CliCmd::Enable, Some((*x).to_string())), - ["/enable", x, y] => return (CliCmd::Enable, Some(format!("{} {}", x, y))), - ["/export"] => return (CliCmd::Export(true), None), - ["/export", x] => return (CliCmd::Export(false), Some((*x).to_string())), - ["/export", x, y] => return (CliCmd::Export(false), Some(format!("{} {}", x, y))), - ["/filter", "token", x] => { - return (CliCmd::Filter(PatternType::Token), Some((*x).to_string())) - } - ["/import", x] => return (CliCmd::Import(false), Some((*x).to_string())), - ["/import", "force", x] => return (CliCmd::Import(true), Some((*x).to_string())), - ["/label", x] => return (CliCmd::Label, Some((*x).to_string())), - ["/load", x] => return (CliCmd::Load(false), Some((*x).to_string())), - ["/load", "force", x] => return (CliCmd::Load(true), Some((*x).to_string())), - ["/remove", "all"] => return (CliCmd::RemoveLabel(true), None), - ["/remove", "label", x] => return (CliCmd::RemoveLabel(false), Some((*x).to_string())), - ["/remove", "keywords", x] => { - return ( - CliCmd::Remove(PatternType::Keywords), - Some((*x).to_string()), - ) - } - ["/remove", "signature", x] => { - return ( - CliCmd::Remove(PatternType::Signature), - Some((*x).to_string()), - ) - } - ["/q" | "/quit"] => return (CliCmd::QuitProgram, None), - ["/r" | "/reverse"] => return (CliCmd::Reverse, None), - ["/save"] => return (CliCmd::Save(false), None), - ["/save", "force"] => return (CliCmd::Save(true), None), - ["/status"] => return (CliCmd::Status, None), - ["/x"] => return (CliCmd::Exit, None), - _ => {} - } - - (CliCmd::Undefined, None) -} - -fn show_help() { - println!( - " - go to next page. - commands auto completion. -/b or b go back to previous page. -/r navigate reverse direction. -/x exit from label mode. -# get into the label mode and show defail information of the label. - -/load [force] load labels from files, or overwrite if force option set. -/import [force] import threat description db and overwrite same labels if force option set. -/export [ ...] export label to .json file or all labels to .tdb file. - -/add keyword|signature add keywords or signature to the label. (only in label mode.) -/disable ... disable tokens and add those tokens to dictionary. -/enable ... enable tokens and remove those tokens from dictionary. -/filter token filter labels including token. -/quit or /q quit this program. -/remove keyword|signature remove keywords or signature from the label. (only in label mode.) -/remove label ... remove the specified labels. -/remove all remove all labels. -/save [force] save or overwrite(with force option) modified things to database. - -/status show tokens usage log if exist. -/help or /? or ? show help message.\n" - ); -} -// TODO -// /match try to match the label with model data. -// /test label or token 실시간 매칭 diff --git a/src/cluster.rs b/src/cluster.rs index f082678..2a314b6 100644 --- a/src/cluster.rs +++ b/src/cluster.rs @@ -1,98 +1,78 @@ -use crate::database::AutoDb; -use crate::events::{EventTokens, MessageId}; -use crate::labelmap::TopLabels; -use crate::matcher::{ClusterMatchUp, MatchUp}; -use crate::weblog_parser::HttpStatus; -use crate::{ - CliConf, ClusterId, ConfigType, FilterOp, FilterType, LabelId, PatternId, Qualifier, - MAX_QUALIFIERS, -}; - +use crate::config::Load; +use crate::events::Events; +use crate::labels::Labels; +use crate::{CliConf, ClusterId, FilterOp, FilterType, MessageId, Qualifier, Score}; use anyhow::Result; -use num::ToPrimitive; +use log::info; use regex::Regex; -use std::collections::{HashMap, HashSet}; +use serde::Deserialize; +use std::collections::HashMap; use std::fmt; use std::str::FromStr; -const CLUSTER_ID_DISPLAY_LENGTH: usize = 50; const SIGNATURE_DISPLAY_LENGTH: usize = 200; +#[derive(Deserialize)] +struct SavedClusters { + detector_id: i32, + events_count: usize, + clusters_count: usize, + outlier_count: usize, + clusters: Vec, + outliers: Vec, +} + +#[derive(Deserialize)] +struct ClusterMember { + cluster_id: usize, + cluster_size: usize, + events: Vec, +} + +impl Load for SavedClusters {} + +impl SavedClusters { + fn cluster_ids(&self) -> Vec { + let mut clusters: Vec<_> = self.clusters.iter().map(|c| c.cluster_id).collect(); + clusters.sort_unstable(); + clusters + } + + pub fn attributes(&self) -> (i32, usize, usize, usize) { + ( + self.detector_id, + self.events_count, + self.clusters_count, + self.outlier_count, + ) + } +} + #[derive(Debug, Default, Clone)] -pub struct BaseClst { +pub struct Members { id: ClusterId, - cluster_id: String, - size: i64, - score: f64, + size: usize, + score: Score, qualifier: Qualifier, new_qualifier: Qualifier, signature: Option, event_ids: Vec, - tokens: HashMap>, // TODO: calculate token occurrences to correct label-score - status: Vec<(HttpStatus, usize)>, - ipaddrs: Vec, + // tokens: HashMap>, // TODO: calculate token occurrences to correct label-score } -impl fmt::Display for BaseClst { +impl fmt::Display for Members { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "#{} qualifier = {}, id = {}, event = {}, score = {:.4}", - self.id, - self.qualifier, - self.cluster_id(), - self.size, - self.score, - )?; - - if self.qualifier != self.new_qualifier { - write!(f, " -> {}", self.new_qualifier)?; - } - Ok(()) - } -} - -impl BaseClst { - #[must_use] - pub fn new( - id: ClusterId, - cluster_id: String, - size: i64, - score: f64, - qualifier: Qualifier, - signature: Option, - event_ids: Vec, - ) -> Self { - BaseClst { - id, - cluster_id, - size, - score, - qualifier, - new_qualifier: qualifier, - signature, - event_ids, - tokens: HashMap::new(), - status: Vec::new(), - ipaddrs: Vec::new(), - } - } - - #[must_use] - pub fn cluster_id(&self) -> String { - if self.cluster_id.len() > CLUSTER_ID_DISPLAY_LENGTH { - format!( - "{}... ({})", - self.cluster_id - .get(..CLUSTER_ID_DISPLAY_LENGTH) - .unwrap_or(""), - self.cluster_id.len() - ) + write!(f, " cluster {}", self.id)?; + if self.qualifier == self.new_qualifier { + write!(f, ", {}", self.new_qualifier)?; } else { - self.cluster_id.clone() + write!(f, ", {}<-{}", self.new_qualifier, self.qualifier)?; } + write!(f, ", {} events", self.size) } +} +impl Members { #[must_use] pub fn signature(&self) -> Option { if let Some(s) = &self.signature { @@ -110,29 +90,21 @@ impl BaseClst { } } - pub fn reset_qualifier(&mut self) -> Qualifier { - if self.qualifier != self.new_qualifier { - self.new_qualifier = self.qualifier; + pub fn set_qualifier(&mut self, qualifier: Qualifier) -> bool { + if self.new_qualifier != qualifier { + self.new_qualifier = qualifier; + return true; } - self.qualifier - } - - pub fn set_qualifier(&mut self, q: Qualifier) -> Qualifier { - self.new_qualifier = q; - q - } - - fn make_cluster_tokens(&mut self, events: &EventTokens) { - self.tokens = events.make_tokens_events_map(&self.event_ids); + false } } #[derive(Debug, Default, Clone)] pub struct Clusters { clusters: Vec, - clusters_map: HashMap, - ip_clusters_map: HashMap>, - message_cluster_map: HashMap, + _outliers: Vec, + clusters_map: HashMap, + // _message_cluster_map: HashMap, tokens_clusters_map: HashMap>, } @@ -140,62 +112,88 @@ impl Clusters { /// # Errors /// /// Will return `Err` if the query to get cluster records for the specified datasource failed. - pub fn load_clusters(&mut self, db: &mut AutoDb, model_id: i32) -> Result<()> { - let (clusters, clusters_map) = db.get_clusters(model_id)?; - - let mut message_cluster_map: HashMap = HashMap::new(); - for (cid, c) in &clusters_map { - c.event_ids.iter().for_each(|msgid| { - message_cluster_map.insert(*msgid, *cid); - }); + pub fn new(path: &str, labels: &Labels) -> Result { + let save_clusters = SavedClusters::from_path(path)?; + { + let (detector_id, events_count, clusters_count, outliers_count) = + save_clusters.attributes(); + info!( + "{} loaded. detector {}, {} events, {} clusters, {} outliers", + path, detector_id, events_count, clusters_count, outliers_count + ); } + let clusters = save_clusters.cluster_ids(); + let clusters_map: HashMap = save_clusters + .clusters + .iter() + .map(|m| { + let qualifier = if labels.is_labeled(m.cluster_id) { + Qualifier::Suspicious + } else { + Qualifier::default() + }; + ( + m.cluster_id, + Members { + id: m.cluster_id, + size: m.cluster_size, + score: 0.0, + qualifier, + new_qualifier: qualifier, + signature: None, + event_ids: m.events.clone(), + }, + ) + }) + .collect(); + + // let _message_cluster_map: HashMap = clusters_map + // .values() + // .flat_map(|c| { + // c.event_ids + // .iter() + // .map(|e| (e.to_string(), c.id)) + // .collect::>() + // }) + // .collect(); + + Ok(Self { + clusters, + _outliers: save_clusters.outliers, + clusters_map, + // _message_cluster_map, + tokens_clusters_map: HashMap::new(), + }) + } - self.clusters = clusters; - self.clusters_map = clusters_map; - self.message_cluster_map = message_cluster_map; - Ok(()) + pub fn event_ids(&self) -> Vec { + self.clusters_map + .values() + .flat_map(|c| c.event_ids.clone()) + .collect() } - /// # Errors - /// - /// Will return `Err` if the postgres connection is lost or it failed to update - /// `qualifier_id` of `cluster` table. - pub fn save_cluster_qualifier(&mut self, db: &mut AutoDb, model_id: i32) -> Result { - let mut modified: HashMap> = HashMap::new(); - self.clusters_map.iter().for_each(|(cid, c)| { - if c.qualifier != c.new_qualifier { - let qid = c.new_qualifier.to_i32().unwrap_or(2); // 2 => Unknown - if let Some(v) = modified.get_mut(&qid) { - v.push(*cid); - } else { - modified.insert(qid, vec![*cid]); + pub fn init_event_tokens(&mut self, events: &Events) { + let mut tokens_clusters_map: HashMap> = HashMap::new(); + for cd in self.clusters_map.values() { + for message_id in &cd.event_ids { + if let Some(tokens) = events.tokens(message_id) { + for token in tokens { + tokens_clusters_map + .entry(token.to_string()) + .and_modify(|cs| cs.push(cd.id)) + .or_insert_with(|| vec![cd.id]); + } } } - }); - - let mut cnt = 0_u64; - if !modified.is_empty() { - cnt = db.save_cluster_qualifier(&modified, model_id)?; - - self.clusters_map.iter_mut().for_each(|(_, c)| { - if c.qualifier != c.new_qualifier { - c.qualifier = c.new_qualifier; - } - }); } - Ok(cnt) - } - #[must_use] - pub fn find_cluster(&self, msgid: MessageId) -> Option<&ClusterId> { - self.message_cluster_map.get(&msgid) - } + for cs in tokens_clusters_map.values_mut() { + cs.sort_unstable(); + cs.dedup(); + } - pub fn clear_qualifiers(&mut self) { - self.clusters_map.iter_mut().for_each(|(_, c)| { - c.qualifier = Qualifier::default(); - c.new_qualifier = Qualifier::default(); - }); + self.tokens_clusters_map = tokens_clusters_map; } #[must_use] @@ -203,154 +201,41 @@ impl Clusters { self.clusters.len() } - #[must_use] - pub fn is_empty(&self) -> bool { - self.clusters.is_empty() - } - - pub fn get_cluster_httpstatus( - &mut self, - events: &EventTokens, - ) -> Vec<(ClusterId, Vec<(HttpStatus, usize)>)> { + pub fn size(&self, cluster_id: ClusterId) -> usize { self.clusters_map - .iter_mut() - .filter_map(|(cid, c)| { - if c.event_ids.is_empty() { - None - } else { - c.status = events.httplog_statuscodes(&c.event_ids); - Some((*cid, c.status.clone())) - } - }) - .collect() + .get(&cluster_id) + .map(|c| c.size) + .unwrap_or_default() } - // idea: - // 1. weight for each ipaddr. 확정적인 suspicious cluster에서 가져온 IP주소는 더 높은 점수 부여? #[must_use] - pub fn get_http_remoteaddrs( - &self, - candidates: &[ClusterId], - events: &EventTokens, - ) -> HashMap> { - let mut clusters_by_ipaddr: HashMap> = HashMap::new(); - if !candidates.is_empty() { - let addrs: Vec<(ClusterId, Vec)> = candidates - .iter() - .filter_map(|cid| { - self.clusters_map.get(cid).map(|c| { - let iplist = events.httplog_remoteaddrs(&c.event_ids); - (*cid, iplist) - }) - }) - .collect(); - for (cid, iplist) in &addrs { - for ipaddr in iplist.iter() { - if let Some(cluster_list) = clusters_by_ipaddr.get_mut(ipaddr) { - cluster_list.push(*cid); - } else { - clusters_by_ipaddr.insert(ipaddr.to_string(), vec![*cid]); - } - } - } - } - clusters_by_ipaddr - } - - pub fn make_clusters_ipaddr_map(&mut self, events: &EventTokens) { - let mut ip_clusters_map: HashMap> = HashMap::new(); - self.clusters_map.iter_mut().for_each(|(cid, c)| { - let iplist = events.httplog_remoteaddrs(&c.event_ids); - c.ipaddrs = iplist.clone(); - for ip in &iplist { - if let Some(cv) = ip_clusters_map.get_mut(ip) { - cv.push(*cid); - } else { - ip_clusters_map.insert(ip.to_string(), vec![*cid]); - } - } - }); - self.ip_clusters_map = ip_clusters_map; - } - - pub fn make_tokens_clusters_map(&mut self, events: &EventTokens) { - self.clusters_map.iter_mut().for_each(|(_, c)| { - c.make_cluster_tokens(events); - }); - - let mut tokens_clusters_map: HashMap> = HashMap::new(); - self.clusters_map.iter().for_each(|(cid, c)| { - c.tokens.iter().for_each(|(tok, _)| { - if let Some(cv) = tokens_clusters_map.get_mut(tok) { - cv.push(*cid); - } else { - tokens_clusters_map.insert(tok.to_string(), vec![*cid]); - } - }); - }); - for cv in tokens_clusters_map.values_mut() { - cv.sort_unstable(); - cv.dedup(); - } - self.tokens_clusters_map = tokens_clusters_map; - } - - #[must_use] - pub fn cluster_vs_labels_by_tokens(&mut self, toplabs: &TopLabels) -> ClusterMatchUp { - // match cluster tokens with label tokens (not keywords, nor signature) - // the match does not mean that the cluster is suspicious - let mut matchup_result: ClusterMatchUp = HashMap::new(); - self.clusters_map.iter_mut().for_each(|(cid, c)| { - let mut rst: HashMap> = HashMap::new(); - c.tokens.iter().for_each(|(ctok, _)| { - if let Some(labtok_ids) = toplabs.find_pattern(ctok) { - for (lab_id, pid) in labtok_ids { - if let Some(pid_list) = rst.get_mut(lab_id) { - pid_list.push(*pid); - } else { - rst.insert(*lab_id, vec![*pid]); - } - } - } - }); - - if !rst.is_empty() { - let mut mu = MatchUp::default(); - for (lab_id, pid_list) in &rst { - mu.append_matched(*lab_id, pid_list.clone(), 0.0); - } - matchup_result.insert(*cid, mu); - } - }); - matchup_result + pub fn is_empty(&self) -> bool { + self.clusters.is_empty() } - pub fn print(&self, cid: ClusterId, events: &EventTokens, cfg: &CliConf) { + pub fn print(&self, cid: ClusterId, events: &Events, cfg: &CliConf) { if let Some(c) = self.clusters_map.get(&cid) { println!("{}", c); - if cfg.show_signature == ConfigType::Signature(true) { + if cfg.is_show_signature_on() { if let Some(sig) = c.signature() { println!("signature = {}", sig); } } - if cfg.show_samples == ConfigType::Samples(true) { + if cfg.is_show_samples_on() { + let display_count = cfg.samples_count(); println!(); - events.show_samples(&c.event_ids, cfg.csv_style == ConfigType::CsvStyle(true)); - } - //log::debug!("status = {:?}", self.clusters[*cidx].status); - } - } - - #[must_use] - pub fn cluster_statistics(&self, clusters: &[ClusterId]) -> [usize; MAX_QUALIFIERS] { - let mut cnt = [0_usize; MAX_QUALIFIERS]; - for cid in clusters { - if let Some(c) = self.clusters_map.get(cid) { - let idx = c.new_qualifier.to_u32().unwrap_or(0) as usize; - cnt[idx - 1] += 1; + for (idx, message_id) in c.event_ids.iter().enumerate() { + if idx > display_count { + break; + } + if let Some(msg) = events.get_message(message_id) { + println!("{}", msg); + } else { + println!("{}", message_id); + } + } } } - cnt } #[must_use] @@ -366,52 +251,37 @@ impl Clusters { op: FilterOp, value: &str, ) -> Vec { - let score = if ft == FilterType::Score { - value.parse::().unwrap_or(0.0) - } else { - 0.0 - }; - - let count = if ft == FilterType::Count { - value.parse::().unwrap_or(0) - } else { - 0 - }; - - let qualifier = match ft { - FilterType::Qualifier => { - if let Ok(q) = Qualifier::from_str(value) { - q - } else { - return vec![]; - } - } - _ => return vec![], - }; - - let error = f64::EPSILON; clusters .iter() .filter_map(|cid| { - self.clusters_map.get(cid).and_then(|c| { + if let Some(c) = self.clusters_map.get(cid) { let matched = match ft { - FilterType::Count => match op { - FilterOp::L => c.size < count, - FilterOp::G => c.size > count, - FilterOp::Le => c.size <= count, - FilterOp::Ge => c.size >= count, - FilterOp::Eq => c.size == count, - FilterOp::Ne => c.size != count, - }, - FilterType::Score => match op { - FilterOp::L => c.score < score, - FilterOp::G => c.score > score, - FilterOp::Le => c.score <= score, - FilterOp::Ge => c.score >= score, - FilterOp::Eq => (c.score - score).abs() < error, - FilterOp::Ne => (c.score - score).abs() > error, - }, - FilterType::Qualifier => c.new_qualifier == qualifier, + FilterType::Count => { + let count = value.parse::().unwrap_or_default(); + match op { + FilterOp::L => c.size < count, + FilterOp::G => c.size > count, + FilterOp::LE => c.size <= count, + FilterOp::GE => c.size >= count, + FilterOp::EQ => c.size == count, + FilterOp::NE => c.size != count, + } + } + FilterType::Score => { + let score = value.parse::().unwrap_or_default(); + match op { + FilterOp::L => c.score < score, + FilterOp::G => c.score > score, + FilterOp::LE => c.score <= score, + FilterOp::GE => c.score >= score, + FilterOp::EQ => (c.score - score).abs() < f32::EPSILON, + FilterOp::NE => (c.score - score).abs() > f32::EPSILON, + } + } + FilterType::Qualifier => { + let qualifier = Qualifier::from_str(value).unwrap_or_default(); + c.new_qualifier == qualifier + } _ => false, }; @@ -420,75 +290,40 @@ impl Clusters { } else { None } - }) + } else { + None + } }) .collect() } - #[must_use] pub fn regex_match( &self, clusters: &[ClusterId], - re: &Regex, - events: &EventTokens, - ) -> Vec { - clusters + pattern: &str, + events: &Events, + ) -> Result> { + let re = Regex::new(pattern)?; + Ok(clusters .iter() .filter_map(|cid| { - self.clusters_map.get(cid).and_then(|c| { - if events.regex_match(re, &c.event_ids) { + if let Some(c) = self.clusters_map.get(cid) { + if events.regex_match(&re, &c.event_ids) { Some(*cid) } else { None } - }) + } else { + None + } }) - .collect() - } - - #[must_use] - pub fn is_qualifier_match(&self, cid: ClusterId, qualifier: Qualifier) -> bool { - self.clusters_map - .get(&cid) - .map_or(false, |c| c.new_qualifier == qualifier) + .collect()) } pub fn set_qualifier(&mut self, cid: ClusterId, qualifier: Qualifier) -> bool { if let Some(c) = self.clusters_map.get_mut(&cid) { - if c.new_qualifier != qualifier { - c.new_qualifier = qualifier; - return true; - } + return c.set_qualifier(qualifier); } false } - - #[must_use] - pub fn get_clusters_share_ipaddr( - &self, - clusters: &[ClusterId], - unknowns: &[ClusterId], - ) -> HashSet { - let mut ipaddrs: HashSet = HashSet::new(); - for cid in clusters { - if let Some(c) = self.clusters_map.get(cid) { - c.ipaddrs.iter().for_each(|ip| { - ipaddrs.insert(ip.to_string()); - }); - } - } - - let mut rst: HashSet = HashSet::new(); - for cid in unknowns { - if let Some(c) = self.clusters_map.get(cid) { - for ip in &c.ipaddrs { - if ipaddrs.contains(ip) { - rst.insert(*cid); - break; - } - } - } - } - rst - } } diff --git a/src/clustermap.rs b/src/clustermap.rs deleted file mode 100644 index ddeb9e0..0000000 --- a/src/clustermap.rs +++ /dev/null @@ -1,20 +0,0 @@ -use crate::database::AutoDB; -use crate::Modified; - -use anyhow::Result; -use std::collections::HashMap; -use std::fmt; - -#[derive(Default)] -pub struct TopClusters { - pub layers: Vec -} - -impl fmt::Display for TopClusters { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - OK(()) - } -} - -impl TopClusters { -} \ No newline at end of file diff --git a/src/config.rs b/src/config.rs new file mode 100644 index 0000000..711a118 --- /dev/null +++ b/src/config.rs @@ -0,0 +1,149 @@ +use crate::EventType; +use anyhow::{Context, Result}; +use serde::{Deserialize, Serialize}; +use std::{fs::File, io::BufReader, path::Path}; + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "lowercase")] +pub enum ColumnType { + Datetime, + Enum, + Float64, + Int64, + Ipaddr, + Utf8, + Binary, +} + +pub trait Load +where + for<'de> Self: Deserialize<'de> + Sized, +{ + /// # Errors + /// + /// Will return `Err` if file not found or no permission or json syntax error + fn from_path + std::fmt::Display>(path: P) -> Result { + let file = File::open(&path)?; + serde_json::from_reader(BufReader::new(file)) + .with_context(|| format!("cannot open {}", &path)) + } +} + +#[derive(Debug, Deserialize)] +pub struct Config { + event_type: EventType, + time_column: usize, + format: Vec, + input_log: String, + input_clusters: String, + input_labels: String, + tidb: String, // directory name + #[serde(default = "default_keycolumn")] + key_column: String, // must match alias field name + #[serde(default = "default_delimiter")] + delimiter: char, +} + +fn default_delimiter() -> char { + ',' +} + +fn default_keycolumn() -> String { + "uid".to_string() +} + +#[derive(Debug, Deserialize)] +struct ColumnFormat { + data_type: ColumnType, + #[serde(default = "Default::default")] + weight: f64, + format: Option, + alias: String, +} + +impl Config { + #[must_use] + pub fn event_type(&self) -> EventType { + self.event_type + } + + #[must_use] + pub fn clusters(&self) -> &str { + &self.input_clusters + } + + #[must_use] + pub fn column_len(&self) -> usize { + self.format.len() + } + + #[must_use] + pub fn delimiter(&self) -> char { + self.delimiter + } + + #[must_use] + pub fn events(&self) -> &str { + &self.input_log + } + + #[must_use] + pub fn features(&self) -> Vec { + self.format + .iter() + .enumerate() + .filter_map(|(idx, col)| { + if col.weight > 0.0 { + return Some(idx); + } + + None + }) + .collect() + } + + #[must_use] + pub fn key_field(&self) -> Option { + self.format + .iter() + .position(|column| column.alias == self.key_column) + } + + #[must_use] + pub fn time_format(&self) -> Option<&str> { + self.format.get(self.time_column).and_then(|c| { + if c.data_type == ColumnType::Datetime { + c.format.as_deref() + } else { + None + } + }) + } + + #[must_use] + pub fn labels(&self) -> &str { + &self.input_labels + } + + #[must_use] + pub fn tidb(&self) -> &str { + &self.tidb + } +} + +impl Load for Config {} + +impl Config { + /// # Panics + /// * if config file has invalid json format + #[must_use] + pub fn init(config_path: &str) -> Self { + match Config::from_path(config_path) { + Ok(c) => c, + Err(e) => { + log::error!("{:?}", e); + std::process::exit(-1); + } + } + } +} diff --git a/src/database.rs b/src/database.rs deleted file mode 100644 index add4782..0000000 --- a/src/database.rs +++ /dev/null @@ -1,827 +0,0 @@ -use crate::cluster::BaseClst; -use crate::events::{Message, MessageId, MessageStatus}; -use crate::labels::{Label, Pattern}; -use crate::matcher::{ClusterMatchUp, MatchUp}; -use crate::{ - ClusterId, DataType, DatetimeFormat, LabelId, Modified, PatternType, Qualifier, MAX_QUALIFIERS, - ORDERED_QUALIFIERS, -}; - -use anyhow::{Context, Result}; -use num::ToPrimitive; -use num_traits::FromPrimitive; -use postgres::types::Type; -use postgres::{Client, NoTls}; -use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::convert::TryFrom; -use std::fmt; -use std::str::FromStr; - -const DB_UPDATE_IN_A_QUERY: usize = 500; - -#[derive(Deserialize, Serialize)] -pub struct TokenUsage { - suspicious: i32, - benign: i32, - unknown: i32, - mixed: i32, -} - -#[derive(Debug, Default, Clone)] -pub struct Datasource { - id: i32, - model_name: String, - logtype: DataType, - datetype: DatetimeFormat, -} - -impl fmt::Display for Datasource { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "Model #{} {}, log type={:?}, datetime type={}", - self.id, self.model_name, self.logtype, self.datetype, - ) - } -} - -impl Datasource { - #[must_use] - pub fn fields(&self) -> (i32, &str, DataType, DatetimeFormat) { - (self.id, &self.model_name, self.logtype, self.datetype) - } - - #[must_use] - pub fn model_id(&self) -> i32 { - self.id - } - - pub fn set(&mut self, logtype: DataType, datetype: DatetimeFormat) { - self.logtype = logtype; - self.datetype = datetype; - } -} - -pub struct AutoDb { - conn: postgres::Client, -} - -impl AutoDb { - /// # Errors - /// - /// Will return `Err` if `cfg` does not include - /// valid hostname(or ip address), database name, user name like, - /// `host=localhost user=postgres password=postgres` - pub fn new(cfg: &str) -> Result { - let conn = Client::connect(cfg, NoTls).context("connect to database")?; - Ok(AutoDb { conn }) - } - - /// # Errors - /// - /// Will return `Err` if there's no datasource with name `model_name` - /// in `data_source` table. - pub fn get_datasource(&mut self, model_name: &str) -> Result { - let query = self.conn.prepare( - "SELECT m.id, d.data_type, m.name FROM model m, data_source d - WHERE m.name = $1 AND d.topic_name = m.topic_name", - )?; - - let row = self - .conn - .query_one(&query, &[&model_name]) - .with_context(|| format!("read model \"{}\"", model_name))?; - - let t = Datasource { - id: row.get("id"), - logtype: DataType::from_str(row.get("data_type")).unwrap_or(DataType::Unknown), - model_name: row.get("name"), - ..Datasource::default() - }; - - Ok(t) - } - - /// # Errors - /// - /// Will return `Err` if the query to get cluster records for the specified datasource failed. - pub fn get_clusters( - &mut self, - model_id: i32, - ) -> Result<(Vec, HashMap)> { - let q = self.conn.prepare( - "SELECT id, cluster_id, size::BIGINT, COALESCE(score, 0) AS score, qualifier_id, signature, event_ids::BIGINT[] - FROM cluster - WHERE model_id = $1 - ORDER BY 1" - )?; - - let mut clusters = Vec::::new(); - let mut clusters_map: HashMap = HashMap::new(); - for row in self - .conn - .query(&q, &[&model_id]) - .context("read cluster list")? - { - let id: ClusterId = row.get("id"); - let qualifier = - Qualifier::from_i32(row.get("qualifier_id")).map_or_else(Qualifier::default, |q| q); - - clusters.push(id); - clusters_map.insert( - id, - BaseClst::new( - row.get("id"), - row.get("cluster_id"), - row.get("size"), - row.get("score"), - qualifier, - row.get("signature"), - row.get("event_ids"), - ), - ); - } - - Ok((clusters, clusters_map)) - } - - /// # Errors - /// - /// Will return `Err` if the query to get raw events failed. - pub fn get_raw_events( - &mut self, - model_id: i32, - force: bool, - ) -> Result> { - let q = self.conn.prepare( - "SELECT e.message_id::bigint, e.raw_event, a.fields - FROM cluster c, event e - LEFT JOIN autodb_fields a ON a.model_id = e.model_id AND a.message_id = e.message_id - WHERE c.model_id = $1 AND e.model_id = c.model_id AND - e.message_id = ANY(c.event_ids) AND e.raw_event IS NOT NULL", - )?; - - let row = self - .conn - .query(&q, &[&model_id]) - .with_context(|| format!("read raw events for model #{}", model_id))?; - - let mut events: HashMap = HashMap::new(); - for e in &row { - let content = String::from_utf8_lossy(e.get("raw_event")).to_string(); - let message_id: MessageId = e.get("message_id"); - let fields: Option> = e.get("fields"); - let fields = if force { - vec![] - } else if let Some(f) = fields { - f - } else { - vec![] - }; - events.insert( - message_id, - Message::new(message_id, Modified::No, content, fields, vec![]), - ); - } - Ok(events) - } - - /// # Errors - /// - /// Will return `Err` if the query to get raw events failed. - pub fn get_autodb_tokens( - &mut self, - model_id: i32, - ) -> Result, MessageStatus>> { - let q = self.conn.prepare( - "SELECT message_ids::bigint[], tokens FROM autodb_tokens WHERE model_id = $1", - )?; - - let row = self - .conn - .query(&q, &[&model_id]) - .with_context(|| format!("read events tokens for model #{}", model_id))?; - - let mut tokens_events_map: HashMap, MessageStatus> = HashMap::new(); - let mut cnt = 0_usize; - for e in &row { - let message_ids: Vec = e.get("message_ids"); - let tokens: Vec = e.get("tokens"); - cnt += message_ids.len(); - tokens_events_map.insert(tokens, MessageStatus::new(Modified::No, message_ids)); - } - log::info!("{} messages have tokens.", cnt); - Ok(tokens_events_map) - } - - /// # Errors - /// - /// Will return `Err` if `cluster` table update query failed. - pub fn save_cluster_qualifier( - &mut self, - update_list: &HashMap>, - model_id: i32, - ) -> Result { - let statement = self.conn.prepare_typed( - "UPDATE cluster SET qualifier_id=$1 WHERE model_id=$2 AND id = ANY ($3)", - &[Type::INT4, Type::INT4, Type::INT4_ARRAY], - )?; - - let mut updated: u64 = 0; - for (qid, cv) in update_list { - let mut start: usize = 0; - let mut end: usize = DB_UPDATE_IN_A_QUERY; - loop { - if end > cv.len() { - end = cv.len(); - } - - let v = cv[start..end].to_vec(); - updated += self - .conn - .execute(&statement, &[qid, &model_id, &v]) - .context("update cluster qualifier")?; - - if end >= cv.len() { - break; - } - - start = end; - end += DB_UPDATE_IN_A_QUERY; - } - } - - Ok(updated) - } - - /// # Errors - /// - /// Will return `Err` if the postgres connection fails or `autodb_tokens` tables are not exist. - pub fn save_autodb_tokens( - &mut self, - model_id: i32, - events: &HashMap, - tokens_events_map: &HashMap, MessageStatus>, - force: bool, - ) -> Result { - //TODO: use "copy statement" for multiple insert in a query. - // ex) COPY categories (category) FROM STDIN (FORMAT BINARY) - - if force { - let _ = self - .conn - .execute("DELETE FROM autodb_fields WHERE model_id=$1", &[&model_id])?; - let _ = self - .conn - .execute("DELETE FROM autodb_tokens WHERE model_id=$1", &[&model_id])?; - } /* else { - let row = self.conn.query_one( - "SELECT count(*) AS cnt FROM autodb_tokens WHERE model_id=$1", - &[&model_id], - )?; - let cnt: i64 = row.get("cnt"); - if cnt > 0 { - log::info!( - "The tokens for model #{} already exist! Nothing saved.", - model_id - ); - return Ok(()); - } - } */ - - let q = self.conn.prepare_typed( - "INSERT INTO autodb_fields (model_id, message_id, fields) VALUES ($1, $2, $3)", - &[Type::INT4, Type::INT8, Type::TEXT_ARRAY], - )?; - - let mut cnt: u64 = 0; - for msg in events.values() { - if msg.is_new() { - let (id, _, _, fields, _) = msg.fields(); - cnt += self.conn.execute(&q, &[&model_id, &id, &fields])?; - } - } - log::debug!("model #{}: {} fields inserted", model_id, cnt); - - let q_update = self.conn.prepare_typed( - "UPDATE autodb_tokens set message_ids = $1 WHERE model_id = $2 AND tokens = $3", - &[Type::INT8_ARRAY, Type::INT4, Type::TEXT_ARRAY], - )?; - let q_inst = self.conn.prepare_typed( - "INSERT INTO autodb_tokens (model_id, message_ids, tokens) - VALUES ($1, $2, $3)", - &[Type::INT4, Type::INT8_ARRAY, Type::TEXT_ARRAY], - )?; - - cnt = 0; - for (toks, mss) in tokens_events_map { - if force || mss.is_new() { - cnt += self - .conn - .execute(&q_inst, &[&model_id, &mss.message_ids(), &toks])?; - } else if mss.is_modified() { - cnt += self - .conn - .execute(&q_update, &[&mss.message_ids(), &model_id, &toks])?; - } - } - Ok(cnt) - } - - /// # Errors - /// - /// Will return `Err` if the query to create autodb tables are failed. - pub fn initialize_autodb(&mut self) -> Result<()> { - let mut tables: HashMap<&str, &str> = HashMap::new(); - tables.insert( - "autodb_tokens", - "CREATE TABLE autodb_tokens ( - model_id INTEGER NOT NULL, - message_ids NUMERIC(20, 0)[], - tokens TEXT[] - )", - ); - tables.insert( - "autodb_fields", - "CREATE TABLE autodb_fields ( - model_id INTEGER NOT NULL, - message_id NUMERIC(20, 0), - fields TEXT[] - )", - ); - tables.insert( - "autodb_match", - "CREATE TABLE autodb_match ( - model_id INTEGER NOT NULL, - cluster_id INTEGER NOT NULL, - label_id INTEGER NOT NULL, - prob DOUBLE PRECISION NOT NULL, - modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP - )", - ); - for (tab, schema) in &tables { - if !self.check_table(tab) { - let q = self.conn.prepare(schema)?; - self.conn.execute(&q, &[])?; - } - } - Ok(()) - } - - fn check_table(&mut self, name: &str) -> bool { - if let Ok(row) = self.conn.query_one( - "SELECT EXISTS(SELECT FROM pg_tables WHERE tablename = $1)", - &[&name], - ) { - row.get("exists") - } else { - false - } - } - - /// # Errors - /// - /// Will return `Err` if the query to create labeldb tables are failed. - pub fn initialize_labeldb(&mut self) -> Result<()> { - let mut tables: HashMap<&str, &str> = HashMap::new(); - tables.insert( - "labeldb_labels", - "CREATE TABLE labeldb_labels ( - id INTEGER GENERATED ALWAYS AS IDENTITY (START WITH 100001), - label TEXT NOT NULL, - refers TEXT[], - description TEXT, - modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP - )", - ); - tables.insert( - "labeldb_samples", - "CREATE TABLE labeldb_samples ( - label_id INTEGER NOT NULL, - samples TEXT[] NOT NULL, - modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP - )", - ); - tables.insert( - "labeldb_tokens", - "CREATE TABLE labeldb_tokens ( - label_id INTEGER NOT NULL, - type INTEGER NOT NULL DEFAULT 1, - pattern_id INTEGER NOT NULL DEFAULT 0, - token TEXT[] NOT NULL, - use BOOLEAN NOT NULL, - prob DOUBLE PRECISION NOT NULL, - modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP - )", - ); - tables.insert( - "labeldb_disables", - "CREATE TABLE labeldb_disables ( - id SERIAL, - tokens TEXT[] NOT NULL, - modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP - )", - ); - tables.insert( - "labeldb_match", - "CREATE TABLE labeldb_match ( - model_id INTEGER NOT NULL, - cluster_id INTEGER NOT NULL, - label_id INTEGER NOT NULL, - pattern_ids INTEGER[], - prob DOUBLE PRECISION NOT NULL DEFAULT 0 - )", - ); - tables.insert( - "labeldb_tokens_usage", - "CREATE TABLE labeldb_tokens_usage ( - model_id INTEGER NOT NULL, - type INTEGER NOT NULL DEFAULT 0, - pattern TEXT NOT NULL, - usage JSON NOT NULL - )", - ); - for (tab, schema) in &tables { - if !self.check_table(tab) { - let q = self.conn.prepare(schema)?; - self.conn.execute(&q, &[])?; - } - } - Ok(()) - } - - /// # Errors - /// - /// Will return `Err` if postgres connection fails or `labeldb_*` tables are not exist. - pub fn get_labels(&mut self) -> Result<(Vec, HashMap)> { - let mut labels: Vec = Vec::new(); - let mut labels_map: HashMap = HashMap::new(); - let row = self.conn.query( - "SELECT l.id, l.label, l.refers, l.description, s.samples - FROM labeldb_labels l, labeldb_samples s - WHERE s.label_id = l.id ORDER BY 1", - &[], - )?; - for e in &row { - let id: LabelId = e.get("id"); - labels.push(id); - labels_map.insert( - id, - Label::new( - id, - e.get("label"), - e.get("description"), - Modified::No, - e.get("refers"), - e.get("samples"), - Vec::new(), - ), - ); - } - - let row = self.conn.query( - "SELECT label_id, type, pattern_id, token, use, prob FROM labeldb_tokens ORDER BY 1,2", - &[], - )?; - for e in &row { - let lab_id: LabelId = e.get("label_id"); - let pattern_id: usize = usize::from_i32(e.get("pattern_id")).unwrap_or(0); - let tokentype = - PatternType::from_i32(e.get("type")).map_or_else(PatternType::default, |t| t); - let pt = Pattern::new( - pattern_id, - tokentype, - e.get("token"), - e.get("use"), - e.get("prob"), - ); - if let Some(lab) = labels_map.get_mut(&lab_id) { - lab.add_pattern(pt); - } - } - - Ok((labels, labels_map)) - } - - /// # Errors - /// - /// Will return `Err` if same label name already exist or query fails by connection failure. - pub fn save_label(&mut self, lab: &Label) -> Result { - let (_, name, desc, _, refers, samples) = lab.fields(); - let _ = self - .conn - .execute( - "INSERT INTO labeldb_labels (label, refers, description) VALUES ($1, $2, $3)", - &[&name, &refers, &desc], - ) - .context(format!("insert new label \"{}\"", name))?; - - let row = self - .conn - .query_one("SELECT id FROM labeldb_labels WHERE label = $1", &[&name]) - .context(format!("get label id of \"{}\"", name))?; - let id: LabelId = row.get("id"); - - let _ = self - .conn - .execute( - "INSERT INTO labeldb_samples (label_id, samples) VALUES ($1, $2)", - &[&id, &samples], - ) - .context(format!("insert samples of label \"{}\"", name))?; - - let q = self.conn.prepare_typed( - "INSERT INTO labeldb_tokens (label_id, type, pattern_id, token, use, prob) - VALUES ($1, $2, $3, $4, $5, $6)", - &[ - Type::INT4, - Type::INT4, - Type::INT4, - Type::TEXT_ARRAY, - Type::BOOL, - Type::FLOAT8, - ], - )?; - - for (tt, pattern_id, tokens, useful, prob) in lab.patterns_fields() { - let tt_i32 = tt.to_i32().unwrap_or(0); - let pid_i32 = pattern_id.to_i32().unwrap_or(0); - let _ = self - .conn - .execute(&q, &[&id, &tt_i32, &pid_i32, &tokens, &useful, &prob]) - .context(format!("insert tokens for label \"{}\"", name))?; - } - Ok(id) - } - - /// # Errors - /// - /// Will return `Err` if query fails by connection failure. - pub fn remove_label(&mut self, lab_id: i32) -> Result<()> { - let _ = self - .conn - .execute("DELETE FROM labeldb_match WHERE label_id = $1", &[&lab_id])?; - let _ = self.conn.execute( - "DELETE FROM labeldb_samples WHERE label_id = $1", - &[&lab_id], - )?; - let _ = self - .conn - .execute("DELETE FROM labeldb_tokens WHERE label_id = $1", &[&lab_id])?; - let _ = self - .conn - .execute("DELETE FROM labeldb_labels WHERE id = $1", &[&lab_id])?; - Ok(()) - } - - /// # Errors - /// - /// Will return `Err` if query fails by connection failure. - pub fn update_label(&mut self, lab_id: LabelId, tokens: &[Pattern]) -> Result { - // currently only possible to update token/keywords/signature - // TODO: update labeldb_labels, labeldb_samples - let _ = self - .conn - .execute("DELETE FROM labeldb_tokens WHERE label_id = $1", &[&lab_id])?; - let q = self.conn.prepare_typed( - "INSERT INTO labeldb_tokens (label_id, type, pattern_id, token, use, prob) VALUES ($1, $2, $3, $4, $5, $6)", - &[Type::INT4, Type::INT4, Type::INT4, Type::TEXT_ARRAY, Type::BOOL, Type::FLOAT8], - )?; - - let mut cnt = 0_u64; - for tok in tokens { - let (pid, tokentype, tokens, useful, prob) = tok.fields(); - let pid_i32 = pid.to_i32().unwrap_or(0); - let tt_i32 = tokentype.to_i32().unwrap_or(0); - cnt += self - .conn - .execute(&q, &[&lab_id, &tt_i32, &pid_i32, &tokens, &useful, &prob]) - .context(format!("update label tokens of #{}", lab_id))?; - } - - Ok(cnt) - } - - /// # Errors - /// - /// Will return `Err` if postgres connection fails or `labeldb_disables` table does not exist. - #[allow(clippy::type_complexity)] - pub fn get_disables(&mut self) -> Result<(HashMap, HashMap>)> { - let mut dict: HashMap = HashMap::new(); - let mut dict_rev: HashMap> = HashMap::new(); - let row = self - .conn - .query("SELECT id, tokens FROM labeldb_disables ORDER BY 1", &[]) - .context("read dictionary")?; - for e in &row { - let id: i32 = e.get("id"); - let tokens: Vec = e.get("tokens"); - for tok in &tokens { - if !dict.contains_key(tok) { - dict.insert(tok.to_string(), id); - } - } - dict_rev.entry(id).or_insert(tokens); - } - Ok((dict, dict_rev)) - } - - /// # Errors - /// - /// Will return `Err` if postgres connection fails or `labeldb_disables` table does not exist. - pub fn update_disables( - &mut self, - dict_rev: &HashMap>, - updated: &[i32], - ) -> Result<()> { - for id in updated { - if let Some(toks) = dict_rev.get(id) { - if toks.is_empty() { - let _ = self - .conn - .execute("DELETE FROM labeldb_disables WHERE id = $1", &[&id])?; - } else { - let _ = self.conn.execute( - "UPDATE labeldb_disables SET tokens = $1 WHERE id = $2", - &[toks, &id], - )?; - } - } - } - - let mut start: usize = 0; - let mut end: usize = DB_UPDATE_IN_A_QUERY; - - if let Some(newtoks) = dict_rev.get(&0) { - loop { - if end > newtoks.len() { - end = newtoks.len(); - } - - let tv: Vec<_> = newtoks[start..end].to_vec(); - let _ = self - .conn - .execute("INSERT INTO labeldb_disables (tokens) VALUES ($1)", &[&tv]) - .context("add new words to dictionary")?; - - if end >= newtoks.len() { - break; - } - - start = end; - end += DB_UPDATE_IN_A_QUERY; - } - } - Ok(()) - } - - /// # Errors - /// - /// Will return `Err` if postgres connection fails or `labeldb_match` table does not exist. - pub fn save_match_result( - &mut self, - model_id: i32, - objs: &ClusterMatchUp, - ) -> Result<(u64, u64)> { - let _ = self - .conn - .execute("DELETE FROM labeldb_match WHERE model_id=$1", &[&model_id])?; - - let q_match = self.conn.prepare( - "INSERT INTO labeldb_match (model_id, cluster_id, label_id, pattern_ids, prob) - VALUES ($1, $2, $3, $4, $5)", - )?; - - let mut cnt_match = 0_u64; - let mut update_list: HashMap> = HashMap::new(); - for (cid, mu) in objs { - for (lab_id, (score, pid_list)) in mu.fields() { - //&mu.matched { - let pids_i32: Vec = pid_list - .iter() - .map(|pid| i32::try_from(*pid).unwrap_or(0)) - .collect(); - cnt_match += self - .conn - .execute(&q_match, &[&model_id, cid, &lab_id, &pids_i32, &score])?; - } - - let qid = mu.qualifier().to_i32().unwrap_or(2); // 2 => Qualifier::Unknown - if let Some(v) = update_list.get_mut(&qid) { - v.push(*cid); - } else { - update_list.insert(qid, vec![*cid]); - } - } - - let cnt_q = self.save_cluster_qualifier(&update_list, model_id)?; - - Ok((cnt_match, cnt_q)) - } - - /// # Errors - /// - /// Will return `Err` if postgres connection fails or `labeldb_match` table does not exist. - pub fn get_match_result(&mut self, model_id: i32) -> Result { - let q = self.conn.prepare( - "SELECT m.cluster_id, c.qualifier_id, m.label_id, m.pattern_ids, COALESCE(m.prob, 0) AS prob - FROM labeldb_match m, cluster c - WHERE m.model_id = $1 AND c.model_id = m.model_id AND c.id = m.cluster_id", - )?; - - let mut rst: ClusterMatchUp = HashMap::new(); - let row = self.conn.query(&q, &[&model_id])?; - for e in &row { - let cluster_id: ClusterId = e.get("cluster_id"); - let label_id: LabelId = e.get("label_id"); - let qualifier = Qualifier::from_i32(e.get("qualifier_id")).unwrap_or_default(); - let pattern_ids: Vec = e.get("pattern_ids"); - let prob: f64 = e.get("prob"); - - let pids: Vec<_> = pattern_ids - .iter() - .map(|i| usize::try_from(*i).unwrap_or(0)) - .collect(); - if let Some(mu) = rst.get_mut(&cluster_id) { - mu.append_matched(label_id, pids, prob); - } else { - rst.insert(cluster_id, MatchUp::new(qualifier, label_id, prob, pids)); - } - } - Ok(rst) - } - - /// # Errors - /// - /// Will return `Err` if postgres connection fails or - /// `labeldb_tokens_usage` table does not exist. - pub fn save_labeldb_tokens_usage( - &mut self, - model_id: i32, - usage: &HashMap<(PatternType, String), Vec>, - ) -> Result { - self.conn.execute( - "DELETE FROM labeldb_tokens_usage WHERE model_id=$1", - &[&model_id], - )?; - let q = self.conn.prepare( - "INSERT INTO labeldb_tokens_usage (model_id, type, pattern, usage) - VALUES ($1, $2, $3, JSON_OBJECT($4, $5))", - )?; - let mut cnt = 0_u64; - for ((tt, pattern), q_cnts) in usage { - let t = tt.to_i32().unwrap_or(0); - let mut q_array: Vec = Vec::new(); - let mut c_array: Vec = Vec::new(); - q_cnts.iter().enumerate().for_each(|(i, cnt)| { - if i < MAX_QUALIFIERS { - q_array.push(format!("{:?}", ORDERED_QUALIFIERS[i])); - c_array.push(cnt.to_string()); - } - }); - cnt += self - .conn - .execute(&q, &[&model_id, &t, pattern, &q_array, &c_array])?; - } - - Ok(cnt) - } - - /// # Errors - /// - /// Will return `Err` if `labeldb_tokens_usage` table does not exist. - pub fn get_labeldb_tokens_usage( - &mut self, - model_id: i32, - ) -> Result>> { - let mut usage: HashMap<(PatternType, String), Vec> = HashMap::new(); - let q = self.conn.prepare( - "SELECT type, pattern, (usage->>'Benign')::int AS benign, - (usage->>'Unknown')::int AS unknown, - (usage->>'Suspicious')::int AS suspicious, - (usage->>'Mixed')::int AS mixed - FROM labeldb_tokens_usage WHERE model_id=$1", - )?; - let row = self.conn.query(&q, &[&model_id])?; - for e in &row { - let ptype = PatternType::from_i32(e.get("type")).unwrap_or_default(); - let pattern: String = e.get("pattern"); - let benign = usize::from_i32(e.get("benign")).unwrap_or(0); - let unknown = usize::from_i32(e.get("unknown")).unwrap_or(0); - let suspicious = usize::from_i32(e.get("suspicious")).unwrap_or(0); - let mixed = usize::from_i32(e.get("mixed")).unwrap_or(0); - let mut v: Vec = vec![0; MAX_QUALIFIERS]; - ORDERED_QUALIFIERS.iter().enumerate().for_each(|(i, q)| { - v[i] = match q { - Qualifier::Benign => benign, - Qualifier::Unknown => unknown, - Qualifier::Suspicious => suspicious, - Qualifier::Mixed => mixed, - }; - }); - usage.insert((ptype, pattern), v); - } - Ok(usage) - } -} diff --git a/src/dict.rs b/src/dict.rs deleted file mode 100644 index bfc0860..0000000 --- a/src/dict.rs +++ /dev/null @@ -1,102 +0,0 @@ -use crate::database::AutoDb; -use anyhow::Result; -use std::collections::HashMap; -use std::fmt; - -#[derive(Default)] -pub struct Dictionary { - dict: HashMap, - dict_rev: HashMap>, - updated: Vec, -} - -impl fmt::Display for Dictionary { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - for (idx, dict) in &self.dict_rev { - writeln!(f, "{}: {:?}", idx, dict)?; - } - Ok(()) - } -} - -impl Dictionary { - #[must_use] - pub fn find(&self, token: &str) -> bool { - self.dict.contains_key(token) - } - - pub fn add(&mut self, word: &str) { - //println!("Dictionary: add \"{}\"", word); - if self.dict.contains_key(word) { - return; - } - - self.dict.insert(word.to_string(), 0); - - if let Some(v) = self.dict_rev.get_mut(&0) { - v.push(word.to_string()); - } else { - self.dict_rev.insert(0, vec![word.to_string()]); - } - } - - pub fn remove(&mut self, word: &str) { - //println!("Dictionary: remove \"{}\"", word); - if self.dict.contains_key(word) { - let id = self.dict.remove(word); - if let Some(i) = id { - if let Some(v) = self.dict_rev.get_mut(&i) { - v.retain(|w| w != word); - } - - if !self.updated.contains(&i) { - self.updated.push(i); - } - } - } - } - - /// # Errors - /// - /// will return `Err` if it fails to read `labeldb_disables` - pub fn load(&mut self, db: &mut AutoDb) -> Result<()> { - let (dict, dict_rev) = db.get_disables()?; - self.dict = dict; - self.dict_rev = dict_rev; - Ok(()) - } - - /// # Errors - /// - /// will return `Err` if it fails to update `labeldb_disables` - pub fn save(&mut self, db: &mut AutoDb) -> Result<()> { - db.update_disables(&self.dict_rev, &self.updated)?; - let delete_list: Vec = self - .updated - .iter() - .filter_map(|id| { - self.dict_rev - .get(id) - .and_then(|v| if v.is_empty() { Some(*id) } else { None }) - }) - .collect(); - for id in &delete_list { - self.dict_rev.remove(id); - } - self.updated.clear(); - if let Some(v) = self.dict_rev.get_mut(&0) { - v.clear(); - } - Ok(()) - } - - /* - pub fn collect(&mut self, db: &mut AutoDB, model_name: &str) -> Result<()> { - // collect word from datasource model - } - - pub fn export(&self, filename: &str) -> Result<()> { - // save dictionary to file - } - */ -} diff --git a/src/events.rs b/src/events.rs index 174fabc..496d591 100644 --- a/src/events.rs +++ b/src/events.rs @@ -1,438 +1,109 @@ -use crate::database::{AutoDb, Datasource}; -use crate::weblog_parser::{ - detect_weblogtype, http_reponse_status, tokenize_weblog, HttpFields, HttpStatus, TokensVector, -}; -use crate::{DataType, DatetimeFormat, Modified}; - -#[cfg(all(target_arch = "x86_64", feature = "hyperscan"))] -use anyhow::Context; -use anyhow::Result; +use crate::config::Config; +use crate::{parser, MessageId}; +use anyhow::{anyhow, Result}; +use log::info; use regex::Regex; -use std::collections::HashMap; -//use std::sync::mpsc; -//use std::sync::Arc; -//use threadpool::ThreadPool; -#[cfg(all(target_arch = "x86_64", feature = "hyperscan"))] -use hyperscan::prelude::*; -use rayon::prelude::*; -#[cfg(not(all(target_arch = "x86_64", feature = "hyperscan")))] -use regex::RegexSet; -#[cfg(not(all(target_arch = "x86_64", feature = "hyperscan")))] -use std::convert::TryFrom; -use std::sync::{Arc, RwLock}; - -const EVENTS_PER_THREAD: usize = 5000; - -pub type MessageId = i64; -pub type MessageList = Vec<(MessageId, String)>; - -#[derive(Default, Clone)] -pub struct MessageStatus { - modified: Modified, - message_ids: Vec, -} - -impl MessageStatus { - #[must_use] - pub fn new(modified: Modified, message_ids: Vec) -> Self { - MessageStatus { - modified, - message_ids, - } - } - - #[must_use] - pub fn is_new(&self) -> bool { - self.modified == Modified::New - } - - #[must_use] - pub fn is_modified(&self) -> bool { - self.modified == Modified::Yes - } - - #[must_use] - pub fn message_ids(&self) -> &[MessageId] { - &self.message_ids - } -} +use std::collections::{HashMap, HashSet}; +use std::fs::File; +use std::io::{BufRead, BufReader}; #[derive(Default, Clone)] pub struct Message { - id: MessageId, - modified: Modified, + _id: MessageId, content: String, - fields: Vec, tokens: Vec, } -impl Message { - #[must_use] - pub fn new( - id: MessageId, - modified: Modified, - content: String, - fields: Vec, - tokens: Vec, - ) -> Self { - Message { - id, - modified, - content, - fields, - tokens, - } - } - - #[must_use] - pub fn is_new(&self) -> bool { - self.modified == Modified::New - } - - fn get_number_field(&self, idx: usize) -> usize { - if self.fields.len() > idx { - self.fields[idx].parse::().unwrap_or(0) - } else { - 0 - } - } - - fn get_text_field(&self, idx: usize) -> Option { - if self.fields.len() > idx { - Some(self.fields[idx].to_string()) - } else { - None - } - } - - #[must_use] - pub fn fields(&self) -> (MessageId, Modified, &str, &[String], &[String]) { - ( - self.id, - self.modified, - &self.content, - &self.fields, - &self.tokens, - ) - } -} - #[derive(Default, Clone)] -pub struct EventTokens { +pub struct Events { events: HashMap, - tokens_events_map: HashMap, MessageStatus>, - events_ipaddr_map: HashMap, - events_httpstatus_map: HashMap, + // tokens_events_map: HashMap, Vec>, + // outliers: Vec, } -impl EventTokens { - #[must_use] - pub fn is_empty(&self) -> bool { - self.events.is_empty() - } - - #[must_use] - pub fn httplog_statuscodes(&self, cluster_events: &[MessageId]) -> Vec<(HttpStatus, usize)> { - let mut status_map: HashMap = HashMap::new(); - for id in cluster_events { - let v = if let Some(v) = self.events_httpstatus_map.get(id) { - *v - } else if let Some(evt) = self.events.get(id) { - let code = evt.get_number_field(HttpFields::Status as usize); - http_reponse_status(code) +impl Events { + /// # Panics + /// * if `key_column` field does not find in column format aliases + /// + /// # Errors + /// + /// Will return Err if it fails to open events file. + pub fn new(cfg: &Config, event_ids: Vec) -> Result { + let key_idx = cfg + .key_field() + .ok_or_else(|| anyhow!("key_field does not set"))?; + let features = cfg.features(); + let column_len = cfg.column_len(); + let delimiter = cfg.delimiter(); + let event_ids: HashSet = event_ids.into_iter().collect(); + + let file = File::open(cfg.events())?; + let lines = BufReader::new(file).lines(); + let mut events = HashMap::new(); + let mut skipped = 0; + let mut notfound = 0; + for line in lines.flatten() { + let log: Vec<_> = line.split(delimiter).collect(); + if log.len() != column_len { + skipped += 1; + continue; + } + let key = if let Some(key) = log.get(key_idx) { + if event_ids.contains(*key) { + key + } else { + notfound += 1; + continue; + } } else { + notfound += 1; continue; }; - if let Some(cnt) = status_map.get_mut(&v) { - *cnt += 1; - } else { - status_map.insert(v, 1); - } - } - status_map.iter().map(|(st, cnt)| (*st, *cnt)).collect() - } - - pub fn make_events_httpstatus_map(&mut self) { - let mut events_httpstatus_map: HashMap = HashMap::new(); - self.events.iter().for_each(|(msgid, evt)| { - let status = evt.get_number_field(HttpFields::Status as usize); - if status > 0 { - events_httpstatus_map.insert(*msgid, http_reponse_status(status)); - } - }); - self.events_httpstatus_map = events_httpstatus_map; - } - - #[must_use] - pub fn httplog_remoteaddrs(&self, cluster_events: &[MessageId]) -> Vec { - let mut ipaddr: Vec = cluster_events - .iter() - .filter_map(|id| { - self.events_ipaddr_map.get(id).map_or_else( - || { - self.events - .get(id) - .and_then(|evt| evt.get_text_field(HttpFields::RemoteAddr as usize)) - }, - |v| Some(v.to_string()), - ) - }) - .collect(); - ipaddr.sort(); - ipaddr.dedup(); - ipaddr - } - - pub fn make_events_ipaddr_map(&mut self) { - let mut events_ipaddr_map: HashMap = HashMap::new(); - self.events.iter().for_each(|(msgid, evt)| { - if let Some(ip) = evt.get_text_field(HttpFields::RemoteAddr as usize) { - events_ipaddr_map.insert(*msgid, ip); - } - }); - self.events_ipaddr_map = events_ipaddr_map; - } - - #[allow(dead_code)] - fn get_events_localtime( - &self, - event_ids: &[MessageId], - datetype: DatetimeFormat, - ) -> Vec<(MessageId, i64)> { - let mut localtimes = Vec::<(MessageId, i64)>::new(); - for msgid in event_ids { - if let Some(evt) = self.events.get(msgid) { - if let Some(t) = evt.get_text_field(HttpFields::TimeLocal as usize) { - localtimes.push((*msgid, datetype.parse_datetime(&t))); + let mut tokens = Vec::new(); + for feature_idx in &features { + if let Some(value) = log.get(*feature_idx) { + tokens.extend(parser::extract_tokens(value)); } } - } - localtimes + events.insert( + (*key).to_string(), + Message { + _id: (*key).to_string(), + content: line, + tokens, + }, + ); + } + info!("{} skipped events, {} not found", skipped, notfound); + + // let mut tokens_events_map: HashMap, Vec> = HashMap::new(); + // for (id, msg) in &events { + // tokens_events_map + // .entry(msg.tokens.clone()) + // .and_modify(|message_ids| message_ids.push(id.to_string())) + // .or_insert(vec![id.to_string()]); + // } + + Ok(Self { + events, + // tokens_events_map, + // outliers: Vec::new(), + }) } #[must_use] - pub fn make_tokens_events_map( - &self, - event_ids: &[MessageId], - ) -> HashMap> { - let mut event_tokens: HashMap> = HashMap::new(); - for msgid in event_ids { - if let Some(evt) = self.events.get(msgid) { - evt.tokens.iter().for_each(|tok| { - if let Some(vm) = event_tokens.get_mut(tok) { - vm.push(*msgid); - } else { - event_tokens.insert(tok.to_string(), vec![*msgid]); - } - }); - } - } - for vm in event_tokens.values_mut() { - vm.sort_unstable(); - vm.dedup(); - } - event_tokens - } - - /// # Errors - /// - /// Will return `Err` if it fails to save tokens to `autodb_tokens` table. - pub fn save(&self, db: &mut AutoDb, model_id: i32, force: bool) -> Result { - db.save_autodb_tokens(model_id, &self.events, &self.tokens_events_map, force) - } - - /// # Errors - /// - /// Will return `Err` if it fails to read `event` or `autodb_tokens` table. - pub fn load_events_and_tokenize( - &mut self, - db: &mut AutoDb, - model: &mut Datasource, - force: bool, - token_fields: &[usize], - ) -> Result<()> { - let (id, _, model_logtype, _) = model.fields(); - self.events = db.get_raw_events(id, force)?; - log::info!("model #{}: {} events are loaded.", id, self.events.len()); - - if !force { - let tokens_events_map = db.get_autodb_tokens(id)?; - for (toks, mss) in &tokens_events_map { - mss.message_ids.iter().for_each(|msgid| { - if let Some(evt) = self.events.get_mut(msgid) { - evt.tokens = toks.clone(); - } - }); - } - if !tokens_events_map.is_empty() { - log::info!( - "model #{}: {} tokens are loaded.", - id, - tokens_events_map.len() - ); - } - self.tokens_events_map = tokens_events_map; - } - - let (logtype, datetype) = self.detect_logtype(model_logtype); - log::info!("log type={:?}, datetime type={}", logtype, datetype); - - model.set(logtype, datetype); - self.tokenize_raw_events(logtype, token_fields); - - Ok(()) - } - - // the log type cannot be defined for some message like following sample: - // 0535136 - - [28/Nov/2017:20:36:40 +0900] "-" 408 - "-" "-" - fn choose_longest_message(&self, limit: usize) -> String { - let mut longest: &str = ""; - for (_, evt) in self.events.iter().take(limit) { - if evt.content.len() > longest.len() { - longest = &evt.content; - } - } - longest.to_string() + pub fn is_empty(&self) -> bool { + self.events.is_empty() } #[must_use] - pub fn detect_logtype(&self, dt: DataType) -> (DataType, DatetimeFormat) { - // TODO: add another data type: CSV, Packet - let mut logtype: DataType = DataType::default(); - let mut datetype: DatetimeFormat = DatetimeFormat::default(); - let s = self.choose_longest_message(100); - log::debug!("Choosed message: {}", s); - if let DataType::Log(_) = dt { - if let Ok((x, y)) = detect_weblogtype(&s) { - logtype = DataType::Log(x); - datetype = y; - } - } - (logtype, datetype) + pub fn len(&self) -> usize { + self.events.len() } - pub fn tokenize_raw_events(&mut self, logtype: DataType, target: &[usize]) { - let mut objs: Vec> = Vec::new(); - self.events - .iter() - .filter(|(_, msg)| msg.fields.is_empty()) - .for_each(|(_, msg)| { - if let Some(last) = objs.last_mut() { - if last.len() < EVENTS_PER_THREAD { - last.push((msg.id, msg.content.to_string())); - } else { - objs.push(vec![(msg.id, msg.content.to_string())]); - } - } else { - objs.push(vec![(msg.id, msg.content.to_string())]); - } - }); - - if objs.is_empty() { - return; - } - - let store: Arc>> = Arc::new(RwLock::new(Vec::new())); - let p_store = Arc::clone(&store); - if let DataType::Log(logformat) = logtype { - objs.par_iter().for_each(|o| { - //let target = vec![HttpFields::Request as usize, HttpFields::Referer as usize]; - match tokenize_weblog(o, logformat, target) { - Ok(tokens) => { - let mut p = p_store.write().expect("no other writer"); - p.push(tokens); - } - Err(e) => log::error!("{}", e), - } - }); - if let Ok(s) = p_store.read() { - s.iter().for_each(|vtoks| { - for (msgid, fields, tokens) in vtoks { - if !fields.is_empty() { - if let Some(mss) = self.tokens_events_map.get_mut(tokens) { - if mss.modified != Modified::New { - mss.modified = Modified::Yes; // for update - } - mss.message_ids.push(*msgid); - } else { - self.tokens_events_map.insert( - tokens.clone(), - MessageStatus { - modified: Modified::New, // for insert - message_ids: vec![*msgid], - }, - ); - } - if let Some(evt) = self.events.get_mut(msgid) { - evt.modified = Modified::New; - evt.fields = fields.clone(); - evt.tokens = tokens.clone(); - } - } - } - }); - } - } - /* - let (tx, rx) = mpsc::channel(); - let pool = ThreadPool::new(num_cpus::get()); - objs.into_iter().for_each(|o| { - let arc_event: Arc> = Arc::new(o); - let tx = tx.clone(); - let lt = logtype; - let dt = datetype; - pool.execute(move || { - if let DataType::Log(x) = lt { - // extract tokens from request and referer fields of web log - let target = vec![HttpFields::Request as usize, HttpFields::Referer as usize]; - let tokens = tokenize_weblog(&arc_event, x, dt, &target); - if let Err(e) = tx.send(tokens) { - log::error!("{:?}", e); - } - } - }); - }); - - drop(tx); - for rst in rx.iter() { - for (message_id, fields, tokens) in rst { - if !fields.is_empty() { - if let Some(mss) = self.tokens_events_map.get_mut(&tokens) { - if mss.modified != Modified::New { - mss.modified = Modified::Yes; // for update - } - mss.message_ids.push(message_id); - } else { - self.tokens_events_map.insert( - tokens.to_vec(), - MessageStatus { - modified: Modified::New, // for insert - message_ids: vec![message_id], - }, - ); - } - if let Some(evt) = self.events.get_mut(&message_id) { - evt.modified = Modified::New; - evt.fields = fields; - evt.tokens = tokens; - } - } - } - } - */ - - let cnt = self - .events - .iter() - .filter(|(_, evt)| evt.modified == Modified::New) - .count(); - let cnt_empty_token = self - .events - .iter() - .filter(|(_, evt)| evt.modified == Modified::New && evt.tokens.is_empty()) - .count(); - log::info!("{} messages are newly tokenized.", cnt); - log::info!("{} messages have empty tokens.", cnt_empty_token); + #[must_use] + pub fn tokens(&self, message_id: &MessageId) -> Option<&Vec> { + self.events.get(message_id).map(|m| &m.tokens) } #[must_use] @@ -447,52 +118,10 @@ impl EventTokens { false } - /// # Errors - /// - /// Will return `Err` if the hyerpscan scan library returns error. - #[cfg(all(target_arch = "x86_64", feature = "hyperscan"))] - pub fn regex_match_hyperscan( - &self, - db: &BlockDatabase, - scratch: &Scratch, - ) -> Result> { - let mut rst: Vec<(MessageId, u32, u32)> = Vec::new(); - for (msgid, msg) in &self.events { - db.scan(&msg.content, scratch, |id, _, _, _| { - rst.push((*msgid, id >> 8, id & 0xFF)); - Matching::Continue - }) - .with_context(|| "scanning")?; - } - Ok(rst) - } - - #[cfg(not(all(target_arch = "x86_64", feature = "hyperscan")))] #[must_use] - pub fn regex_set_match(&self, db: &RegexSet) -> Vec<(MessageId, u32, u32)> { + pub fn get_message(&self, message_id: &MessageId) -> Option<&str> { self.events - .iter() - .flat_map(|(msgid, msg)| { - db.matches(&msg.content).into_iter().map(move |m| { - ( - *msgid, - u32::try_from(m >> 8).expect("overflow ids"), - u32::try_from(m & 0xFF).expect("overflow ids"), - ) - }) - }) - .collect() - } - - pub fn show_samples(&self, event_ids: &[MessageId], csv_style: bool) { - for msgid in event_ids { - if let Some(evt) = self.events.get(msgid) { - if csv_style { - println!("{}", evt.fields.join(", ")); - } else { - println!("{}", evt.content); - } - } - } + .get(message_id) + .map(|message| message.content.as_str()) } } diff --git a/src/labelmap.rs b/src/labelmap.rs deleted file mode 100644 index 5050c3d..0000000 --- a/src/labelmap.rs +++ /dev/null @@ -1,734 +0,0 @@ -use crate::database::{AutoDb, Datasource}; -use crate::dict::Dictionary; -use crate::labels::{Label, Pattern}; -use crate::threat_description::{Load, Save, TdDb, ThreatDescription}; -use crate::{ - bold, hashmap, qualifiers_header, ClusterId, LabelId, Modified, PatternId, PatternType, - Qualifier, MAX_QUALIFIERS, ORDERED_QUALIFIERS, -}; - -use ansi_term::Style; -use anyhow::{anyhow, Result}; -use chrono::{DateTime, Utc}; -use itertools::Itertools; -use std::collections::HashMap; -use std::fmt; - -pub struct LabelMatchUp { - // clusters and matched patterns for a label - // vector for each qualifier values: benign/suspicious/unknown/mixed - matched: Vec>>, -} - -impl Default for LabelMatchUp { - fn default() -> Self { - LabelMatchUp { - matched: vec![HashMap::new(); MAX_QUALIFIERS], - } - } -} - -impl LabelMatchUp { - #[must_use] - pub fn new(qualifier: Qualifier, cid: ClusterId, pid_list: Vec) -> Self { - let tmp = hashmap![cid => pid_list]; - - let mut matched = vec![HashMap::new(); MAX_QUALIFIERS]; - if let Some(pos) = ORDERED_QUALIFIERS.iter().position(|q| *q == qualifier) { - matched[pos] = tmp; - } - - LabelMatchUp { matched } - } - - pub fn append_matched(&mut self, q: Qualifier, cid: ClusterId, pid_list: &[PatternId]) { - // add clusters and matched patterns data for each label - if let Some(pos) = ORDERED_QUALIFIERS.iter().position(|oq| *oq == q) { - if let Some(v) = self.matched[pos].get_mut(&cid) { - v.extend(pid_list); - } else { - self.matched[pos].insert(cid, pid_list.to_vec()); - } - } - } - - pub fn print(&self, tokens: &[Pattern]) { - println!("Matched clusters:"); - ORDERED_QUALIFIERS.iter().enumerate().for_each(|(pos, q)| { - if self.matched[pos].is_empty() { - return; - } - let mut r: Vec = Vec::new(); - self.matched[pos].iter().for_each(|(cid, pid_list)| { - let mut rst: HashMap> = HashMap::new(); - for pt in tokens { - let (pid, tokentype, _, useful, _) = pt.fields(); - if useful && pid_list.contains(&pid) { - if let Some(v) = rst.get_mut(&tokentype) { - v.push(pt.pattern()); - } else { - rst.insert(tokentype, vec![pt.pattern()]); - } - } - } - for (tokentype, tokvec) in &rst { - r.push(format!( - "\t{:>6}: {} - {}", - cid, - tokentype, - tokvec.join(", ") - )); - } - }); - if !r.is_empty() { - println!(" {}: {} clusters", q, r.len()); - for s in &r { - println!("{}", s); - } - } - }); - println!(); - } - - #[must_use] - pub fn get_matched_count_by_qualifier(&self) -> Vec { - let mut rst: Vec = Vec::new(); - self.matched.iter().for_each(|matched_clusters| { - if matched_clusters.is_empty() { - rst.push(0); - } else { - rst.push(matched_clusters.len()); - } - }); - rst - } -} - -#[derive(Default)] -pub struct TopLabels { - model: Option, - labels: Vec, - labels_map: HashMap, - labels_name_map: HashMap, // label name vs label id - tokens: HashMap>, - tokens_usage: HashMap<(PatternType, String), Vec>, - matchup_result: HashMap, - dict: Dictionary, - layers: Vec>, -} - -impl fmt::Display for TopLabels { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - for lab_id in &self.labels { - if let Some(lab) = self.labels_map.get(lab_id) { - writeln!(f, "{}", lab)?; - } - } - - writeln!( - f, - "\nnumber of labels : {}, number of label tokens = {}", - self.labels.len(), - self.tokens.len() - ) - } -} - -impl TopLabels { - /// # Errors - /// - /// Will return `Err` if postgres connection fails or it fails to init `labeldb_*` tables - /// or it fails to query `data_source` and `model`. - pub fn init(dburl: &str, model_name: Option<&str>) -> Result { - // initialize labels related things - // - // 1. init database if not exist - // 2. load labels from labeldb_labels, labeldb_samples, labeldb_tokens - // 3. load dictionary from labeldb_disables table - // only if `model_name` is specified, - // 4. load model data from model, data_source table - // 5. load tokens usage for model from labeldb_tokens_usage table - // 6. load label matching result for model from labeldb_match table - // 7. init filtering layer - // 8. make keywords and signatures map data for labeling - let mut db = AutoDb::new(dburl)?; - let mut toplabs = TopLabels::default(); - db.initialize_labeldb()?; - toplabs.load_labels(&mut db)?; - toplabs.dict.load(&mut db)?; - if let Some(t) = model_name { - let model = db.get_datasource(t)?; - let (id, name, _, _) = model.fields(); - log::info!("tokens usage from #{} \"{}\"", id, name); - toplabs.load_tokens_usage(&mut db, id)?; - toplabs.load_matchup_result(&mut db, id)?; - toplabs.model = Some(model); - } else { - toplabs.model = None; - }; - toplabs.layers.push(toplabs.labels.clone()); - - Ok(toplabs) - } - - /// # Errors - /// - /// will return `Err` if it fails to read `labeldb_labels` or `labeldb_tokens` - pub fn load_labels(&mut self, db: &mut AutoDb) -> Result { - let (labels, labels_map) = db.get_labels()?; - self.labels = labels; - self.labels_map = labels_map; - - let mut labels_name_map: HashMap = HashMap::new(); - let mut tokens: HashMap> = HashMap::new(); - self.labels_map.iter().for_each(|(_, lab)| { - lab.add_tokens(PatternType::Token, &mut tokens); - lab.add_label_name(&mut labels_name_map); - }); - self.labels_name_map = labels_name_map; - self.tokens = tokens; - Ok(self.labels.len()) - } - - /// # Errors - /// - /// will return `Err` if it fails to update `labeldb_disables` table. - pub fn save_dict(&mut self, dburl: &str) -> Result<()> { - // save all changes of dictionary and load it again. - let mut db = AutoDb::new(dburl)?; - self.save_and_reload_dictionary(&mut db, true, true) - } - - /// # Errors - /// - /// will return `Err` if it fails to connect database or query failes to select from `labeldb_disables` table. - pub fn load_dict(&mut self, db: &mut AutoDb) -> Result<()> { - self.save_and_reload_dictionary(db, false, true) - } - - /// # Errors - /// - /// will return `Err` if it fails to update `labeldb_disables` table. - pub fn save_and_reload_dictionary( - &mut self, - db: &mut AutoDb, - save: bool, - load: bool, - ) -> Result<()> { - if save { - log::info!("saving dictionary ..."); - self.dict.save(db)?; - } - - if load { - log::info!("loading dictionary ..."); - self.dict.load(db)?; - } - Ok(()) - } - - /// # Errors - /// - /// Will return `Err` if it fails to query `labeldb_tokens_usage` table. - pub fn load_tokens_usage(&mut self, db: &mut AutoDb, model_id: i32) -> Result<()> { - self.tokens_usage = db.get_labeldb_tokens_usage(model_id)?; - log::info!( - "{} tokens usage records are loaded.", - self.tokens_usage.len() - ); - Ok(()) - } - - /// # Errors - /// - /// Will return `Err` if it fails to query `labeldb_match` table. - pub fn load_matchup_result(&mut self, db: &mut AutoDb, model_id: i32) -> Result<()> { - // load label matchup result from database and convert the cluster-label map to label-cluster - // labtune: `/label #` command - let cluster_matched = db.get_match_result(model_id)?; - - let mut rst: HashMap = HashMap::new(); - for (cid, mu) in &cluster_matched { - for (lab_id, (_, pid_list)) in mu.fields() { - if let Some(labmu) = rst.get_mut(&lab_id) { - labmu.append_matched(mu.qualifier(), *cid, pid_list); - } else { - rst.insert( - lab_id, - LabelMatchUp::new(mu.qualifier(), *cid, pid_list.to_vec()), - ); - } - } - } - self.matchup_result = rst; - Ok(()) - } - - /// # Errors - /// - /// Will return `Err` if it fails to update `labeldb_labels`, `labeldb_samples`, `labeldb_tokens` tables. - pub fn save_labels(&mut self, dburl: &str, force: bool) -> Result<()> { - let mut db = AutoDb::new(dburl)?; - self.save_and_update_labels(&mut db, force) - } - - /// # Errors - /// - /// Will return `Err` if it fails to update `labeldb_labels`, `labeldb_samples`, `labeldb_tokens` tables. - pub fn save_and_update_labels(&mut self, db: &mut AutoDb, _force: bool) -> Result<()> { - // Save labels that the labels have any modified things. - let mut deleted: Vec<(LabelId, String)> = Vec::new(); - for (lab_id, lab) in &mut self.labels_map { - match lab.save(db) { - Ok((prev_modified, _)) => { - let name = lab.label_name(); - match prev_modified { - Modified::WillRemoved => { - println!("Removed ... #{}: {}", lab_id, name); - deleted.push((*lab_id, name.to_string())); - } - Modified::Yes => println!("Updated ... #{}: {}", lab_id, name), - _ => {} - } - } - Err(e) => return Err(anyhow!("save label #{}. {:?}", lab_id, e)), - } - } - - for (lab_id, name) in &deleted { - self.labels.retain(|id| *id != *lab_id); - self.labels_name_map.remove(name); - self.matchup_result.remove(lab_id); - let patterns: Vec<_> = if let Some(lab) = self.labels_map.get(lab_id) { - lab.patterns() // .tokens.iter().map(Pattern::pattern).collect() - } else { - vec![] - }; - for p in &patterns { - if let Some(lp_vec) = self.tokens.get_mut(p) { - lp_vec.retain(|(i, _)| *i != *lab_id); - if lp_vec.is_empty() { - self.tokens.remove(p); - } - } - } - self.labels_map.remove(lab_id); - self.layers.iter_mut().for_each(|lab_vec| { - lab_vec.retain(|id| *id != *lab_id); - }); - } - - Ok(()) - } - - pub fn remove_all_labels(&mut self) -> usize { - self.labels_map.iter_mut().for_each(|(_, lab)| { - lab.set_modified(Modified::WillRemoved); - }); - self.labels_map.len() - } - - pub fn remove_labels(&mut self, lab_id: LabelId) -> bool { - self.labels_map.get_mut(&lab_id).map_or(false, |lab| { - lab.set_modified(Modified::WillRemoved); - true - }) - } - - #[must_use] - pub fn len(&self) -> usize { - self.layers.last().map_or(0, Vec::len) - } - - #[must_use] - pub fn is_empty(&self) -> bool { - self.layers.last().map_or(true, Vec::is_empty) - } - - /// # Errors - /// - /// Will return `Err` if a try to remove on an empty filter - pub fn remove_filter(&mut self) -> Result<()> { - if self.layers.len() > 1 { - self.layers.pop(); - Ok(()) - } else { - Err(anyhow!("Empty filter.")) - } - } - - pub fn filter_by(&mut self, tt: PatternType, pattern: &str) -> usize { - if tt != PatternType::Token { - return 0; - } - - let mut v = self.tokens.get(pattern).map_or_else(Vec::new, |lp_vec| { - lp_vec.iter().map(|(lab_id, _)| *lab_id).collect() - }); - v.sort_unstable(); - v.dedup(); - let len = v.len(); - self.layers.push(v); - len - } - - #[must_use] - pub fn filter_by_useful( - &self, - lab_id: LabelId, - pid_list: &[PatternId], - flag: bool, - ) -> Vec { - if let Some(lab) = self.labels_map.get(&lab_id) { - lab.filter_by_useful(pid_list, flag) - } else { - vec![] - } - } - - pub fn list_labels(&self, pageno: usize, pagesize: usize) { - let begin = pageno * pagesize; - let mut end = begin + pagesize; - if let Some(last) = self.layers.last() { - if last.len() < end { - end = last.len(); - } - - let v = last.get(begin..end); - println!("[No] #Label-ID: [Modified] Name"); - println!("--------------------------------------- "); - if let Some(vv) = v { - vv.iter().enumerate().for_each(|(i, lab_id)| { - if let Some(lab) = self.labels_map.get(lab_id) { - println!("[{}] {}", begin + i, lab.listing()); - } - }); - } - } - } - - #[must_use] - pub fn show_labelid(&self, opt: &str) -> Option { - if opt.starts_with('#') { - if let Some(id) = opt.get(1..) { - if let Ok(lab_id) = id.parse::() { - if let Some(lab) = self.labels_map.get(&lab_id) { - lab.print_label(&self.tokens_usage, self.matchup_result.get(&lab_id)); - return Some(lab_id); - } - } - } - } else if let Ok(idx) = opt.parse::() { - if let Some(last) = self.layers.last() { - if idx < last.len() { - let lab_id = last[idx]; - if let Some(lab) = self.labels_map.get(&lab_id) { - lab.print_label(&self.tokens_usage, self.matchup_result.get(&lab_id)); - return Some(lab_id); - } - } - } - } - None - } - - fn show_tokens_usage(&self) { - let mut enabled: Vec<(PatternType, &str, &[usize])> = Vec::new(); - let mut disabled: Vec<(PatternType, &str, &[usize])> = Vec::new(); - - self.tokens_usage - .iter() - .for_each(|((pt, pattern), q_cnts)| { - if self.dict.find(pattern) { - disabled.push((*pt, pattern, q_cnts)); - } else { - enabled.push((*pt, pattern, q_cnts)); - } - }); - - let header = qualifiers_header().join(" "); - if !enabled.is_empty() { - enabled.sort_by(|a, b| a.2.cmp(b.2)); - println!("{} {} tokens", bold!("Matched tokens:"), enabled.len()); - println!(" {}", header); - for (pt, t, q_cnts) in enabled { - println!("\t{}\t[{}] {}", q_cnts.iter().join("\t"), pt, t); - } - } - - if !disabled.is_empty() { - disabled.sort_by(|a, b| a.2.cmp(b.2)); - println!("\n{} {} tokens", bold!("Disabled tokens:"), disabled.len()); - println!(" {}", header); - for (pt, t, q_cnts) in disabled { - println!("\t{}\t{} - {}", q_cnts.iter().join("\t"), pt, t); - } - } - println!(); - } - - fn show_matched_labels(&self) { - let mut rst: Vec<(LabelId, Vec)> = Vec::new(); - self.matchup_result.iter().for_each(|(lab_id, mc)| { - rst.push((*lab_id, mc.get_matched_count_by_qualifier())); - }); - if !rst.is_empty() { - rst.sort_by(|a, b| a.1.cmp(&b.1)); - println!("{} {} labels", bold!("Matched labels:"), rst.len()); - println!(" {}", qualifiers_header().join(" ")); - for (lab_id, cnts) in &rst { - if let Some(lab) = self.labels_map.get(lab_id) { - println!( - "\t{}\t{} - {}", - cnts.iter().join("\t"), - lab_id, - lab.label_name() - ); - } - } - println!(); - } - } - - pub fn show_status(&self) { - println!( - "Total {} tokens from {} labels.", - self.tokens.len(), - self.labels_map.len() - ); - if self.model.is_some() { - println!("{} labels are matched.", self.matchup_result.len()); - println!("{} tokens are matched.", self.tokens_usage.len()); - } - println!(); - if !self.matchup_result.is_empty() { - self.show_matched_labels(); - } - if !self.tokens_usage.is_empty() { - self.show_tokens_usage(); - } - } - - #[must_use] - pub fn label_name(&self, lab_id: LabelId) -> Option<&str> { - self.labels_map.get(&lab_id).map(Label::label_name) - } - - pub fn show_dict(&self) { - println!("{}", self.dict); - } - - pub fn enable_disable_tokens( - &mut self, - lab_id: LabelId, - tokens: &[&str], - tt: PatternType, - flag: bool, - ) -> usize { - let mut words: Vec<&str> = Vec::new(); - if lab_id > 0 { - if let Some(lab) = self.labels_map.get_mut(&lab_id) { - words = lab.relocate_pattern(tokens, tt, flag); - } else { - println!("label #{} not found.", lab_id); - return 0; - }; - } else { - for lab in self.labels_map.values_mut() { - let w = lab.relocate_pattern(tokens, tt, flag); - for t in &w { - if !words.contains(t) { - words.push(t); - } - } - } - } - for tok in &words { - if flag { - self.dict.remove(tok); - } else { - self.dict.add(tok); - } - } - words.len() - } - - #[must_use] - pub fn only_regex_patterns(&self) -> Vec { - // make pattern map for label keywords and signatures - self.labels_map - .iter() - .flat_map(|(_, lab)| lab.regex_signatures()) - .collect() - } - - #[must_use] - pub fn types_and_patterns( - &self, - lab_id: LabelId, - pid_list: &[PatternId], - ) -> Vec<(PatternType, String)> { - if let Some(lab) = self.labels_map.get(&lab_id) { - lab.types_and_patterns(pid_list) - } else { - vec![] - } - } - - #[must_use] - pub fn find_pattern(&self, pattern: &str) -> Option<&Vec<(LabelId, PatternId)>> { - self.tokens.get(pattern) - } - - pub fn relocate( - &mut self, - tt: PatternType, - pattern: &str, - flag: bool, - ) -> Option<(usize, usize)> { - let tokens = vec![pattern]; - let ptn_list: Vec<_> = self - .tokens - .get_mut(pattern)? - .iter() - .map(|(lab_id, _)| *lab_id) - .collect(); - let mut cnt = 0_usize; - for lab_id in &ptn_list { - if self.enable_disable_tokens(*lab_id, &tokens, tt, flag) > 0 { - cnt += 1; - } - } - Some((ptn_list.len(), cnt)) - } - - /// # Errors - /// - /// Will return `Err` if the command try to add/remove tokens not keywords, nor signature. - /// Or the pattern id have invalid index. - pub fn update_pattern( - &mut self, - lab_id: LabelId, - pt: PatternType, - pattern: &str, - add: bool, - ) -> Result<()> { - match pt { - PatternType::Keywords | PatternType::Signature => { - if let Some(lab) = self.labels_map.get_mut(&lab_id) { - lab.update_pattern(pt, pattern, add)?; - } - } - _ => return Err(anyhow!("Tokens cannot be modified")), - } - Ok(()) - } - - #[must_use] - pub fn calculate_score(&self, lab_id: LabelId, pid_list: &[PatternId]) -> f64 { - self.labels_map - .get(&lab_id) - .map_or(0.0, |lab| lab.scores(pid_list)) - } - - fn add_new_labels( - &mut self, - _force: bool, - dburl: &str, - td_list: &[ThreatDescription], - ) -> Result { - let mut db = AutoDb::new(dburl)?; - let mut cnt = 0_usize; - for td in td_list { - // TODO: apply force option - let name = td.name(); - if self.labels_name_map.contains_key(name) { - println!("DUP! \"{}\"", name); - continue; - } - - let mut lab = td.label_from_threat_description(); - lab.init_tokens(&self.dict); - let (_, lab_id) = match lab.save(&mut db) { - Ok(x) => x, - Err(e) => { - println!("failed to load \"{}\". {:?}", name, e); - continue; - } - }; - - // update labels, labels_map, tokens list - lab.add_tokens(PatternType::Token, &mut self.tokens); - self.labels.push(lab_id); - self.labels_name_map.insert(name.to_string(), lab_id); - if let Some(first) = self.layers.first_mut() { - first.push(lab_id); - } - self.labels_map.insert(lab_id, lab); - - println!("label \"{}\" loaded.", name); - cnt += 1; - } - - Ok(cnt) - } - - /// # Errors - /// - /// Will return `Err` if failed to connect database - /// or failed to open json file (like permission error) - /// or the data have invalid json format. - pub fn load_labels_from_files( - &mut self, - force: bool, - dburl: &str, - path: &str, - ) -> Result { - println!("import labels from \"{}\"", path); - let adlist = ThreatDescription::read_threat_description(path)?; - self.add_new_labels(force, dburl, &adlist) - } - - /// # Errors - /// - /// Will return `Err` if failed to connect database, - /// or failed to open json file (like permission error), - /// or the data have invalid json format. - pub fn import(&mut self, force: bool, dburl: &str, path: &str) -> Result { - println!("import threat database from \"{}\"", path); - let tdb = TdDb::from_path(path)?; - self.add_new_labels(force, dburl, tdb.contents()) - } - - pub fn export_labels(&self, lab_list: &[LabelId]) { - for lab_id in lab_list { - if let Some(lab) = self.labels_map.get(lab_id) { - let ad = lab.export(); - let mut filename = lab.label_name().to_string().replace(' ', "_"); - filename.push_str(".json"); - match ad.save(&filename) { - Ok(_) => println!("label #{} exported to \"{}\"", lab_id, filename), - Err(e) => println!("label #{} failed to export. {:?}", lab_id, e), - } - } - } - } - - pub fn export_tdb(&self) { - let now: DateTime = Utc::now(); - let version = now.format("%Y%m%d%H%M%S"); - let filename = format!("label_dump_{}.tdb", version); - let threats: Vec<_> = self - .labels_map - .iter() - .map(|(_, lab)| lab.export()) - .collect(); - - let tdb = TdDb::new(version.to_string(), threats); - - match tdb.save(&filename) { - Ok(_) => println!("labels are exported to \"{}\"", filename), - Err(e) => log::error!("label export failed. {:?}", e), - } - } -} diff --git a/src/labels.rs b/src/labels.rs index ce21a51..97d77a8 100644 --- a/src/labels.rs +++ b/src/labels.rs @@ -1,523 +1,150 @@ -use crate::database::AutoDb; -use crate::dict::Dictionary; -use crate::labelmap::LabelMatchUp; -use crate::threat_description::ThreatDescription; -use crate::{qualifiers_header, ubold, LabelId, Modified, PatternId, PatternType}; +use crate::{config::Load, ClusterId, MessageId, PatternId, RuleId, Score, TidbId}; +use anyhow::Result; +use serde::Deserialize; +use std::collections::{HashMap, HashSet}; + +type RepresentativeLabels = Vec<(ClusterId, Vec<(TidbId, RuleId, usize, Score)>)>; +type EventLabels = Vec<(ClusterId, Vec<(MessageId, Vec<(TidbId, RuleId, Score)>)>)>; +type ClusterByEvents = HashMap)>>; + +#[derive(Deserialize)] +#[allow(unused)] +struct DebugLabels { + representative_labels: usize, + event_labels: usize, + representative: RepresentativeLabels, + events: EventLabels, +} -use ansi_term::Style; -use anyhow::{anyhow, Result}; -use itertools::Itertools; -use std::collections::HashMap; -use std::convert::TryFrom; -use std::fmt; +impl Load for DebugLabels {} -#[derive(Default)] -pub struct Pattern { - id: PatternId, - tokentype: PatternType, - tokens: Vec, - useful: bool, - prob: f64, +pub struct Labels { + clusters_labels_map: HashMap>, + clusters_events_map: ClusterByEvents, + labels_clusters_map: HashMap>, + representative: RepresentativeLabels, + events: EventLabels, } -impl fmt::Display for Pattern { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - if self.tokentype == PatternType::Token || self.tokentype == PatternType::Signature { - for tok in &self.tokens { - write!(f, "{:.3}: {}", self.prob, tok)?; +impl Labels { + pub fn new(path: &str) -> Result { + let debug_labels = DebugLabels::from_path(path)?; + let mut clusters_labels_map: HashMap> = HashMap::new(); + let mut clusters_events_map: ClusterByEvents = HashMap::new(); + let mut labels_clusters_map: HashMap> = HashMap::new(); + for (cluster_id, events) in &debug_labels.events { + clusters_events_map + .entry(*cluster_id) + .or_insert_with(|| events.clone()); + for (_, v) in events { + for (tidb_id, rule_id, _) in v { + clusters_labels_map + .entry(*cluster_id) + .and_modify(|labels| labels.push((*tidb_id, *rule_id))) + .or_insert_with(|| vec![(*tidb_id, *rule_id)]); + labels_clusters_map + .entry((*tidb_id, *rule_id)) + .and_modify(|clusters| clusters.push(*cluster_id)) + .or_insert_with(|| vec![*cluster_id]); + } } - } else { - write!(f, "{:.3}: {:?}", self.prob, self.tokens)?; } - Ok(()) - } -} - -impl Pattern { - #[must_use] - pub fn new( - id: PatternId, - tokentype: PatternType, - tokens: Vec, - useful: bool, - prob: f64, - ) -> Self { - Pattern { - id, - tokentype, - tokens, - useful, - prob, + for patterns in clusters_labels_map.values_mut() { + patterns.sort_by(|a, b| { + let aa = (u64::from(a.0) << 32) | u64::from(a.1); + let bb = (u64::from(b.0) << 32) | u64::from(b.1); + aa.cmp(&bb) + }); + patterns.dedup(); } - } - - pub fn set_useful(&mut self, flag: bool) { - self.useful = flag; - } - - #[must_use] - pub fn fields(&self) -> (PatternId, PatternType, &[String], bool, f64) { - ( - self.id, - self.tokentype, - &self.tokens, - self.useful, - self.prob, - ) - } - - #[must_use] - pub fn pattern(&self) -> String { - match self.tokentype { - PatternType::Token | PatternType::Signature => self - .tokens - .first() - .map_or_else(String::new, ToString::to_string), - PatternType::Keywords => self.tokens.join(","), - PatternType::White => String::new(), + for clusters in labels_clusters_map.values_mut() { + clusters.sort_unstable(); } - } - #[must_use] - pub fn type_and_pattern(&self, pid_list: &[PatternId]) -> Option<(PatternType, String)> { - if pid_list.contains(&self.id) { - Some((self.tokentype, self.pattern())) - } else { - None - } + Ok(Self { + clusters_labels_map, + clusters_events_map, + labels_clusters_map, + representative: debug_labels.representative, + events: debug_labels.events, + }) } - #[must_use] - pub fn filter_by_useful(&self, pid_list: &[PatternId], flag: bool) -> Option { - if self.useful == flag && (pid_list.is_empty() || pid_list.contains(&self.id)) { - Some(self.id) + pub fn get_representative_labels( + &self, + cluster_id: ClusterId, + ) -> Option<&Vec<(TidbId, RuleId, usize, Score)>> { + if let Some((_, labels)) = self + .representative + .iter() + .find(|(cid, _)| *cid == cluster_id) + { + Some(labels) } else { None } } - #[must_use] - pub fn regex_signatures(&self, lab_id: LabelId) -> Option { - // Max number of tokens and keywords and signatures in a label is 256(1 Byte, 0xFF). - // Hyperscan regex syntax: id:/expression/flag - // Hyperscan compile flag: 'L' => SOM_LEFTMOST flag - // Pattern id: - // - Not to duplicate id, the label id and serial number of keywords and signature is re-mapped. - // - expected label id range is 100_000 ~ 999_999 (6-digit number) - // - re-mapping: labed id shifted left by 8-bit + serial index number of keywords/signatures(1Byte) - // Because of this remapping, the max number of tokens/keywords/signature is 255. - // regex syntax: - // - signature will be used as it without modification. - // - keywords will be escaped the special characters of regex syntax and joined with ".+" - - let pattern_id = if let Ok(id) = u32::try_from(lab_id) { - if let Ok(no) = u32::try_from(self.id) { - id << 8 | (no & 0xFF) - } else { - return None; - } - } else { - return None; - }; - - match self.tokentype { - PatternType::Signature => self - .tokens - .first() - .map(|first| format!("{}:/{}/iL", pattern_id, first)), - PatternType::Keywords => { - let esc: Vec<_> = self.tokens.iter().map(|s| regex::escape(s)).collect(); - Some(format!("{}:/{}/iL", pattern_id, esc.join(".+"))) - } - _ => None, - } - } - - #[must_use] - fn enable_disable_token(&mut self, tt: PatternType, token: &str, flag: bool) -> bool { - if self.tokentype == tt && token == self.pattern() && self.useful != flag { - self.set_useful(flag); - true - } else { - false - } - } -} - -#[derive(Default)] -pub struct Label { - id: LabelId, - name: String, - description: Option, - modified: Modified, - references: Option>, - samples: Vec, - tokens: Vec, -} - -impl fmt::Display for Label { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let title = if self.modified == Modified::New { - "Label #______ (New)".to_string() - } else { - format!("Label #{}", self.id) - }; - write!(f, "{}", ubold!(title))?; - - let name = ubold!(&self.name); - writeln!(f, " [{}]\n{}\n", self.modified, name)?; - - if let Some(s) = self.description.as_ref() { - writeln!(f, "{}:\n{}\n", ubold!("Description"), s)?; - } - - if let Some(r) = self.references.as_ref() { - writeln!(f, "{}:\n\t{:?}\n", ubold!("References"), r)?; - } - if !self.samples.is_empty() { - writeln!(f, "{}:", ubold!("Samples"))?; - for s in &self.samples { - writeln!(f, "\t{}", s)?; - } - } - Ok(()) - } -} - -type LabelFields<'a> = ( - LabelId, - &'a str, - Option<&'a str>, - Modified, - Option>, - &'a [String], -); -impl Label { - #[must_use] - pub fn new( - id: LabelId, - name: String, - description: Option, - modified: Modified, - references: Option>, - samples: Vec, - tokens: Vec, - ) -> Self { - Label { - id, - name, - description, - modified, - references, - samples, - tokens, - } - } - - pub fn set_modified(&mut self, modified: Modified) { - self.modified = modified; - } - - #[must_use] - pub fn label_name(&self) -> &str { - &self.name - } - - #[must_use] - pub fn fields(&self) -> LabelFields { - let refers = self.references.as_ref().cloned(); - - ( - self.id, - &self.name, - self.description.as_deref(), - self.modified, - refers, - &self.samples, - ) - } - - #[must_use] - pub fn patterns_fields(&self) -> Vec<(PatternId, PatternType, &[String], bool, f64)> { - self.tokens.iter().map(Pattern::fields).collect() - } - - #[must_use] - pub fn export(&self) -> ThreatDescription { - let mut signature: Vec = Vec::new(); - let mut keywords: Vec> = Vec::new(); - self.tokens.iter().for_each(|ptn| match ptn.tokentype { - PatternType::Signature => signature.push(ptn.pattern()), - PatternType::Keywords => keywords.push(ptn.tokens.clone()), - _ => {} - }); - let description = self - .description - .as_ref() - .unwrap_or(&String::new()) - .to_string(); - let references = self.references.as_ref().unwrap_or(&Vec::new()).clone(); - ThreatDescription::new( - self.name.to_string(), - description, - references, - self.samples.clone(), - keywords, - signature, - ) - } - - #[must_use] - pub fn regex_signatures(&self) -> Vec { - self.tokens - .iter() - .filter_map(|ptn| ptn.regex_signatures(self.id)) - .collect() - } - - #[must_use] - pub fn patterns(&self) -> Vec { - self.tokens.iter().map(Pattern::pattern).collect() - } - - #[must_use] - pub fn types_and_patterns(&self, pid_list: &[PatternId]) -> Vec<(PatternType, String)> { - self.tokens - .iter() - .filter_map(|ptn| ptn.type_and_pattern(pid_list)) - .collect() - } - - pub fn add_pattern(&mut self, ptn: Pattern) { - self.tokens.push(ptn); - } - - #[must_use] - pub fn filter_by_useful(&self, pid_list: &[PatternId], flag: bool) -> Vec { - self.tokens - .iter() - .filter_map(|pt| pt.filter_by_useful(pid_list, flag)) - .collect() - } - - /// Remove a pattern from the pattern list of this label. - /// - /// # Errors - /// - /// Will return `Err` if the specified pattern not found or try to remove read-only tokens. - fn remove_pattern(&mut self, pid: PatternId, tt: PatternType) -> Result<()> { - if tt == PatternType::Keywords || tt == PatternType::Signature { - if let Some(idx) = self.tokens.iter().position(|ptn| ptn.id == pid) { - if self.tokens[idx].tokentype == tt { - self.tokens.remove(idx); + pub fn get_event_labels(&self, cluster_id: ClusterId) -> Option> { + let mut patterns = HashMap::new(); + if let Some(v) = self.clusters_events_map.get(&cluster_id) { + for (_, vv) in v { + for (tidb_id, rule_id, _) in vv { + patterns + .entry((*tidb_id, *rule_id)) + .and_modify(|c| *c += 1) + .or_insert(1_usize); } - } else { - return Err(anyhow!("pattern id not found.")); - } - } else { - return Err(anyhow!("invalid token type.")); - } - Ok(()) - } - - /// # Errors - /// - /// Will return `Err` if the command try to add/remove tokens not keywords, nor signature. - /// Or the pattern id have invalid index. - pub fn update_pattern(&mut self, tt: PatternType, pattern: &str, add: bool) -> Result<()> { - // process `labtune` CLI commands: `/add keywords|signature`, or `/remove keywords|signature` - // (작업중) process `labtune` threat description file: add tokens|keywords|signatures - if add { - let tokens = match tt { - PatternType::Keywords => pattern.split(',').map(ToString::to_string).collect(), - PatternType::Signature => vec![pattern.to_string()], - _ => return Err(anyhow!("invalid token type.")), - }; - let pattern_id = self - .tokens - .iter() - .max_by(|a, b| a.id.cmp(&b.id)) - .map_or(0, |x| x.id) - + 1; - self.add_pattern(Pattern::new(pattern_id, tt, tokens, true, tt.weight())); - } else if let Ok(pattern_id) = pattern.parse::() { - self.remove_pattern(pattern_id, tt)?; - } else { - return Err(anyhow!("invalid pattern id.")); - } - - if self.modified == Modified::No { - self.set_modified(Modified::Yes); - } - Ok(()) - } - - /// # Errors - /// - /// Will return `Err` if it fails to update `labeldb_labels`, - /// `labeldb_match`, `labeldb_samples`, `labeldb_tokens` tables. - pub fn save(&mut self, db: &mut AutoDb) -> Result<(Modified, LabelId)> { - let old_modified = self.modified; - match old_modified { - Modified::WillRemoved => { - db.remove_label(self.id)?; - self.set_modified(Modified::Deleted); - } - Modified::Yes => { - db.update_label(self.id, &self.tokens)?; - self.set_modified(Modified::No); } - Modified::New => { - self.id = db.save_label(self)?; - self.set_modified(Modified::No); - } - _ => {} } - Ok((old_modified, self.id)) - } - - pub fn init_tokens(&mut self, dict: &Dictionary) { - self.tokens.iter_mut().for_each(|ptn| { - if ptn.tokentype == PatternType::Token && dict.find(&ptn.pattern()) { - ptn.set_useful(false); - } + let mut patterns: Vec<_> = patterns.into_iter().collect(); + patterns.sort_by(|a, b| { + let aa = (u64::from(a.0 .0) << 32) | u64::from(a.0 .1); + let bb = (u64::from(b.0 .0) << 32) | u64::from(b.0 .1); + aa.cmp(&bb) }); - } - - pub fn add_tokens( - &self, - tt: PatternType, - dest: &mut HashMap>, - ) { - // add the tokens of this label to tokens list - self.tokens - .iter() - .filter(|ptn| ptn.tokentype == tt) // ignore useful flag. see calculate_score() - .for_each(|ptn| { - let p = ptn.pattern(); - if let Some(vlab) = dest.get_mut(&p) { - vlab.push((self.id, ptn.id)); - } else { - dest.insert(p, vec![(self.id, ptn.id)]); - } - }); - } - - pub fn add_label_name(&self, names_map: &mut HashMap) { - if names_map.contains_key(&self.name) { - log::error!( - "label name duplicated. discard #{} \"{}\"", - self.id, - self.name - ); - return; + if patterns.is_empty() { + None + } else { + Some(patterns) } - - names_map.insert(self.name.to_string(), self.id); } - fn print_tokens( - &self, - title: &str, - tt: PatternType, - usage: &HashMap<(PatternType, String), Vec>, - flag: bool, - ) { - println!("{}:", ubold!(title)); - let mut v: Vec<_> = self - .tokens + /// Calculate + /// * the number of labeled clustgers + /// * the number of distinct labeled events + /// * the number of representative labels + pub fn statistics(&self) -> (usize, usize, usize) { + let labeled_clusters = self.clusters_labels_map.len(); + let labeled_events: HashSet = self + .events .iter() - .filter_map(|lt| { - if lt.tokentype == tt && lt.useful == flag { - let cnts = usage - .get(&(tt, lt.pattern())) - .map_or_else(|| String::from("-\t-\t-\t-"), |v| v.iter().join("\t")); - Some((cnts, lt.id, lt.prob, lt.pattern())) - } else { - None - } + .flat_map(|(_, v)| { + v.iter() + .map(|(vv, _)| vv.to_string()) + .collect::>() }) .collect(); - if !v.is_empty() { - println!(" {}", qualifiers_header().join(" ")); - v.sort_by(|a, b| a.1.cmp(&b.1)); - for (cnts, id, w, p) in &v { - println!("\t{}\t#{} {:>3.0}: {}", cnts, id, w, p); - } - } - println!(); - } + let representatives = self.representative.len(); - pub fn print_label( - &self, - usage: &HashMap<(PatternType, String), Vec>, - matched: Option<&LabelMatchUp>, - ) { - // labtune: detail label information. `/label #` command - println!("{}", self); - if !self.tokens.is_empty() { - self.print_tokens("Keywords", PatternType::Keywords, usage, true); - self.print_tokens("Signature", PatternType::Signature, usage, true); - self.print_tokens("Tokens(enabled)", PatternType::Token, usage, true); - self.print_tokens("Tokens(disabled)", PatternType::Token, usage, false); - } - if let Some(mu) = matched { - mu.print(&self.tokens); - } + (labeled_clusters, labeled_events.len(), representatives) } - #[must_use] - pub fn listing(&self) -> String { - match self.modified { - Modified::New => format!("#______: [{}] {}", self.modified, self.name), - Modified::Deleted => format!("#______: [{}]", self.modified), - _ => format!("#{}: [{}] {}", self.id, self.modified, self.name), - } - } - - pub fn relocate_pattern<'a>( - &mut self, - tokens: &[&'a str], - tt: PatternType, - flag: bool, - ) -> Vec<&'a str> { - let mut cnt = 0_usize; - let mut dict: Vec<&str> = Vec::new(); - for tok in tokens { - for ptn in &mut self.tokens { - if ptn.enable_disable_token(tt, tok, flag) { - cnt += 1; - - if tt == PatternType::Token && !dict.contains(tok) { - dict.push(tok); - } - } + pub fn find_clusters(&self, tidb_id: TidbId, rule_id: RuleId) -> Vec { + let mut found = Vec::new(); + for (pattern_id, clusters) in &self.labels_clusters_map { + if (pattern_id.0 == tidb_id || tidb_id == 0) + && (pattern_id.1 == rule_id || rule_id == 0) + { + found.extend(clusters); } } - if cnt > 0 { - self.set_modified(Modified::Yes); - } - - dict + found.sort_unstable(); + found.dedup(); + found } - #[must_use] - pub fn scores(&self, pid_list: &[PatternId]) -> f64 { - // calculate scores for matched label patterns - // - only enabled patterns are calculated. - let mut score = 0.0_f64; - let mut scanned = pid_list.len(); - for ptn in &self.tokens { - if pid_list.contains(&ptn.id) { - if ptn.useful { - score += ptn.prob; - } - scanned -= 1; - if scanned == 0 { - break; - } - } - } - score + pub fn is_labeled(&self, cluster_id: ClusterId) -> bool { + self.clusters_labels_map.contains_key(&cluster_id) } } diff --git a/src/lib.rs b/src/lib.rs index f0c93e1..e4b694e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,104 +1,58 @@ -pub mod cluster; -pub mod database; -pub mod dict; -pub mod events; -pub mod labelmap; -pub mod labels; +mod cluster; +pub mod config; +mod events; +mod labels; pub mod matcher; -pub mod threat_description; -pub mod weblog_parser; +mod parser; +mod tidb; use ansi_term::Colour; use anyhow::Result; -use chrono::DateTime; use num_derive::{FromPrimitive, ToPrimitive}; -use regex::Regex; -use std::fmt; -use std::str::FromStr; - -pub type ClusterId = i32; -pub type LabelId = i32; -pub type PatternId = usize; - -// Web log type -// LogType: CommonWeb -// 192.168.10.100 - - [18/Apr/2019:16:22:00 +0900] "GET /jbnu/oasis/common/comResultCnt.xfdl.js HTTP/1.1" 200 885 -// r#"^(\d+\.\d+\.\d+\.\d+) - - \[(\d+/[a-zA-Z]+/\d{4}:\d{2}:\d{2}:\d{2} [\+\-]\d{4})\] "([A-Z]+) (.+\s)HTTP/(\d)\.(\d)" (\d+) ([0-9\-]+)"# -// LogType: CombinedWeb -// 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" "Mozilla/4.08 [en] (Win98; I ;Nav)" -// r#"^(\d+\.\d+\.\d+\.\d+) - - \[(\d+/[a-zA-Z]+/\d{4}:\d{2}:\d{2}:\d{2} [\+\-]\d{4})\] "([A-Z]+) (.+\s)HTTP/(\d)\.(\d)" (\d+) ([0-9\-]+)" "(.+)" "(.+)"# -// LogType: IPConvertedCommonWeb -// IP345678 - - [01/Apr/2018:13:35:16 +0900] "GET /css/font/NanumBarunGothicBold.woff HTTP/1.1" 200 2195716 -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum WeblogType { - Common, - Combined, - Unknown, -} +use serde::{Deserialize, Serialize}; +use strum::EnumIter; -impl fmt::Display for WeblogType { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - WeblogType::Common => write!(f, "Apache common log"), - WeblogType::Combined => write!(f, "Apache combined log"), - WeblogType::Unknown => write!(f, "Unknown log format"), - } - } -} +pub type ClusterId = usize; +pub type Score = f32; +pub type TidbId = u32; +pub type RuleId = u32; +pub type PatternId = (TidbId, RuleId); +pub type MessageId = String; +pub type TokensVector = Vec<(MessageId, Vec, Vec)>; /* Datasource data type */ -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum DataType { +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum EventType { Csv, - Log(WeblogType), + Log, Packet, - Unknown, } -impl Default for DataType { +impl Default for EventType { fn default() -> Self { - DataType::Unknown + EventType::Csv } } -impl FromStr for DataType { - type Err = (); - fn from_str(input: &str) -> Result { - match input { - "csv" => Ok(DataType::Csv), - "log" => Ok(DataType::Log(WeblogType::Unknown)), - "packet" => Ok(DataType::Packet), - _ => Err(()), +impl std::fmt::Display for EventType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + EventType::Csv => write!(f, "csv"), + EventType::Log => write!(f, "log"), + EventType::Packet => write!(f, "packet"), } } } -// Datetime format definition. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum DatetimeFormat { - // RFC2822 : Tue, 1 Jul 2003 10:52:37 +0200 or Wed, 18 Feb 2015 23:16:09 GMT - Rfc2822, - // RFC3339 : 1996-12-19T16:39:57-08:00 - Rfc3339, - // RFC3339Opts : 2018-01-26T18:30:09.453Z or 2018-01-26T18:30:09Z - // Apache log datetime format : 18/Apr/2019:16:22:00 +0900 - Apache, - Unknown, -} - -impl Default for DatetimeFormat { - fn default() -> Self { - DatetimeFormat::Unknown - } -} - -impl fmt::Display for DatetimeFormat { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - DatetimeFormat::Rfc2822 => write!(f, "RFC2822"), - DatetimeFormat::Rfc3339 => write!(f, "RFC3339"), - DatetimeFormat::Apache => write!(f, "ApacheLog"), - DatetimeFormat::Unknown => write!(f, "Unknown"), +impl std::str::FromStr for EventType { + type Err = (); + fn from_str(input: &str) -> Result { + match input { + "csv" => Ok(EventType::Csv), + "log" => Ok(EventType::Log), + "packet" => Ok(EventType::Packet), + _ => Err(()), } } } @@ -106,7 +60,7 @@ impl fmt::Display for DatetimeFormat { #[macro_export] macro_rules! bold { ($x:expr) => { - Style::new().bold().paint($x) + ansi_term::Style::new().bold().paint($x) }; } @@ -154,65 +108,9 @@ macro_rules! hashmap { }} } -#[must_use] -pub fn detect_datetime_format(dt: &str) -> DatetimeFormat { - if DateTime::parse_from_rfc2822(dt).is_ok() { - return DatetimeFormat::Rfc2822; - } else if DateTime::parse_from_rfc3339(dt).is_ok() { - return DatetimeFormat::Rfc3339; - } else if let Ok(re) = - Regex::new(r"(\d+)/([a-zA-Z]+)/(\d{4}):(\d{2}):(\d{2}):(\d{2}) ([\+\-]\d{4})") - { - if re.captures_len() > 0 { - return DatetimeFormat::Apache; - } - } - DatetimeFormat::Unknown -} - -impl DatetimeFormat { - #[allow(dead_code)] - fn parse_datetime(self, s: &str) -> i64 { - match self { - DatetimeFormat::Rfc2822 => { - if let Ok(dt) = DateTime::parse_from_rfc2822(s) { - return dt.timestamp(); - } - } - DatetimeFormat::Rfc3339 => { - if let Ok(dt) = DateTime::parse_from_rfc3339(s) { - return dt.timestamp(); - } - } - DatetimeFormat::Apache => { - // 18/Apr/2019:16:22:00 +0900 => 2019 Apr 18 16:22:00 +0900 - if let Ok(re) = Regex::new( - r"(?P\d+)/(?P[a-zA-Z]+)/(?P\d{4}):(?P\d{2}):(?P\d{2}):(?P\d{2}) (?P[\+\-]\d{4})", - ) { - if let Some(cap) = re.captures(s) { - let dt = format!( - "{} {} {} {}:{}:{} {}", - &cap["year"], - &cap["month"], - &cap["day"], - &cap["hour"], - &cap["min"], - &cap["sec"], - &cap["zone"] - ); - if let Ok(d) = DateTime::parse_from_str(&dt, "%Y %b %d %H:%M:%S %z") { - return d.timestamp(); - } - } - } - } - DatetimeFormat::Unknown => {} - } - 0 - } -} - -#[derive(Debug, Clone, Copy, Eq, FromPrimitive, ToPrimitive, Hash, Ord, PartialOrd, PartialEq)] +#[derive( + Debug, Clone, Copy, EnumIter, Eq, FromPrimitive, ToPrimitive, Hash, Ord, PartialOrd, PartialEq, +)] pub enum Qualifier { Benign = 1, Unknown = 2, @@ -220,6 +118,7 @@ pub enum Qualifier { Mixed = 4, } +pub const MAX_QUALIFIERS: usize = 4; pub const ORDERED_QUALIFIERS: [Qualifier; 4] = [ Qualifier::Benign, Qualifier::Unknown, @@ -227,7 +126,22 @@ pub const ORDERED_QUALIFIERS: [Qualifier; 4] = [ Qualifier::Mixed, ]; -pub const MAX_QUALIFIERS: usize = 4; +#[derive(Default)] +pub struct QualifierCount { + count: [usize; ORDERED_QUALIFIERS.len()], +} + +impl std::fmt::Display for QualifierCount { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + ORDERED_QUALIFIERS.iter().enumerate().for_each(|(i, q)| { + if i > 0 { + let _ = write!(f, ", "); + } + let _ = write!(f, "{:?} = {}", q, self.count[i]); + }); + write!(f, "") + } +} #[must_use] pub fn qualifiers_header() -> Vec { @@ -240,7 +154,7 @@ impl Default for Qualifier { } } -impl FromStr for Qualifier { +impl std::str::FromStr for Qualifier { type Err = (); fn from_str(value: &str) -> Result { match value { @@ -253,8 +167,8 @@ impl FromStr for Qualifier { } } -impl fmt::Display for Qualifier { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl std::fmt::Display for Qualifier { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Qualifier::Benign => write!(f, "{}", boldgreen!("benign")), Qualifier::Unknown => write!(f, "unknown"), @@ -264,85 +178,44 @@ impl fmt::Display for Qualifier { } } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum QualifiedBy { - ClusterSize, - HttpStatus, - IPaddr, - Label, -} - -impl Default for QualifiedBy { - fn default() -> Self { - QualifiedBy::Label - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Modified { - No, - Yes, - New, - WillRemoved, - Deleted, -} - -impl Default for Modified { - fn default() -> Self { - Modified::No - } -} - -impl fmt::Display for Modified { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Modified::No => write!(f, "Not modified"), - Modified::Yes => write!(f, "{}", blue!("Modified")), - Modified::New => write!(f, "New"), - Modified::WillRemoved => write!(f, "{}", red!("Will be removed")), - Modified::Deleted => write!(f, "Deleted"), - } - } -} - #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum FilterOp { L, - Le, + LE, G, - Ge, - Eq, - Ne, + GE, + EQ, + NE, } impl Default for FilterOp { fn default() -> Self { - FilterOp::Eq + FilterOp::EQ } } -impl fmt::Display for FilterOp { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl std::fmt::Display for FilterOp { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { FilterOp::L => write!(f, "<"), FilterOp::G => write!(f, ">"), - FilterOp::Le => write!(f, "<="), - FilterOp::Ge => write!(f, ">="), - FilterOp::Eq => write!(f, "="), - FilterOp::Ne => write!(f, "<>"), + FilterOp::LE => write!(f, "<="), + FilterOp::GE => write!(f, ">="), + FilterOp::EQ => write!(f, "="), + FilterOp::NE => write!(f, "<>"), } } } -impl FromStr for FilterOp { +impl std::str::FromStr for FilterOp { type Err = (); fn from_str(input: &str) -> Result { match input { "<" => Ok(FilterOp::L), - "<=" => Ok(FilterOp::Le), + "<=" => Ok(FilterOp::LE), ">" => Ok(FilterOp::G), - ">=" => Ok(FilterOp::Ge), - "=" => Ok(FilterOp::Eq), - "<>" => Ok(FilterOp::Ne), + ">=" => Ok(FilterOp::GE), + "=" => Ok(FilterOp::EQ), + "<>" => Ok(FilterOp::NE), _ => Err(()), } } @@ -378,15 +251,15 @@ pub enum SortType { Score, } -impl fmt::Display for SortType { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl std::fmt::Display for SortType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:?}", self) } } #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ConfigType { - CsvStyle(bool), + SamplesCount(usize), Reverse(bool), Samples(bool), Signature(bool), @@ -394,18 +267,19 @@ pub enum ConfigType { } pub struct CliConf { - pub csv_style: ConfigType, + pub samples_count: ConfigType, pub csv_fields: Vec, pub show_samples: ConfigType, pub reverse: ConfigType, pub show_signature: ConfigType, pub show_tokens: ConfigType, } +const DEFAULT_SAMPLES_DISPLAY_COUNT: usize = 30; impl Default for CliConf { fn default() -> Self { CliConf { - csv_style: ConfigType::CsvStyle(false), + samples_count: ConfigType::SamplesCount(DEFAULT_SAMPLES_DISPLAY_COUNT), csv_fields: Vec::new(), show_samples: ConfigType::Samples(true), reverse: ConfigType::Reverse(false), @@ -415,77 +289,35 @@ impl Default for CliConf { } } -// type of label matching objects -#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, FromPrimitive, ToPrimitive)] -pub enum PatternType { - Token = 1, - Signature = 2, // regular expression - Keywords = 3, - White = 4, // whitelist or words for dictionary -} - -impl fmt::Display for PatternType { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - PatternType::Token => write!(f, "T"), - PatternType::Signature => write!(f, "S"), - PatternType::Keywords => write!(f, "K"), - PatternType::White => write!(f, "W"), +impl CliConf { + fn samples_count(&self) -> usize { + if let ConfigType::SamplesCount(count) = self.samples_count { + count + } else { + DEFAULT_SAMPLES_DISPLAY_COUNT } } -} -pub const ORDERED_PATTERNTYPE: [PatternType; 4] = [ - PatternType::Token, - PatternType::Signature, - PatternType::Keywords, - PatternType::White, -]; - -impl Default for PatternType { - fn default() -> Self { - PatternType::Token + fn is_show_samples_on(&self) -> bool { + self.show_samples == ConfigType::Samples(true) } -} -impl PatternType { - fn weight(self) -> f64 { - match self { - PatternType::Token => 1.0, - PatternType::Signature | PatternType::Keywords | PatternType::White => 10.0, - } + fn is_show_signature_on(&self) -> bool { + self.show_signature == ConfigType::Signature(true) } -} -#[derive(Debug, PartialEq, Eq)] -pub enum Probability { - VeryLow, - Low, - Medium, - High, - VeryHigh, -} - -impl From for Probability { - fn from(f: f64) -> Probability { - match f { - x if x <= 1.0 => Probability::VeryLow, - x if x <= 2.0 => Probability::Low, - x if x <= 10.0 => Probability::Medium, - x if x <= 30.0 => Probability::High, - _ => Probability::VeryHigh, - } + #[must_use] + pub fn is_reverse_on(&self) -> bool { + self.reverse == ConfigType::Reverse(true) } -} -impl fmt::Display for Probability { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Probability::VeryLow => write!(f, "VeryLow"), - Probability::Low => write!(f, "Low"), - Probability::Medium => write!(f, "Medium"), - Probability::High => write!(f, "{}", red!("High")), - Probability::VeryHigh => write!(f, "{}", boldred!("VeryHigh")), + pub fn set(&mut self, x: ConfigType) { + match x { + ConfigType::SamplesCount(_) => self.samples_count = x, + ConfigType::Reverse(_) => self.reverse = x, + ConfigType::Samples(_) => self.show_samples = x, + ConfigType::Signature(_) => self.show_signature = x, + ConfigType::Tokens(_) => self.show_tokens = x, } } } diff --git a/src/bin/cli.rs b/src/main.rs similarity index 80% rename from src/bin/cli.rs rename to src/main.rs index 4a573c8..45c4f64 100644 --- a/src/bin/cli.rs +++ b/src/main.rs @@ -1,43 +1,34 @@ -use labeler::matcher::TitleMatch; -use labeler::{CliConf, ClusterId, ConfigType, FilterOp, FilterType, LabelId, Qualifier}; - use ansi_term::Style; use anyhow::Result; -use log::info; +use labeler::{ + config::Config, matcher::TitleMatch, CliConf, ClusterId, ConfigType, FilterOp, FilterType, + Qualifier, +}; +use log::{error, info}; use rustyline::{config::Configurer, error::ReadlineError}; use rustyline_derive::{Helper, Highlighter, Hinter, Validator}; -use std::collections::LinkedList; -use std::mem; -use std::str::FromStr; +use std::{collections::LinkedList, str::FromStr}; use structopt::StructOpt; #[derive(Debug, StructOpt)] struct Opt { #[structopt(short, long)] - force: bool, - - #[structopt(short, long)] - model: String, - - #[structopt( - short, - long, - default_value = "host=localhost user=postgres password=postgres" - )] - dburl: String, + config_path: String, } fn main() { env_logger::init(); let opt = Opt::from_args(); - if let Err(e) = run(&opt.dburl, opt.force, &opt.model) { - log::error!("{:#}", e); + let cfg = Config::init(&opt.config_path); + + if let Err(e) = run(&cfg) { + error!("{:#}", e); } } #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum CliCmd { - ClusterId, + ClusterID, Exit, Filter(FilterType, FilterOp), GoNext, @@ -120,10 +111,10 @@ const COMMAND_HISTORY_FILE: &str = ".cli_history.txt"; /// /// Will return `Err` if database connection failed or labeldb_* tables are not exist in database. #[allow(clippy::too_many_lines)] -fn run(dburl: &str, force: bool, model_name: &str) -> Result<()> { - let mut champion = TitleMatch::init(dburl, force, model_name)?; - let (mut limit, _) = champion.count_clusters(); - champion.load_matchup_result(dburl)?; +fn run(cfg: &Config) -> Result<()> { + let mut champion = TitleMatch::new(cfg)?; + let mut limit = champion.count_clusters(); + champion.show_statistics(); let mut rl = rustyline::Editor::::new(); let completer = CmdCompleter { @@ -131,16 +122,17 @@ fn run(dburl: &str, force: bool, model_name: &str) -> Result<()> { }; rl.set_helper(Some(completer)); rl.set_completion_type(rustyline::CompletionType::List); - mem::drop(rl.load_history(COMMAND_HISTORY_FILE)); + let _r = rl.load_history(COMMAND_HISTORY_FILE); let mut prompt: LinkedList<(String, Option, usize)> = LinkedList::new(); let style = Style::new().reverse(); let mut title: String = String::from("Clusters"); + let mut tag: String; let mut ticks: Option = None; - let mut cfg = CliConf::default(); + let mut clicfg = CliConf::default(); loop { - let tag = if ticks.is_none() { + tag = if ticks.is_none() { format!("\n{} [{}]# ", style.paint(&title), limit) } else { format!( @@ -153,7 +145,7 @@ fn run(dburl: &str, force: bool, model_name: &str) -> Result<()> { let (cmdtype, opt) = get_user_input(&mut rl, &tag); info!("Command: {:?}, option: {:?}", cmdtype, opt); match cmdtype { - CliCmd::ClusterId => { + CliCmd::ClusterID => { if let Some(s) = opt { if let Ok(cid) = s.parse::() { ticks = champion.find_cluster(cid); @@ -187,11 +179,7 @@ fn run(dburl: &str, force: bool, model_name: &str) -> Result<()> { continue; } CliCmd::GoNext | CliCmd::GoPrev => { - ticks = Some(do_goto( - cmdtype, - ticks, - cfg.reverse == ConfigType::Reverse(true), - )); + ticks = Some(do_goto(cmdtype, ticks, clicfg.is_reverse_on())); } CliCmd::Help => { show_help(); @@ -207,19 +195,13 @@ fn run(dburl: &str, force: bool, model_name: &str) -> Result<()> { } } CliCmd::QuitProgram => break, - CliCmd::Save(_) => { - /* save qualifiers and labels */ - champion.cli_save(dburl)?; - continue; - } + // CliCmd::Save(_) => { + // /* save qualifiers and labels */ + // // let _ = champion.cli_save(cfg); + // continue; + // } CliCmd::Set(x) => { - match x { - ConfigType::CsvStyle(_) => cfg.csv_style = x, - ConfigType::Reverse(_) => cfg.reverse = x, - ConfigType::Samples(_) => cfg.show_samples = x, - ConfigType::Signature(_) => cfg.show_signature = x, - ConfigType::Tokens(_) => cfg.show_tokens = x, - } + clicfg.set(x); println!("set {:?}\n", x); continue; } @@ -230,8 +212,8 @@ fn run(dburl: &str, force: bool, model_name: &str) -> Result<()> { } } } - CliCmd::Status => { - champion.print_statistics(); + CliCmd::Save(_) | CliCmd::Status => { + // champion.print_statistics(); continue; } CliCmd::Undefined => { @@ -249,7 +231,7 @@ fn run(dburl: &str, force: bool, model_name: &str) -> Result<()> { } if let Some(v) = ticks { - champion.print_cluster(v, &cfg); + champion.print_cluster(v, &clicfg); } } @@ -258,7 +240,7 @@ fn run(dburl: &str, force: bool, model_name: &str) -> Result<()> { } fn do_goto(cmd: CliCmd, ticks: Option, reverse: bool) -> usize { - ticks.map_or(0, |v| { + if let Some(v) = ticks { if (cmd == CliCmd::GoNext && !reverse) || (cmd == CliCmd::GoPrev && reverse) { v + 1 } else if v == 0 { @@ -266,7 +248,9 @@ fn do_goto(cmd: CliCmd, ticks: Option, reverse: bool) -> usize { } else { v - 1 } - }) + } else { + 0 + } } fn do_filtering( @@ -277,45 +261,26 @@ fn do_filtering( ) -> Option { let len = match ft { FilterType::Count | FilterType::Qualifier | FilterType::Score => { - pattern.and_then(|s| champion.filter_by(ft, op, s)) - } - #[allow(clippy::option_if_let_else)] // filter_by_label rEquires `&mut TitleMatch` - FilterType::Label => { if let Some(s) = pattern { - if let Ok(v) = s.parse::() { - champion.filter_by_label(ft, op, Some(v)) - } else { - None - } + champion.filter_by(ft, op, s) } else { - champion.filter_by_label(ft, op, None) + None } } - FilterType::Regex => pattern.and_then(|s| champion.filter_by_regex(s)), - /* - FilterType::IPaddr => { - if pattern.starts_with('#') { - let s = pattern.get(1..)?; - champion.filter_by_ipaddr(s, true) + FilterType::Label => { + if let Some(s) = pattern { + champion.filter_by_label(ft, op, Some(s)) } else { - champion.filter_by_ipaddr(pattern, false) + champion.filter_by_label(ft, op, None) } } - FilterType::Status => { - let statuscode = pattern.parse::().ok()?; - champion.filter_by_status(statuscode) - } - FilterType::Time => { - if pattern.starts_with('#') { - let s = pattern.get(1..)?; - let cidx = s.parse::().ok()?; - champion.filter_by_time_range(cidx) + FilterType::Regex => { + if let Some(s) = pattern { + champion.filter_by_regex(s) } else { None } } - FilterType::Token => champion.search_tokens(pattern), - */ _ => None, }; @@ -360,7 +325,7 @@ fn get_user_input(rl: &mut rustyline::Editor, tag: &str) -> (CliCm } else if line.starts_with('#') { if let Some(s) = line.get(1..) { if s.parse::().is_ok() { - return (CliCmd::ClusterId, Some((*s).to_string())); + return (CliCmd::ClusterID, Some((*s).to_string())); } } } @@ -383,24 +348,24 @@ fn get_user_input(rl: &mut rustyline::Editor, tag: &str) -> (CliCm } } } - ["/filter", "label"] => return (CliCmd::Filter(FilterType::Label, FilterOp::Eq), None), + ["/filter", "label"] => return (CliCmd::Filter(FilterType::Label, FilterOp::EQ), None), ["/filter", "label", x] => { return ( - CliCmd::Filter(FilterType::Label, FilterOp::Eq), + CliCmd::Filter(FilterType::Label, FilterOp::EQ), Some((*x).to_string()), ) } ["/filter", "qualifier", x] => { if Qualifier::from_str(x).is_ok() { return ( - CliCmd::Filter(FilterType::Qualifier, FilterOp::Eq), + CliCmd::Filter(FilterType::Qualifier, FilterOp::EQ), Some((*x).to_string()), ); } } ["/filter", "regex", x] => { return ( - CliCmd::Filter(FilterType::Regex, FilterOp::Eq), + CliCmd::Filter(FilterType::Regex, FilterOp::EQ), Some((*x).to_string()), ) } @@ -441,7 +406,6 @@ fn get_user_input(rl: &mut rustyline::Editor, tag: &str) -> (CliCm }; match *x { "benign" => return (CliCmd::SetQualifier(all), Some(String::from("benign"))), - "csvstyle" => return (CliCmd::Set(ConfigType::CsvStyle(op)), None), "mixed" => return (CliCmd::SetQualifier(all), Some(String::from("mixed"))), "reverse" => return (CliCmd::Set(ConfigType::Reverse(op)), None), "samples" => return (CliCmd::Set(ConfigType::Samples(op)), None), diff --git a/src/matcher.rs b/src/matcher.rs index c308923..236d75a 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -1,30 +1,16 @@ use crate::cluster::Clusters; -use crate::database::{AutoDb, Datasource}; -use crate::events::{EventTokens, MessageId}; -use crate::labelmap::TopLabels; -use crate::weblog_parser::{HttpFields, HttpStatus}; -use crate::{ - bold, CliConf, ClusterId, ConfigType, DataType, FilterOp, FilterType, LabelId, PatternId, - PatternType, Probability, QualifiedBy, Qualifier, MAX_QUALIFIERS, ORDERED_PATTERNTYPE, - ORDERED_QUALIFIERS, -}; - -use ansi_term::Style; -use anyhow::{anyhow, Context, Result}; -#[cfg(all(target_arch = "x86_64", feature = "hyperscan"))] -use hyperscan::prelude::*; -use itertools::Itertools; -use num::ToPrimitive; -use num_traits::FromPrimitive; -use regex::Regex; -#[cfg(not(all(target_arch = "x86_64", feature = "hyperscan")))] -use regex::RegexSet; -use std::cmp::Ordering; -use std::collections::{HashMap, HashSet}; -use std::convert::{TryFrom, TryInto}; +use crate::config::Config; +use crate::events::Events; +use crate::labels::Labels; +use crate::tidb::ComplexRules; +use crate::{bold, CliConf, ClusterId, EventType, FilterOp, FilterType, Qualifier, RuleId, TidbId}; +use anyhow::{anyhow, Result}; +use log::info; +use std::convert::TryFrom; use std::fmt; use std::str::FromStr; +/// This structure stores the result of `cli` command `/filter ipaddr/regex/label/...` #[derive(Default)] pub struct FilteredClusters { filtertype: FilterType, @@ -43,786 +29,158 @@ impl fmt::Display for FilteredClusters { } } -#[derive(Default)] -pub struct Candidates { - by: QualifiedBy, - clusters: HashMap>, -} - -impl Candidates { - fn add(&mut self, cidx: ClusterId, q: Qualifier) { - if let Some(cv) = self.clusters.get_mut(&q) { - cv.push(cidx); - } else { - self.clusters.insert(q, vec![cidx]); - } - } - - fn remove(&mut self, cidx: ClusterId, q: Qualifier) { - if let Some(cv) = self.clusters.get_mut(&q) { - cv.retain(|i| *i != cidx); - } - } - - #[must_use] - pub fn qualified_count(&self) -> usize { - if self.by == QualifiedBy::Label { - self.clusters.iter().fold(0, |sum, (_, v)| sum + v.len()) - } else { - self.clusters - .iter() - .filter(|(q, _)| **q != Qualifier::Unknown) - .fold(0, |sum, (_, v)| sum + v.len()) - } - } - - #[must_use] - fn unknown_clusters(&self) -> usize { - self.clusters.get(&Qualifier::Unknown).map_or(0, Vec::len) - } - - fn included_both(&mut self, q: Qualifier, other: &[ClusterId]) -> Vec { - self.clusters.get(&q).map_or_else(Vec::new, |cidx_list| { - cidx_list - .iter() - .filter_map(|cidx| { - if other.contains(cidx) { - Some(*cidx) - } else { - None - } - }) - .collect() - }) - } - - pub fn unknown_to_suspicious(&mut self, other: &Self) -> usize { - if let Some(suspicious_clusters) = other.clusters.get(&Qualifier::Suspicious) { - let both = self.included_both(Qualifier::Unknown, suspicious_clusters); - if !both.is_empty() { - for cidx in &both { - self.remove(*cidx, Qualifier::Unknown); - self.add(*cidx, Qualifier::Suspicious); - } - } - } - self.qualified_count() - } -} - -impl fmt::Display for Candidates { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - for (q, cv) in &self.clusters { - writeln!(f, "{}: {} clusters", q, cv.len())?; - writeln!(f, "\t{:?}", cv)?; - } - Ok(()) - } -} - -#[derive(Default)] -pub struct MatchUp { - qualifier: Qualifier, - matched: HashMap)>, -} - -// f64 => score -// Vec => token/keywords/signature list -type MatchedLabels = (LabelId, f64, Vec); - -impl MatchUp { - #[must_use] - pub fn new( - qualifier: Qualifier, - lab_id: LabelId, - score: f64, - pid_list: Vec, - ) -> Self { - let mut matched: HashMap)> = HashMap::new(); - matched.insert(lab_id, (score, pid_list)); - MatchUp { qualifier, matched } - } - - #[must_use] - pub fn fields(&self) -> Vec<(LabelId, (f64, &[PatternId]))> { - self.matched - .iter() - .map(|(lab_id, (score, pid_list))| (*lab_id, (*score, pid_list.as_ref()))) - .collect() - } - - #[must_use] - pub fn qualifier(&self) -> Qualifier { - self.qualifier - } - - pub fn append_matched(&mut self, lab_id: LabelId, pids: Vec, score: f64) { - if let Some((s, pid_list)) = self.matched.get_mut(&lab_id) { - *s = score; - pid_list.extend(pids); - } else { - self.matched.insert(lab_id, (score, pids)); - } - } - - #[must_use] - pub fn matched_labels(&self, toplabs: &TopLabels, flag: Option) -> Vec { - // make matched labels list including score and patterns - let mut rst: Vec = Vec::new(); - self.matched.iter().for_each(|(lab_id, (score, pid_list))| { - if let Some(f) = flag { - let v = toplabs.filter_by_useful(*lab_id, pid_list, f); - if !v.is_empty() { - rst.push((*lab_id, *score, v)); - } - } else { - rst.push((*lab_id, *score, pid_list.clone())); - } - }); - - // sort by sore in reverse order. Highest scores first - rst.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(Ordering::Equal)); - rst - } - - #[must_use] - pub fn have_useful_match(&self, lab_id: Option, toplabs: &TopLabels) -> Option { - if let Some(lid) = lab_id { - if let Some((_, pid_list)) = self.matched.get(&lid) { - let v = toplabs.filter_by_useful(lid, pid_list, true); - if !v.is_empty() { - return Some(v.len()); - } - } - } else { - let cnt = self.matched.iter().fold(0, |sum, (lab_id, (_, pid_list))| { - sum + toplabs.filter_by_useful(*lab_id, pid_list, true).len() - }); - if cnt > 0 { - return Some(cnt); - } - } - None - } - - pub fn calculate_score(&mut self, toplabs: &TopLabels) { - self.matched - .iter_mut() - .for_each(|(lab_id, (score, pid_list))| { - *score = toplabs.calculate_score(*lab_id, pid_list); - }); - } -} - -pub type ClusterMatchUp = HashMap; - -#[derive(Default)] pub struct TitleMatch { - model: Datasource, clusters: Clusters, - events: EventTokens, - labels: TopLabels, - layers: Vec, // for labeling - rounds: Vec, // for cli use - //pub ip_clusters_map: HashMap, - matchup_result: ClusterMatchUp, + events: Events, + tidbs: Vec, + labels: Labels, + rounds: Vec, } impl TitleMatch { /// # Errors /// /// Will return `Err` if it fails to connect postgres db or datasource not found - pub fn init(dburl: &str, force: bool, model_name: &str) -> Result { - let mut db = AutoDb::new(dburl)?; - let mut model = db.get_datasource(model_name)?; - let (model_id, _, model_logtype, _) = model.fields(); - log::info!( - "processing model #{} \"{}\", {:?}", - model_id, - model_name, - model_logtype - ); - - match model_logtype { - DataType::Csv | DataType::Packet | DataType::Unknown => { - return Err(anyhow!("unsupported log type {:?}", model_logtype)) - } - DataType::Log(_) => {} + pub fn new(cfg: &Config) -> Result { + if EventType::Packet == cfg.event_type() { + return Err(anyhow!("unsupported log type {:?}", cfg.event_type())); } - // load clusters - let mut clusters = Clusters::default(); - clusters.load_clusters(&mut db, model_id)?; - log::info!( - "model #{}: {} clusters are loaded.", - model_id, - clusters.len() - ); - if force { - clusters.clear_qualifiers(); - } + info!("loading labels"); + let labels = Labels::new(cfg.labels())?; - db.initialize_autodb()?; + info!("loading clusters"); + let mut clusters = Clusters::new(cfg.clusters(), &labels)?; + if clusters.is_empty() { + return Err(anyhow!("clusters not found.")); + } + info!("{} clusters are loaded.", clusters.len()); - // load events, tokenize, and save - let mut events = EventTokens::default(); - // fields of weblog to extract tokens - let token_fields = vec![HttpFields::Request as usize, HttpFields::Referer as usize]; - events.load_events_and_tokenize(&mut db, &mut model, force, &token_fields)?; - events.make_events_ipaddr_map(); - events.make_events_httpstatus_map(); + info!("loading events"); + let events = Events::new(cfg, clusters.event_ids())?; if events.is_empty() { return Err(anyhow!("events not found.")); } + info!("{} events are loaded.", events.len()); - // make tokens-clusters relationship map - clusters.make_tokens_clusters_map(&events); - // make clusters-ipaddr relationship map - clusters.make_clusters_ipaddr_map(&events); - // collect httpstatus for each clsuters - clusters.get_cluster_httpstatus(&events); - db.initialize_labeldb()?; + clusters.init_event_tokens(&events); - // load labels - let mut labels = TopLabels::default(); - let label_count = labels.load_labels(&mut db)?; - log::info!("{} labels are loaded.", label_count); - labels.load_dict(&mut db)?; + info!("loading tidb"); + let tidbs = ComplexRules::new(cfg.tidb())?; + // init base(bottom filter) layer let rounds: Vec = vec![FilteredClusters { filtertype: FilterType::default(), op: FilterOp::default(), pattern: String::from("Clusters"), clusters: clusters.cluster_list().clone(), }]; + Ok(TitleMatch { - model, clusters, events, + tidbs, labels, - layers: Vec::new(), rounds, - matchup_result: HashMap::new(), }) } - /// # Errors - /// - /// Will return `Err` if it fails to query `labeldb_match_result`. - pub fn load_matchup_result(&mut self, dburl: &str) -> Result { - let mut db = AutoDb::new(dburl)?; - self.matchup_result = db.get_match_result(self.model.model_id())?; - self.calculate_label_probability(); - log::info!( - "model #{}: {} matched clusters.", - self.model.model_id(), - self.matchup_result.len() + pub fn show_statistics(&self) { + let (labeled_clusters, labeled_events, representative_labels) = self.labels.statistics(); + println!( + "{:>6} clusters\n{:>6} labeled clusters\n{:>6} labeled events\n{:>6} representatives", + self.clusters.len(), + labeled_clusters, + labeled_events, + representative_labels ); - Ok(self.matchup_result.len()) } #[must_use] - pub fn merge_all_qualified(&self) -> ClusterMatchUp { - // merge all qualified results - - let mut merged: ClusterMatchUp = HashMap::new(); - // Step 1: merge all candidates by it's qualifier, except unkowns. - // For unknowns, take the last layer's unknown clusters - self.layers - .iter() - .filter(|c| c.by != QualifiedBy::Label) - .for_each(|src| { - // copy candidates to merged - src.clusters.iter().for_each(|(q, vec_cluster)| { - for cid in vec_cluster.iter() { - if let Some(mu) = merged.get_mut(cid) { - // Only unknown clusters can be found multiple times. - // except label-matched layer. - // The cluster found later is latest qualified result. - // update previous qualifier with the latest - if mu.qualifier == Qualifier::Unknown { - mu.qualifier = *q; - } - } else { - let mu = MatchUp { - qualifier: *q, - ..MatchUp::default() - }; - merged.insert(*cid, mu); - } - } - }); - }); - - // Step 2: copy the merged cluster's qualifier to labeled clusters - // (discarded idea) (X) if there are label matched clusters in the merged unknowns, move it to suspicious. - self.matchup_result.iter().for_each(|(cid, mu)| { - if !mu.matched.is_empty() { - // copy label match result to merged - if let Some(merged_mu) = merged.get_mut(cid) { - merged_mu.matched = mu.matched.clone(); - } else { - // for DEBUG - // The merged of all layers should contain all clusters. - log::error!( - "model #{}: {} - misplaced labeled cluster.", - self.model.model_id(), - cid - ); - } - } - }); - - merged - } - - /// # Errors - /// - /// Will return `Err` if it fails to connect postgres or `labeldb_match` table does not exist! - pub fn labeler_save( - &mut self, - dburl: &str, - merged: &ClusterMatchUp, - tokens_usage: &HashMap<(PatternType, String), Vec>, - force: bool, - ) -> Result<()> { - if merged.is_empty() { - return Ok(()); - } - - let mut db = AutoDb::new(dburl)?; - let model_id = self.model.model_id(); - // the force option has no effect on saving match results. Overwrite the match result always. - let (mut cnt, q) = db.save_match_result(model_id, merged)?; - log::info!("model #{}: {} match records are saved.", model_id, cnt); - log::info!("model #{}: {} qualifiers are updated.", model_id, q); - - // save tokens if new or updated tokens are exists - cnt = self.events.save(&mut db, model_id, force)?; - if cnt == 0 { - log::info!("model #{}: 0 new event tokens.", model_id); - } else { - log::info!("model #{}: {} event tokens are saved.", model_id, cnt); - } - - // save relocated label patterns, dictionary - log::info!("model #{}: save updated labels and dictionary.", model_id); - self.labels.save_and_update_labels(&mut db, force)?; - self.labels - .save_and_reload_dictionary(&mut db, true, false)?; - - log::info!( - "model #{}: saving label patterns usage statistics...", - model_id - ); - cnt = db.save_labeldb_tokens_usage(model_id, tokens_usage)?; - log::info!( - "model #{}: {} patterns usage records are saved.", - model_id, - cnt - ); - Ok(()) - } - - /// # Errors - /// - /// Will return `Err` if it failed to update cluster qualifier. - pub fn cli_save(&mut self, dburl: &str) -> Result<()> { - // save qualifiers and labels - // TODO: set and save cluster labels - let mut db = AutoDb::new(dburl)?; - let id = self.model.model_id(); - let c = self.clusters.save_cluster_qualifier(&mut db, id)?; - log::info!("model #{}: {} clusters are updated.", id, c); - Ok(()) - } - - pub fn add_layer(&mut self, candidates: Candidates) { - self.layers.push(candidates); - } - - #[must_use] - pub fn layers_depth(&self) -> usize { - self.layers.len() - } - - #[must_use] - pub fn count_clusters(&self) -> (usize, usize) { - let qualified = self - .layers - .iter() - .fold(0, |sum, l| sum + l.qualified_count()); - (self.clusters.len(), qualified) - } - - #[must_use] - pub fn unknown_clusters(&self) -> usize { - self.layers - .last() - .map_or_else(|| self.clusters.len(), Candidates::unknown_clusters) - } - - fn print_qualified_count(cnt: &[usize; MAX_QUALIFIERS]) { - cnt.iter().enumerate().for_each(|(i, v)| { - if i != 0 { - print!(", "); - } - print!( - "{} {}", - Qualifier::from_i32((i + 1).try_into().unwrap_or(0)).unwrap_or_default(), - v - ); - }); - println!(); - } - - pub fn print_statistics(&self) { - println!("{}, {} clusters", self.model, self.clusters.len()); - println!(); - self.rounds.iter().enumerate().for_each(|(i, r)| { - print!("layer {}{}", i, r); - let cnt = self.clusters.cluster_statistics(&r.clusters); - Self::print_qualified_count(&cnt); - }); - - if !self.matchup_result.is_empty() { - println!("\nLabeled"); - self.rounds.iter().enumerate().for_each(|(ridx, r)| { - print!("layer {}{}", ridx, r); - let mut cnt = [0_usize; MAX_QUALIFIERS]; - r.clusters.iter().for_each(|cid| { - if let Some(mu) = self.matchup_result.get(cid) { - if mu.have_useful_match(None, &self.labels).is_some() { - let idx = mu.qualifier.to_u32().unwrap_or(0) as usize; - cnt[idx - 1] += 1; - } - } - }); - Self::print_qualified_count(&cnt); - }); - } + pub fn count_clusters(&self) -> usize { + self.clusters.len() } pub fn print_cluster(&self, idx: usize, cfg: &CliConf) { if let Some(last) = self.rounds.last() { - if idx < last.clusters.len() { - print!("[{}]", idx); - let cid = last.clusters[idx]; - self.clusters.print(cid, &self.events, cfg); - - if let Some(mu) = self.matchup_result.get(&cid) { - let matched = mu.matched_labels(&self.labels, Some(true)); - - if !matched.is_empty() { - println!("\n{}", bold!("Detected label(s):")); - for (lab_id, score, pid_list) in &matched { - if let Some(name) = self.labels.label_name(*lab_id) { - println!( - "[{}] ({}) #{} {}", - Probability::from(*score), - score, - lab_id, - name - ); - if cfg.show_tokens == ConfigType::Tokens(true) { - let patterns = - self.labels.types_and_patterns(*lab_id, pid_list); - for pt in &ORDERED_PATTERNTYPE { - let s: Vec<_> = patterns - .iter() - .filter_map(|(tt, pattern)| { - if *tt == *pt { - Some(pattern) - } else { - None - } - }) - .collect(); - if !s.is_empty() { - println!("\t{} - {}", pt, s.iter().join(", ")); - } - } - } - } - } - } - } - } - } - } - - #[must_use] - pub fn find_cluster(&self, cid: ClusterId) -> Option { - self.rounds - .last() - .and_then(|last| last.clusters.iter().position(|i| *i == cid)) - } - - pub fn qualify_unknowns_by_httpstatus(&mut self) -> Candidates { - let mut candidates = Candidates::default(); - let status = self.clusters.get_cluster_httpstatus(&self.events); - for (cid, codes) in &status { - let mut q: Vec = codes - .iter() - .map(|(st, _)| match *st { - HttpStatus::Informational | HttpStatus::Ok | HttpStatus::Redirection => { - Qualifier::Benign - } - HttpStatus::ClientError | HttpStatus::ServerError => Qualifier::Suspicious, - HttpStatus::Unknown => Qualifier::Unknown, - }) - .collect(); - q.sort(); - q.dedup(); - let final_qualifier = if q.len() == 1 { - q.first().map_or(Qualifier::Unknown, |first| *first) - } else if q.contains(&Qualifier::Benign) && q.contains(&Qualifier::Suspicious) { - Qualifier::Mixed - } else { - Qualifier::Unknown - }; - log::debug!("Status: #{} {} {:?}", cid, final_qualifier, codes); - candidates.add(*cid, final_qualifier); - } - - candidates.by = QualifiedBy::HttpStatus; - candidates - } - - pub fn auto_enable_disable_patterns( - &mut self, - usage: &HashMap<(PatternType, String), Vec>, - ) -> usize { - let mut pos_benign = 0_usize; - let mut pos_suspicious = 0_usize; - ORDERED_QUALIFIERS.iter().enumerate().for_each(|(i, q)| { - if *q == Qualifier::Benign { - pos_benign = i; - } else if *q == Qualifier::Suspicious { - pos_suspicious = i; - } - }); - - // relocate to benign-side: - // Benign > Suspicious && (Suspicious == 0 || Benign/2 > Suspicious) - // relocate to suspicious-side: - // Benign < suspicous && (Benign == 0 || Suspicious/3 > Benign) - // relocate to mixed: (Dictionary) - // Benign/2 <= Suspicious || Suspicious/3 <= Benign (Temp Disable) - // relocate to unknown: - // Benign == 0 && Suspicious == 0 - let mut cnt = 0_usize; - for ((tt, pattern), qcnt) in usage.iter() { - if (qcnt[pos_suspicious] < qcnt[pos_benign]) - && (qcnt[pos_suspicious] == 0 || qcnt[pos_suspicious] < qcnt[pos_benign] / 2) - { - if let Some((ptn_cnt, mod_cnt)) = self.labels.relocate(*tt, pattern, false) { - if mod_cnt > 0 { - cnt += ptn_cnt; - log::info!("relocate {} \"{}\"\tfalse (benign)", tt, pattern); - } - } - } else if qcnt[pos_suspicious] > qcnt[pos_benign] - && (qcnt[pos_benign] == 0 || qcnt[pos_suspicious] / 3 > qcnt[pos_benign]) - { - if let Some((ptn_cnt, mod_cnt)) = self.labels.relocate(*tt, pattern, true) { - if mod_cnt > 0 { - cnt += ptn_cnt; - log::info!("relocate {} \"{}\"\ttrue (suspicious)", tt, pattern); - } - } - } else if qcnt[pos_benign] / 2 <= qcnt[pos_suspicious] - || qcnt[pos_suspicious] / 3 <= qcnt[pos_benign] - { - if let Some((ptn_cnt, mod_cnt)) = self.labels.relocate(*tt, pattern, false) { - if mod_cnt > 0 { - cnt += ptn_cnt; - log::info!("relocate {} \"{}\"\tfalse (mixed)", tt, pattern); - } - } + if idx >= last.clusters.len() { + return; } - } - - cnt - } - - /// matching clusters with label - pub fn clusters_vs_labels(&mut self) -> Candidates { - let mut candidates = Candidates { - by: QualifiedBy::Label, - ..Candidates::default() - }; - - // step 1: matchup - cluster token vs label token (only PatternType::Token) - self.matchup_result = self.clusters.cluster_vs_labels_by_tokens(&self.labels); - log::info!( - "{} clusters are matched by label tokens.", - self.matchup_result.len() - ); - - // TODO: - // 1. All labels are suspcious? - // No. There could be labels for normal(benign) messages - // 2. All label matches are valid? or effective? - // No. ... - for cid in self.matchup_result.keys() { - candidates.add(*cid, Qualifier::Suspicious); - } - - // step 2: event contents of cluster vs label PatternType::Keywords, PatternType::Signature - match self.clusters_vs_labels_by_patterns() { - Ok(x) => { - // TODO: the number of matched events will not affect to scores. is it right? - // convert message id to cluster id, and label id/pattern id data type - let mut matched_clusters = HashSet::new(); - for (msgid, lab_id_u32, pid_u32) in &x { - if let Some(cid) = self.clusters.find_cluster(*msgid) { - if let Ok(pid) = PatternId::try_from(*pid_u32) { - if let Ok(lab_id) = LabelId::try_from(*lab_id_u32) { - matched_clusters.insert((*cid, lab_id, pid)); - } + let cid = last.clusters[idx]; + print!("[{}]", idx); + self.clusters.print(cid, &self.events, cfg); + + let cluster_size = u32::try_from(self.clusters.size(cid)).unwrap_or_default(); + if let Some(matched) = self.labels.get_representative_labels(cid) { + println!("\n{}", bold!("Cluster label(s):")); + for (tidb_id, rule_id, count, score) in matched { + if let Some(name) = Self::get_label_name(self, *tidb_id, *rule_id) { + let score = f64::try_from(*score).unwrap_or_default(); + let dividend = f64::try_from(cluster_size).unwrap_or_default(); + if dividend > 0.0 { + println!( + "{:.03} {}/{} {}:{} {}", + score / dividend, + count, + cluster_size, + tidb_id, + rule_id, + name + ); } } } + } - let mut compressed_matches: HashMap<(ClusterId, LabelId), Vec> = - HashMap::new(); - for (cid, lab_id, pid) in &matched_clusters { - if let Some(pid_list) = compressed_matches.get_mut(&(*cid, *lab_id)) { - pid_list.push(*pid); + if let Some(matched) = self.labels.get_event_labels(cid) { + println!("\n{}", bold!("Event label(s):")); + let mut unknowns = Vec::new(); + for ((tidb_id, rule_id), count) in matched { + if let Some(name) = Self::get_label_name(self, tidb_id, rule_id) { + println!("{:>4} {}:{} {}", count, tidb_id, rule_id, name); } else { - compressed_matches.insert((*cid, *lab_id), vec![*pid]); + unknowns.push(tidb_id); } } - let mut cnts = HashSet::new(); - for ((cid, lab_id), pid_list) in &compressed_matches { - cnts.insert(*cid); - if let Some(mu) = self.matchup_result.get_mut(cid) { - mu.append_matched(*lab_id, pid_list.clone(), 0.0); + unknowns.sort_unstable(); + unknowns.dedup(); + for tidb_id in unknowns { + if let Some(name) = Self::get_tidb_name(self, tidb_id) { + println!("{} {}", tidb_id, name); } else { - let mut mu = MatchUp::default(); - mu.append_matched(*lab_id, pid_list.clone(), 0.0); - self.matchup_result.insert(*cid, mu); + println!("{:>8}:", tidb_id); } } - - log::info!( - "{}/{} messages/clusters are matched by label keywords and signatures.", - x.len(), - cnts.len() - ); } - Err(e) => log::error!("events matching failed when {:?}", e), } - - candidates } - /// # Errors - /// - /// Will return `Err` if label have invalid keywords or signatures - /// like abnormal regular expression syntax. - #[cfg(all(target_arch = "x86_64", feature = "hyperscan"))] - fn clusters_vs_labels_by_patterns(&self) -> Result> { - // result: vector of (message id, label id, keywords or signature index) - let signatures = self.labels.only_regex_patterns(); - let patterns = - Patterns::from_str(&signatures.join("\n")).with_context(|| "make patterns")?; - - // DEBUG message - log::info!("{} signatures are generated.", patterns.len()); - patterns.iter().for_each(|p| { - if let Some(id) = p.id { - log::info!("{:>8}{:>3}: {}", id >> 8, id & 0xff, p.expression); + fn get_tidb_name(&self, tidb_id: TidbId) -> Option<&str> { + for tidb in &self.tidbs { + if tidb.id() == tidb_id { + return Some(tidb.name()); } - }); - - let db: BlockDatabase = patterns.build().with_context(|| "compile patterns")?; - let scratch = db - .alloc_scratch() - .with_context(|| "allocate scratch space")?; - - self.events.regex_match_hyperscan(&db, &scratch) + } + None } - #[cfg(not(all(target_arch = "x86_64", feature = "hyperscan")))] - fn clusters_vs_labels_by_patterns(&self) -> Result> { - // result: vector of (message id, label id, keywords or signature index) - let signatures = self.labels.only_regex_patterns(); - let mut pattern_ids: Vec> = vec![]; - let patterns: Vec<&str> = signatures - .iter() - .flat_map(|sigs| sigs.lines()) - .filter_map(|line| { - let line = line.trim(); - - if line.is_empty() || line.starts_with('#') { - None - } else { - let expr = if let Some(off) = line.find(":/") { - pattern_ids.push(Some( - line[..off].parse().with_context(|| "make patterns").ok()?, - )); - &line[off + 1..] - } else { - pattern_ids.push(None); - line - }; - let rule = match (expr.starts_with('/'), expr.rfind('/')) { - (true, Some(end)) if end > 0 => &expr[1..end], - _ => expr, - }; - - Some(rule) - } - }) - .collect(); - - // DEBUG message - log::info!("{} signatures are generated.", patterns.len()); - pattern_ids.iter().enumerate().for_each(|(idx, id)| { - if let Some(id) = id { - log::info!("{:>8}{:>3}: {}", id >> 8, id & 0xff, patterns[idx]); + fn get_label_name(&self, tidb_id: TidbId, rule_id: RuleId) -> Option<&str> { + for tidb in &self.tidbs { + let r = tidb.get_label_name(tidb_id, rule_id); + if r.is_some() { + return r; } - }); - - let db: RegexSet = RegexSet::new(patterns).with_context(|| "compile patterns")?; - Ok(self.events.regex_set_match(&db)) + } + None } - pub fn filter_by_label( - &mut self, - ft: FilterType, - op: FilterOp, - lab_id: Option, - ) -> Option { - // Filter clusters with label. - // if label_id is none, then all labels. - let last = &self.rounds.last()?.clusters; - let clusters: Vec = self - .matchup_result - .iter() - .filter_map(|(cid, mu)| { - if last.contains(cid) && mu.have_useful_match(lab_id, &self.labels).is_some() { - return Some(*cid); - } - None - }) - .collect(); - if clusters.is_empty() { - None + #[must_use] + pub fn find_cluster(&self, cid: ClusterId) -> Option { + if let Some(last) = self.rounds.last() { + last.clusters.iter().position(|i| *i == cid) } else { - let cnt = clusters.len(); - let pattern = lab_id.map_or_else(|| String::from("All"), |v| v.to_string()); - - self.rounds.push(FilteredClusters { - filtertype: ft, - op, - pattern, - clusters, - }); - Some(cnt) + None } } @@ -830,7 +188,7 @@ impl TitleMatch { let clusters = self .clusters .filter_clusters(&self.rounds.last()?.clusters, ft, op, value); - log::info!( + info!( "filtering by \"{:?} {} {}\". {} clusters", ft, op, @@ -856,12 +214,46 @@ impl TitleMatch { } } + /// Filter clusters with label. + /// if `pattern_id` is none, then all labels. + /// + /// Return the number of filtered clusters + pub fn filter_by_label( + &mut self, + ft: FilterType, + op: FilterOp, + pattern_id: Option<&str>, + ) -> Option { + let (tidb_id, rule_id) = parse_pattern_id(pattern_id); + let last = &self.rounds.last()?.clusters; + let mut found = self.labels.find_clusters(tidb_id, rule_id); + found.retain(|cluster_id| last.contains(cluster_id)); + if found.is_empty() { + None + } else { + let cnt = found.len(); + let pattern = if let Some(v) = pattern_id { + v.to_string() + } else { + String::from("All") + }; + + self.rounds.push(FilteredClusters { + filtertype: ft, + op, + pattern, + clusters: found, + }); + Some(cnt) + } + } + pub fn filter_by_regex(&mut self, pattern: &str) -> Option { let last = self.rounds.last()?; /* ! => negation (trick!!!) */ let mut negate: bool = false; - let regex_p = if pattern.starts_with('!') { + let pattern = if pattern.starts_with('!') { if pattern.len() == 1 { return None; } @@ -871,35 +263,37 @@ impl TitleMatch { pattern }; - let s = match Regex::new(regex_p) { - Ok(s) => s, + match self + .clusters + .regex_match(&last.clusters, pattern, &self.events) + { + Ok(mut clusters) => { + if negate { + clusters = last + .clusters + .iter() + .filter(|cid| !clusters.contains(cid)) + .copied() + .collect(); + } + + if clusters.is_empty() { + None + } else { + let cnt = clusters.len(); + self.rounds.push(FilteredClusters { + filtertype: FilterType::Regex, + op: FilterOp::EQ, + pattern: pattern.to_string(), + clusters, + }); + Some(cnt) + } + } Err(e) => { - println!("Error: {:?}", e); - return None; + eprintln!("Error: {}", e); + None } - }; - - let mut clusters = self.clusters.regex_match(&last.clusters, &s, &self.events); - if negate { - clusters = last - .clusters - .iter() - .filter(|cid| !clusters.contains(cid)) - .copied() - .collect(); - } - - if clusters.is_empty() { - None - } else { - let cnt = clusters.len(); - self.rounds.push(FilteredClusters { - filtertype: FilterType::Regex, - op: FilterOp::Eq, - pattern: pattern.to_string(), - clusters, - }); - Some(cnt) } } @@ -908,51 +302,13 @@ impl TitleMatch { /// Will return `Err` if a try to remove on an empty filter pub fn remove_filter(&mut self) -> Result<()> { if self.rounds.len() > 1 { - self.rounds.pop(); + let _r = self.rounds.pop(); Ok(()) } else { Err(anyhow!("Failed to remove the filtered clusters.")) } } - #[must_use] - pub fn collect_patterns_usage( - &self, - merged: &ClusterMatchUp, - ) -> HashMap<(PatternType, String), Vec> { - let mut usage: HashMap<(PatternType, String), Vec> = HashMap::new(); - merged - .iter() - .filter(|(_, mu)| !mu.matched.is_empty()) - .for_each(|(_, mu)| { - let mut cx_usage: Vec<(PatternType, String)> = Vec::new(); - mu.matched_labels(&self.labels, None) - .iter() - .for_each(|(lab_id, _, pid_list)| { - cx_usage.extend(self.labels.types_and_patterns(*lab_id, pid_list)); - }); - - for pt in &cx_usage { - if let Some(idx) = ORDERED_QUALIFIERS.iter().position(|q| *q == mu.qualifier) { - if let Some(q_cnts) = usage.get_mut(pt) { - q_cnts[idx] += 1; - } else { - let mut q_cnts: Vec = vec![0; MAX_QUALIFIERS]; - q_cnts[idx] = 1; - usage.insert(pt.clone(), q_cnts); - } - } - } - }); - usage - } - - pub fn calculate_label_probability(&mut self) { - for mu in self.matchup_result.values_mut() { - mu.calculate_score(&self.labels); - } - } - pub fn set_qualifier(&mut self, idx: usize, qualifier: &str, all: bool) -> Option { let last = self.rounds.last()?; @@ -980,61 +336,23 @@ impl TitleMatch { } Some(cnt) } +} - // expand suspicious, benign group to unknown with ipaddr - // TODO: if just part of ipaddr are matched to suspicious or benign, ... - pub fn extend_qualification_by_httpipaddr(&mut self) -> Candidates { - let mut candidates = Candidates::default(); - if let Some(last) = self.layers.last() { - if let Some(unknowns) = last.clusters.get(&Qualifier::Unknown) { - let suspicious = last - .clusters - .get(&Qualifier::Suspicious) - .map_or_else(HashSet::new, |clusters| { - self.clusters.get_clusters_share_ipaddr(clusters, unknowns) - }); - let benign = last - .clusters - .get(&Qualifier::Benign) - .map_or_else(HashSet::new, |clusters| { - self.clusters.get_clusters_share_ipaddr(clusters, unknowns) - }); - - let new_unknowns: Vec = unknowns - .iter() - .filter_map(|cid| { - if suspicious.contains(cid) || benign.contains(cid) { - None - } else { - Some(*cid) - } - }) - .collect(); - - let mixed: HashSet = suspicious.intersection(&benign).copied().collect(); - candidates.by = QualifiedBy::IPaddr; - candidates.clusters.insert( - Qualifier::Mixed, - mixed.iter().copied().collect::>(), - ); - candidates.clusters.insert( - Qualifier::Suspicious, - suspicious - .difference(&mixed) - .copied() - .collect::>(), - ); - candidates.clusters.insert( - Qualifier::Benign, - benign - .difference(&mixed) - .copied() - .collect::>(), - ); - candidates.clusters.insert(Qualifier::Unknown, new_unknowns); +fn parse_pattern_id(pattern_id: Option<&str>) -> (u32, u32) { + let mut tidb_id: TidbId = 0; + let mut rule_id: RuleId = 0; + if let Some(id) = pattern_id { + let s: Vec<_> = id.split(':').collect(); + if let Some(v) = s.first() { + if let Ok(vv) = v.parse::() { + tidb_id = vv; + } + } + if let Some(v) = s.get(1) { + if let Ok(vv) = v.parse::() { + rule_id = vv; } } - - candidates } + (tidb_id, rule_id) } diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 0000000..8896126 --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,155 @@ +use percent_encoding::percent_decode_str; + +const OPTION_URL_DECODE: bool = false; +const OPTION_EXCLUDE_NUMERIC: bool = true; +const OPTION_REMOVE_DUPLICATES: bool = false; +// const OPTION_REMOVE_URL_ENCODE: bool = true; +const OPTION_TO_LOWERCASE: bool = true; +const OPTION_TOKEN_MIN_LENGTH: usize = 3; +const OPTION_REMOVE_HEXCODE: bool = true; +const OPTION_HEXCODE_MIN_LENGTH: usize = 20; +const OPTION_REMOVE_DOT_DIGIT: bool = true; + +// characters treated as token +const TOKEN_CHARS: [char; 4] = ['.', '_', '-', '@']; + +/* +const TOKEN_DELIMITERS: [char; 28] = [ + '/', '?', '&', '=', '[', ']', '\'', ';', ')', '(', '+', '*', ',', '<', '>', '\\', '\"', '|', + '~', '{', '}', ':', '\t', '#', '!', ' ', '`', '$', +]; +*/ + +#[must_use] +pub fn extract_tokens(s: &str) -> Vec { + let mut pairs: Vec<(usize, usize)> = Vec::new(); + let mut begin: usize; + let mut end: usize; + let mut eof: bool = false; + + let mut chs = s.char_indices(); + loop { + begin = 0; + end = 0; + + loop { + if let Some((idx, c)) = chs.next() { + if c.is_alphanumeric() || TOKEN_CHARS.contains(&c) { + begin = idx; + break; + } + continue; + } + eof = true; + break; + } + + if !eof { + loop { + if let Some((idx, c)) = chs.next() { + end = idx; + if c.is_alphanumeric() || TOKEN_CHARS.contains(&c) { + continue; + } + break; + } + eof = true; + break; + } + } + + if begin < end { + if eof { + pairs.push((begin, end + 1)); + } else { + pairs.push((begin, end)); + } + } /* else if s.len() > start { + pairs.push((start, s.len())); + }*/ + + if eof { + break; + } + } + + let mut v: Vec = Vec::new(); + for (x, y) in &pairs { + if let Some(s) = s.get(*x..*y) { + let mut token = s.trim().to_string(); + if OPTION_URL_DECODE && token.contains('%') { + token = percent_decode_str(&token).decode_utf8_lossy().to_string(); + } + + if OPTION_EXCLUDE_NUMERIC && check_numeric(s) { + continue; + } + + if OPTION_TO_LOWERCASE { + token = token.to_lowercase(); + } + + if OPTION_REMOVE_DUPLICATES && v.contains(&token) { + continue; + } + + if token.len() < OPTION_TOKEN_MIN_LENGTH { + continue; + } + + if OPTION_REMOVE_HEXCODE && check_hexdigit(s) && (*y - *x) >= OPTION_HEXCODE_MIN_LENGTH + { + continue; + } + + if OPTION_REMOVE_DOT_DIGIT && check_dotdigit(s) { + continue; + } + + // TODO: + // - remove leading and trailing dot(.) + + v.push(token); + } + } + v +} + +fn check_numeric(x: &str) -> bool { + let mut ch = x.chars(); + loop { + if let Some(c) = ch.next() { + if c.is_numeric() { + continue; + } + return false; + } + return true; + } +} + +fn check_hexdigit(x: &str) -> bool { + let mut ch = x.chars(); + loop { + if let Some(c) = ch.next() { + if c.is_ascii_hexdigit() { + continue; + } + return false; + } + return true; + } +} + +fn check_dotdigit(x: &str) -> bool { + let mut ch = x.chars(); + loop { + if let Some(c) = ch.next() { + if c.is_numeric() || c == '.' { + continue; + } + return false; + } + return true; + } +} diff --git a/src/threat_description.rs b/src/threat_description.rs deleted file mode 100644 index 864df94..0000000 --- a/src/threat_description.rs +++ /dev/null @@ -1,242 +0,0 @@ -use crate::labels::{Label, Pattern}; -use crate::weblog_parser::{token_occurrences, tokenize_normal_log}; -use crate::{ubold, Modified, PatternType}; - -use ansi_term::Style; -use anyhow::{anyhow, Result}; -use glob::glob; -use serde::{Deserialize, Serialize}; -use std::fmt; -use std::{fs::File, io::BufReader, path::Path}; - -pub trait Load -where - for<'de> Self: Deserialize<'de> + Sized, -{ - /// # Errors - /// - /// Will return `Err` if it failed to open the specified file, - /// like read permission error or file no exist - fn from_path + fmt::Display>(path: P) -> Result { - let file = File::open(&path)?; - let cfg = serde_json::from_reader(BufReader::new(file))?; - Ok(cfg) - } -} - -pub trait Save -where - Self: Serialize, -{ - /// # Errors - /// - /// Will return `Err` if it failed to save the specified file, - /// like write permission error or file no exist - fn save + fmt::Display>(&self, path: P) -> Result<()> { - let file = File::create(&path)?; - if let Err(e) = self.serialize(&mut serde_json::Serializer::pretty(file)) { - Err(anyhow!(format!("failed to save {}: {}", path, e))) - } else { - Ok(()) - } - } -} - -#[derive(Serialize, Deserialize, Debug, Default)] -pub struct TdDb { - #[serde(alias = "Version", rename(serialize = "Version"))] - version: String, - #[serde( - alias = "Threat Descriptions", - rename(serialize = "Threat Descriptions") - )] - threatdescriptions: Vec, -} - -impl Load for TdDb {} -impl Save for TdDb {} -impl TdDb { - #[must_use] - pub fn new(version: String, threatdescriptions: Vec) -> Self { - TdDb { - version, - threatdescriptions, - } - } - - #[must_use] - pub fn labels_from_tdb(&self) -> Vec