diff --git a/.gitignore b/.gitignore
index a4d2b56..d2a6756 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 # Build dir
+/examples/target
 /target
 
 # Dev env configs
diff --git a/Cargo.lock b/Cargo.lock
index 26cca91..9f01345 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,11 +2,58 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "equivalent"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
+
+[[package]]
+name = "hashbrown"
+version = "0.15.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
+
+[[package]]
+name = "indexmap"
+version = "2.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f"
+dependencies = [
+ "equivalent",
+ "hashbrown",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
+
 [[package]]
 name = "log-surgeon"
 version = "0.0.1"
 dependencies = [
  "regex-syntax",
+ "serde_yaml",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
+dependencies = [
+ "proc-macro2",
 ]
 
 [[package]]
@@ -14,3 +61,65 @@ name = "regex-syntax"
 version = "0.8.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
+
+[[package]]
+name = "ryu"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
+
+[[package]]
+name = "serde"
+version = "1.0.216"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.216"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_yaml"
+version = "0.9.34+deprecated"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
+dependencies = [
+ "indexmap",
+ "itoa",
+ "ryu",
+ "serde",
+ "unsafe-libyaml",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.90"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
+
+[[package]]
+name = "unsafe-libyaml"
+version = "0.2.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
diff --git a/Cargo.toml b/Cargo.toml
index 6f6d0fd..2336504 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,3 +5,4 @@ edition = "2021"
 
 [dependencies]
 regex-syntax = "0.8.5"
+serde_yaml = "0.9.34"
diff --git a/examples/Cargo.lock b/examples/Cargo.lock
new file mode 100644
index 0000000..6cb9d1e
--- /dev/null
+++ b/examples/Cargo.lock
@@ -0,0 +1,139 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "equivalent"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
+
+[[package]]
+name = "hashbrown"
+version = "0.15.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
+
+[[package]]
+name = "indexmap"
+version = "2.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f"
+dependencies = [
+ "equivalent",
+ "hashbrown",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
+
+[[package]]
+name = "log"
+version = "0.4.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
+
+[[package]]
+name = "log-surgeon"
+version = "0.0.1"
+dependencies = [
+ "log",
+ "regex-syntax",
+ "serde_yaml",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
+
+[[package]]
+name = "ryu"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
+
+[[package]]
+name = "sample_program"
+version = "0.1.0"
+dependencies = [
+ "log-surgeon",
+]
+
+[[package]]
+name = "serde"
+version = "1.0.216"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.216"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_yaml"
+version = "0.9.34+deprecated"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
+dependencies = [
+ "indexmap",
+ "itoa",
+ "ryu",
+ "serde",
+ "unsafe-libyaml",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.90"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
+
+[[package]]
+name = "unsafe-libyaml"
+version = "0.2.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
diff --git a/examples/Cargo.toml b/examples/Cargo.toml
new file mode 100644
index 0000000..b814742
--- /dev/null
+++ b/examples/Cargo.toml
@@ -0,0 +1,7 @@
+[package]
+name = "sample_program"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+log-surgeon = { path = ".." }
diff --git a/examples/logs/simple.log b/examples/logs/simple.log
new file mode 100644
index 0000000..904049f
--- /dev/null
+++ b/examples/logs/simple.log
@@ -0,0 +1,8 @@
+This log event doesn't have a timestamp
+TIMESTAMP Id: 3190; This is a
+multi-line log event with unicode: 这是一个有多行的日志
+TIMESTAMP Id: 0; This is a multi-line log event. I will pay
+you 1000 dollars to test this file.
+TIMESTAMP Id: 0; This is a variable=0
+TIMESTAMP Id: 0; But this is:0
+TIMESTAMP Variable with delimiter: a b a b a a a a
diff --git a/examples/schema.yaml b/examples/schema.yaml
new file mode 100644
index 0000000..42b415f
--- /dev/null
+++ b/examples/schema.yaml
@@ -0,0 +1,15 @@
+timestamp:
+  # E.g. 2015-01-31T15:50:45.392
+  - '\d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}'
+  # E.g. 2015-01-31T15:50:45,392
+  - '\d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2},\d{3}'
+  # E.g. 2015-01-31 15:50:45
+  - '\d{4}\-\d{2}\-\d{2} \d{2}:\d{2}:\d{2}'
+
+delimiters: " \t\r\n:,!;%"
+
+variables:
+  int: '\-{0,1}\d+'
+  float: '\-{0,1}[0-9]+\.[0-9]+'
+  hex: '(0x){0,1}([0-9a-f]+)|([0-9A-F]+)'
+  loglevel: '(INFO)|(DEBUG)|(WARN)|(ERROR)|(TRACE)|(FATAL)'
diff --git a/examples/schema_simple.yaml b/examples/schema_simple.yaml
new file mode 100644
index 0000000..81ae6a9
--- /dev/null
+++ b/examples/schema_simple.yaml
@@ -0,0 +1,8 @@
+timestamp:
+  - 'TIMESTAMP'
+
+delimiters: " \t\r\n:,!;%"
+
+variables:
+  int: '\-{0,1}\d+'
+  with_delimiter: 'a a'
diff --git a/examples/src/main.rs b/examples/src/main.rs
new file mode 100644
index 0000000..6a6dc0d
--- /dev/null
+++ b/examples/src/main.rs
@@ -0,0 +1,24 @@
+use log_surgeon::error_handling::Result;
+use log_surgeon::parser::SchemaConfig;
+use log_surgeon::log_parser::LogEvent;
+use log_surgeon::log_parser::LogParser;
+
+use std::rc::Rc;
+
+fn main() -> Result<()> {
+    let project_root = env!("CARGO_MANIFEST_DIR");
+    let schema_path = std::path::Path::new(project_root).join("schema_simple.yaml");
+    let log_path = std::path::Path::new(project_root)
+        .join("logs")
+        .join("simple.log");
+
+    let parsed_schema = Rc::new(SchemaConfig::parse_from_file(schema_path.to_str().unwrap())?);
+    let mut log_parser = LogParser::new(parsed_schema.clone())?;
+    log_parser.set_input_file(log_path.to_str().unwrap())?;
+
+    while let Some(log_event) = log_parser.parse_next_log_event()? {
+        println!("{:?}", log_event);
+    }
+
+    Ok(())
+}
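
The schema files above are consumed by the new `SchemaConfig` parser added later in this diff (src/parser/schema_parser/parser.rs). As a quick orientation, here is a minimal sketch of loading a schema from an in-memory string instead of a file; the YAML literal is illustrative, not part of this change:

```rust
use log_surgeon::error_handling::Result;
use log_surgeon::parser::SchemaConfig;

fn main() -> Result<()> {
    // Same shape as examples/schema_simple.yaml: all three keys are required.
    let yaml = r#"
timestamp:
  - 'TIMESTAMP'

delimiters: " \t\r\n:,!;%"

variables:
  int: '\-{0,1}\d+'
"#;
    let schema = SchemaConfig::parse_from_str(yaml)?;
    assert_eq!(schema.get_ts_schemas().len(), 1);
    assert_eq!(schema.get_var_schemas().len(), 1);
    assert!(schema.has_delimiter(' ')); // '\n' is always treated as a delimiter
    Ok(())
}
```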
diff --git a/src/dfa/dfa.rs b/src/dfa/dfa.rs
index 359dc51..0cf8986 100644
--- a/src/dfa/dfa.rs
+++ b/src/dfa/dfa.rs
@@ -1,10 +1,11 @@
 use crate::nfa::nfa::NFA;
 use std::collections::{HashMap, HashSet};
+use std::fmt::Debug;
 use std::hash::Hash;
 use std::rc::Rc;
 
 #[derive(Clone, Debug, Eq, Hash, PartialEq)]
-struct State(usize);
+pub struct State(usize);
 
 enum Tag {
     Start(usize),
@@ -18,6 +19,31 @@ struct Transition {
     tag: Option<Tag>,
 }
 
+impl Debug for Transition {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        if 0 == self.symbol_onehot_encoding {
+            return write!(
+                f,
+                "{:?} -> {:?}, symbol: {}",
+                self.from_state, self.to_state, "epsilon"
+            );
+        }
+
+        let mut char_vec: Vec<char> = Vec::new();
+        for i in 0..128u8 {
+            let mask = 1u128 << i;
+            if mask & self.symbol_onehot_encoding == mask {
+                char_vec.push(i as char);
+            }
+        }
+        write!(
+            f,
+            "{:?} -> {:?}, symbol: {:?}",
+            self.from_state, self.to_state, char_vec
+        )
+    }
+}
+
 pub(crate) struct DFA {
     start: State,
     accept: Vec<State>,
@@ -26,6 +52,29 @@ pub(crate) struct DFA {
     dfa_to_accepted_nfa_state_mapping: Vec<Option<usize>>, // to determine which NFA gets matched
 }
 
+impl Debug for DFA {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "DFA( start: {:?}, accept: {:?}, states: {:?}, transitions: {{\n",
+            self.start, self.accept, self.states
+        )?;
+
+        for state in &self.states {
+            let state_idx = state.0;
+            if self.transitions[state_idx].is_empty() {
+                continue;
+            }
+            write!(f, "\t{:?}:\n", state)?;
+            for (_, transition) in &self.transitions[state_idx] {
+                write!(f, "\t\t{:?}\n", transition)?;
+            }
+        }
+
+        write!(f, "}} )")
+    }
+}
+
 pub(crate) struct DfaSimulator {
     dfa: Rc<DFA>,
     current_state: State,
@@ -167,7 +216,29 @@ impl DFA {
 }
 
 impl DFA {
-    fn from_multiple_nfas(nfas: Vec<NFA>) -> DFA {
+    pub fn get_next_state(&self, state: State, c: u8) -> Option<State> {
+        // No bound check
+        let transitions = &self.transitions[state.0];
+        let mask = 1u128 << c;
+        for (transition_symbol, transition) in transitions.iter() {
+            if mask & transition_symbol == mask {
+                return Some(transition.to_state.clone());
+            }
+        }
+        None
+    }
+
+    pub fn is_accept_state(&self, state: State) -> Option<usize> {
+        self.get_accept_nfa_state(state.0)
+    }
+
+    pub fn get_root(&self) -> State {
+        self.start.clone()
+    }
+}
+
+impl DFA {
+    pub fn from_multiple_nfas(nfas: Vec<NFA>) -> DFA {
         // All of the nodes now have a pair of identifiers,
         // 1. the NFA index within the list of NFAs
         // 2. the NFA state index within the NFA
@@ -266,7 +337,6 @@ impl DFA {
             if !l_nfa_states_to_dfa_mapping.contains_key(&destination_nfa_states) {
                 // We need to add a new state to the DFA
                 let destination_dfa_state_idx = dfa_states.len();
-                println!("Inserting State {}", destination_dfa_state_idx);
                 dfa_states.push(State(destination_dfa_state_idx));
                 dfa_transitions.push(HashMap::new());
@@ -306,21 +376,21 @@
 }
 
 impl DfaSimulator {
-    fn new(dfa: Rc<DFA>) -> Self {
+    pub fn new(dfa: Rc<DFA>) -> Self {
         DfaSimulator {
             dfa: dfa.clone(),
             current_state: dfa.start.clone(),
         }
     }
 
-    fn reset_simulation(&mut self) {
+    pub fn reset_simulation(&mut self) {
         self.current_state = self.dfa.start.clone();
     }
 
     // Simulate the DFA with a single character
     // Returns the next state and whether the current state is a valid state
     // invalid state means that the DFA has reached a dead end
-    fn simulate_single_char(&mut self, input: char) -> (Option<State>, bool) {
+    pub fn simulate_single_char(&mut self, input: char) -> (Option<State>, bool) {
         let transitions = self.dfa.transitions.get(self.current_state.0);
 
         if transitions.is_none() {
@@ -580,4 +650,115 @@
 
         Ok(())
     }
+
+    #[test]
+    fn test_int() -> Result<()> {
+        let mut parser = RegexParser::new();
+        let parsed_ast = parser.parse_into_ast(r"\-{0,1}\d+")?;
+
+        let mut nfa = NFA::new();
+        nfa.add_ast_to_nfa(&parsed_ast, NFA::START_STATE, NFA::ACCEPT_STATE)?;
+
+        let dfa = DFA::from_multiple_nfas(vec![nfa]);
+
+        assert_eq!(dfa.simulate("0"), (Some(0usize), true));
+        assert_eq!(dfa.simulate("1234"), (Some(0usize), true));
+        assert_eq!(dfa.simulate("-1234"), (Some(0usize), true));
+        assert_eq!(dfa.simulate("-0"), (Some(0usize), true));
+        assert_eq!(dfa.simulate("aba"), (None, false));
+        assert_eq!(dfa.simulate(""), (None, false));
+        assert_eq!(dfa.simulate("3.14"), (None, false));
+        assert_eq!(dfa.simulate("0.00"), (None, false));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_float() -> Result<()> {
+        let mut parser = RegexParser::new();
+        let parsed_ast = parser.parse_into_ast(r"\-{0,1}[0-9]+\.\d+")?;
+
+        let mut nfa = NFA::new();
+        nfa.add_ast_to_nfa(&parsed_ast, NFA::START_STATE, NFA::ACCEPT_STATE)?;
+
+        let dfa = DFA::from_multiple_nfas(vec![nfa]);
+
+        assert_eq!(dfa.simulate("0.0"), (Some(0usize), true));
+        assert_eq!(dfa.simulate("-0.0"), (Some(0usize), true));
+        assert_eq!(dfa.simulate("-0.00001"), (Some(0usize), true));
+        assert_eq!(dfa.simulate("0.00001"), (Some(0usize), true));
+        assert_eq!(dfa.simulate("3.1415926"), (Some(0usize), true));
+        assert_eq!(dfa.simulate("-3.1415926"), (Some(0usize), true));
+
+        assert_eq!(dfa.simulate("0"), (None, false));
+        assert_eq!(dfa.simulate("1234"), (None, false));
+        assert_eq!(dfa.simulate("-1234"), (None, false));
+        assert_eq!(dfa.simulate("-0"), (None, false));
+        assert_eq!(dfa.simulate("aba"), (None, false));
+        assert_eq!(dfa.simulate(""), (None, false));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_hex() -> Result<()> {
+        let mut parser = RegexParser::new();
+        let parsed_ast = parser.parse_into_ast(r"(0x){0,1}(((\d|[a-f])+)|((\d|[A-F])+))")?;
+
+        let mut nfa = NFA::new();
+        nfa.add_ast_to_nfa(&parsed_ast, NFA::START_STATE, NFA::ACCEPT_STATE)?;
+        println!("{:?}", nfa);
+
+        let dfa = DFA::from_multiple_nfas(vec![nfa]);
+        println!("{:?}", dfa);
+
+        assert_eq!(dfa.simulate("0x0"), (Some(0usize), true));
+        assert_eq!(dfa.simulate("0"), (Some(0usize), true));
+        assert_eq!(dfa.simulate("1234"), (Some(0usize), true));
+        assert_eq!(dfa.simulate("0x1A2B3C4D5E6F7890"), (Some(0usize), true));
+        assert_eq!(dfa.simulate("0x1a2b3c4d5e6f7890"), (Some(0usize), true));
+        assert_eq!(dfa.simulate("1a2b3c4d5e6f7890"), (Some(0usize), true));
+        assert_eq!(dfa.simulate("abcdef"), (Some(0usize), true));
+        assert_eq!(dfa.simulate("abcdefg"), (None, false));
+        assert_eq!(dfa.simulate("aBa"), (None, false));
+        assert_eq!(dfa.simulate(""), (None, false));
+        assert_eq!(dfa.simulate("3.14"), (None, false));
+        assert_eq!(dfa.simulate("0.00"), (None, false));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_timestamp() -> Result<()> {
+        let mut parser = RegexParser::new();
+        let parsed_ast = parser.parse_into_ast(r"\d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}")?;
+
+        let mut nfa = NFA::new();
+        nfa.add_ast_to_nfa(&parsed_ast, NFA::START_STATE, NFA::ACCEPT_STATE)?;
+        println!("{:?}", nfa);
+
+        let dfa = DFA::from_multiple_nfas(vec![nfa]);
+        println!("{:?}", dfa);
+
+        assert_eq!(dfa.simulate("2015-01-31T15:50:45.39"), (Some(0usize), true));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_static_text() -> Result<()> {
+        let mut parser = RegexParser::new();
+        let parsed_ast = parser.parse_into_ast(r"TIMESTAMP")?;
+
+        let mut nfa = NFA::new();
+        nfa.add_ast_to_nfa(&parsed_ast, NFA::START_STATE, NFA::ACCEPT_STATE)?;
+        println!("{:?}", nfa);
+
+        let dfa = DFA::from_multiple_nfas(vec![nfa]);
+        println!("{:?}", dfa);
+
+        assert_eq!(dfa.simulate("TIMESTAMP"), (Some(0usize), true));
+
+        Ok(())
+    }
 }
diff --git a/src/dfa/mod.rs b/src/dfa/mod.rs
index 4cccab2..e7f5131 100644
--- a/src/dfa/mod.rs
+++ b/src/dfa/mod.rs
@@ -1 +1,5 @@
 mod dfa;
+
+pub(crate) use dfa::DfaSimulator;
+pub(crate) use dfa::State;
+pub(crate) use dfa::DFA;
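
Throughout the DFA (and NFA) code in this diff, a transition's symbol set is stored as a `u128` one-hot encoding over the 7-bit ASCII range: bit `i` is set iff the character with code point `i` is in the set, which is what `get_next_state` and the `Debug` impls above test with `1u128 << c`. A standalone sketch of the idea (the helper names here are hypothetical, not from this diff):

```rust
// Build a one-hot u128 mask over ASCII for a set of symbols.
fn onehot(symbols: &[char]) -> u128 {
    let mut encoding = 0u128;
    for &c in symbols {
        assert!(c.is_ascii(), "the one-hot range only covers ASCII");
        encoding |= 1u128 << (c as u8); // one bit per ASCII code point
    }
    encoding
}

// Membership test, mirroring `mask & symbol_onehot_encoding == mask` above.
fn accepts(encoding: u128, c: char) -> bool {
    c.is_ascii() && (encoding & (1u128 << (c as u8))) != 0
}

fn main() {
    let digits: Vec<char> = ('0'..='9').collect();
    let digit_mask = onehot(&digits);
    assert!(accepts(digit_mask, '7'));
    assert!(!accepts(digit_mask, 'a'));
}
```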
assert_eq!(dfa.simulate("1a2b3c4d5e6f7890"), (Some(0usize), true)); + assert_eq!(dfa.simulate("abcdef"), (Some(0usize), true)); + assert_eq!(dfa.simulate("abcdefg"), (None, false)); + assert_eq!(dfa.simulate("aBa"), (None, false)); + assert_eq!(dfa.simulate(""), (None, false)); + assert_eq!(dfa.simulate("3.14"), (None, false)); + assert_eq!(dfa.simulate("0.00"), (None, false)); + + Ok(()) + } + + #[test] + fn test_timestamp() -> Result<()> { + let mut parser = RegexParser::new(); + let parsed_ast = parser.parse_into_ast(r"\d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}")?; + + let mut nfa = NFA::new(); + nfa.add_ast_to_nfa(&parsed_ast, NFA::START_STATE, NFA::ACCEPT_STATE)?; + println!("{:?}", nfa); + + let dfa = DFA::from_multiple_nfas(vec![nfa]); + println!("{:?}", dfa); + + assert_eq!(dfa.simulate("2015-01-31T15:50:45.39"), (Some(0usize), true)); + + Ok(()) + } + + #[test] + fn test_static_text() -> Result<()> { + let mut parser = RegexParser::new(); + let parsed_ast = parser.parse_into_ast(r"TIMESTAMP")?; + + let mut nfa = NFA::new(); + nfa.add_ast_to_nfa(&parsed_ast, NFA::START_STATE, NFA::ACCEPT_STATE)?; + println!("{:?}", nfa); + + let dfa = DFA::from_multiple_nfas(vec![nfa]); + println!("{:?}", dfa); + + assert_eq!(dfa.simulate("TIMESTAMP"), (Some(0usize), true)); + + Ok(()) + } } diff --git a/src/dfa/mod.rs b/src/dfa/mod.rs index 4cccab2..e7f5131 100644 --- a/src/dfa/mod.rs +++ b/src/dfa/mod.rs @@ -1 +1,5 @@ mod dfa; + +pub(crate) use dfa::DfaSimulator; +pub(crate) use dfa::State; +pub(crate) use dfa::DFA; diff --git a/src/error_handling/error.rs b/src/error_handling/error.rs index 4d3bf94..36163d5 100644 --- a/src/error_handling/error.rs +++ b/src/error_handling/error.rs @@ -3,6 +3,8 @@ use regex_syntax::ast; #[derive(Debug)] pub enum Error { RegexParsingError(ast::Error), + YamlParsingError(serde_yaml::Error), + IOError(std::io::Error), UnsupportedAstNodeType(&'static str), NoneASCIICharacters, NegationNotSupported(&'static str), @@ -10,6 +12,12 @@ pub enum Error { UnsupportedAstBracketedKind, UnsupportedClassSetType, UnsupportedGroupKindType, + MissingSchemaKey(&'static str), + LexerInputStreamNotSet, + LexerStateUnknown, + LexerInternalErr(&'static str), + LogParserInternalErr(&'static str), + InvalidSchema, } pub type Result = std::result::Result; diff --git a/src/lexer/lexer.rs b/src/lexer/lexer.rs new file mode 100644 index 0000000..02f79d9 --- /dev/null +++ b/src/lexer/lexer.rs @@ -0,0 +1,408 @@ +use crate::dfa::{State, DFA}; +use crate::error_handling::Error::{LexerInputStreamNotSet, LexerInternalErr, LexerStateUnknown}; +use crate::error_handling::Result; +use crate::lexer::LexerStream; +use crate::nfa::nfa::NFA; +use crate::parser::SchemaConfig; +use std::collections::VecDeque; +use std::fmt::Debug; +use std::rc::Rc; + +enum LexerState { + SeekingToTheNextDelimiter, + HandleDelimiter, + DFANotAccepted, + DFAAccepted, + VarExtract, + ParsingTimestamp, + EndOfStream, +} + +pub struct Lexer { + schema_config: Rc, + ts_dfa: DFA, + var_dfa: DFA, + + state: LexerState, + dfa_state: State, + + input_stream: Option>, + buf: Vec, + buf_cursor_pos: usize, + token_queue: VecDeque, + + last_delimiter: Option, + last_tokenized_pos: usize, + match_start_pos: usize, + match_end_pos: usize, + line_num: usize, +} + +#[derive(Clone, Debug)] +pub enum TokenType { + Timestamp(usize), + Variable(usize), + StaticText, + StaticTextWithEndLine, + End, +} + +pub struct Token { + val: String, + token_type: TokenType, + line_num: usize, +} + +impl Debug for Token { + fn fmt(&self, f: &mut 
diff --git a/src/lexer/lexer.rs b/src/lexer/lexer.rs
new file mode 100644
index 0000000..02f79d9
--- /dev/null
+++ b/src/lexer/lexer.rs
@@ -0,0 +1,408 @@
+use crate::dfa::{State, DFA};
+use crate::error_handling::Error::{LexerInputStreamNotSet, LexerInternalErr};
+use crate::error_handling::Result;
+use crate::lexer::LexerStream;
+use crate::nfa::nfa::NFA;
+use crate::parser::SchemaConfig;
+use std::collections::VecDeque;
+use std::fmt::Debug;
+use std::rc::Rc;
+
+enum LexerState {
+    SeekingToTheNextDelimiter,
+    HandleDelimiter,
+    DFANotAccepted,
+    DFAAccepted,
+    VarExtract,
+    ParsingTimestamp,
+    EndOfStream,
+}
+
+pub struct Lexer {
+    schema_config: Rc<SchemaConfig>,
+    ts_dfa: DFA,
+    var_dfa: DFA,
+
+    state: LexerState,
+    dfa_state: State,
+
+    input_stream: Option<Box<dyn LexerStream>>,
+    buf: Vec<char>,
+    buf_cursor_pos: usize,
+    token_queue: VecDeque<Token>,
+
+    last_delimiter: Option<char>,
+    last_tokenized_pos: usize,
+    match_start_pos: usize,
+    match_end_pos: usize,
+    line_num: usize,
+}
+
+#[derive(Clone, Debug)]
+pub enum TokenType {
+    Timestamp(usize),
+    Variable(usize),
+    StaticText,
+    StaticTextWithEndLine,
+    End,
+}
+
+pub struct Token {
+    val: String,
+    token_type: TokenType,
+    line_num: usize,
+}
+
+impl Debug for Token {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "[{:?}|{}]: \"{}\"", self.token_type, self.line_num, self.val.escape_default())
+    }
+}
+
+impl Token {
+    pub fn get_val(&self) -> &str {
+        self.val.as_str()
+    }
+
+    pub fn get_token_type(&self) -> TokenType {
+        self.token_type.clone()
+    }
+
+    pub fn get_line_num(&self) -> usize {
+        self.line_num
+    }
+}
+
+impl Lexer {
+    const MIN_BUF_GARBAGE_COLLECTION_SIZE: usize = 4096;
+
+    pub fn new(schema_mgr: Rc<SchemaConfig>) -> Result<Self> {
+        let mut ts_nfas: Vec<NFA> = Vec::new();
+        for schema in schema_mgr.get_ts_schemas() {
+            let mut nfa = NFA::new();
+            nfa.add_ast_to_nfa(schema.get_ast(), nfa.get_start(), nfa.get_accept())?;
+            ts_nfas.push(nfa);
+        }
+        let ts_dfa = DFA::from_multiple_nfas(ts_nfas);
+
+        let mut var_nfas: Vec<NFA> = Vec::new();
+        for schema in schema_mgr.get_var_schemas() {
+            let mut nfa = NFA::new();
+            nfa.add_ast_to_nfa(schema.get_ast(), nfa.get_start(), nfa.get_accept())?;
+            var_nfas.push(nfa);
+        }
+        let var_dfa = DFA::from_multiple_nfas(var_nfas);
+        let var_dfa_root = var_dfa.get_root();
+
+        Ok(Self {
+            schema_config: schema_mgr,
+            ts_dfa,
+            var_dfa,
+            state: LexerState::ParsingTimestamp,
+            dfa_state: var_dfa_root,
+            input_stream: None,
+            buf: Vec::new(),
+            buf_cursor_pos: 0,
+            token_queue: VecDeque::new(),
+            last_delimiter: None,
+            last_tokenized_pos: 0,
+            match_start_pos: 0,
+            match_end_pos: 0,
+            line_num: 0,
+        })
+    }
+
+    fn reset(&mut self) {
+        self.input_stream = None;
+        self.buf.clear();
+        self.buf_cursor_pos = 0;
+        self.token_queue.clear();
+        self.last_delimiter = None;
+        self.last_tokenized_pos = 0;
+        self.match_start_pos = 0;
+        self.match_end_pos = 0;
+        self.line_num = 0;
+        self.state = LexerState::ParsingTimestamp;
+    }
+
+    pub fn set_input_stream(&mut self, input_stream: Box<dyn LexerStream>) {
+        self.reset();
+        self.input_stream = Some(input_stream);
+        self.state = LexerState::ParsingTimestamp;
+    }
+
+    pub fn get_next_token(&mut self) -> Result<Option<Token>> {
+        if self.input_stream.is_none() {
+            return Err(LexerInputStreamNotSet);
+        }
+        if self.token_queue.is_empty() {
+            self.fill_token_queue()?;
+        }
+        Ok(self.token_queue.pop_front())
+    }
+
+    fn fill_token_queue(&mut self) -> Result<()> {
+        loop {
+            match self.state {
+                LexerState::SeekingToTheNextDelimiter => match self.get_next_char_from_buffer()? {
+                    Some(c) => {
+                        if self.schema_config.has_delimiter(c) {
+                            self.last_delimiter = Some(c);
+                            self.state = LexerState::HandleDelimiter;
+                        }
+                    }
+                    None => {
+                        self.state = LexerState::EndOfStream;
+                    }
+                },
+
+                LexerState::HandleDelimiter => {
+                    if self.last_delimiter.is_none() {
+                        return Err(LexerInternalErr("Delimiter not set"));
+                    }
+
+                    let delimiter = self.last_delimiter.unwrap();
+                    self.last_delimiter = None;
+                    match delimiter {
+                        '\n' => {
+                            self.generate_token(
+                                self.buf_cursor_pos,
+                                TokenType::StaticTextWithEndLine,
+                            )?;
+                            self.line_num += 1;
+                            self.state = LexerState::ParsingTimestamp;
+                        }
+                        _ => self.proceed_to_var_dfa_simulation(),
+                    }
+                }
+
+                LexerState::ParsingTimestamp => {
+                    if self.try_parse_timestamp()? {
+                        self.state = LexerState::SeekingToTheNextDelimiter;
+                    } else {
+                        self.proceed_to_var_dfa_simulation();
+                    }
+                }
+
+                LexerState::DFANotAccepted => match self.get_next_char_from_buffer()? {
+                    Some(c) => {
+                        self.simulate_var_dfa_and_set_lexer_state(c, LexerState::HandleDelimiter)
+                    }
+                    None => self.state = LexerState::EndOfStream,
+                },
+
+                LexerState::DFAAccepted => {
+                    // Set match end (exclusive to the matched position)
+                    self.match_end_pos = self.buf_cursor_pos;
+                    match self.get_next_char_from_buffer()? {
+                        Some(c) => {
+                            self.simulate_var_dfa_and_set_lexer_state(c, LexerState::VarExtract)
+                        }
+                        None => self.state = LexerState::VarExtract,
+                    }
+                }
+
+                LexerState::VarExtract => {
+                    if self.match_start_pos >= self.match_end_pos {
+                        return Err(LexerInternalErr("Match end positions corrupted"));
+                    }
+                    if self.last_tokenized_pos > self.buf_cursor_pos {
+                        return Err(LexerInternalErr("Match start position corrupted"));
+                    }
+
+                    // Extract static text (if any)
+                    if self.match_start_pos != self.last_tokenized_pos {
+                        self.generate_token(self.match_start_pos, TokenType::StaticText)?;
+                    }
+
+                    // Extract variable
+                    match self.var_dfa.is_accept_state(self.dfa_state.clone()) {
+                        Some(schema_id) => {
+                            assert_eq!(self.match_start_pos, self.last_tokenized_pos);
+                            self.generate_token(
+                                self.match_end_pos,
+                                TokenType::Variable(schema_id),
+                            )?;
+                        }
+                        None => {
+                            return Err(LexerInternalErr(
+                                "DFA state doesn't stop in an accepted state",
+                            ))
+                        }
+                    }
+
+                    match self.last_delimiter {
+                        Some(_) => self.state = LexerState::HandleDelimiter,
+                        None => self.state = LexerState::EndOfStream,
+                    }
+                }
+
+                LexerState::EndOfStream => {
+                    if self.buf_cursor_pos > self.last_tokenized_pos {
+                        let token_type = if self.last_delimiter.is_some()
+                            && self.last_delimiter.unwrap() == '\n'
+                        {
+                            // TODO: This seems not possible..
+                            TokenType::StaticTextWithEndLine
+                        } else {
+                            TokenType::StaticText
+                        };
+                        self.generate_token(self.buf_cursor_pos, token_type)?;
+                    }
+                    break;
+                }
+            }
+
+            if false == self.token_queue.is_empty() {
+                break;
+            }
+        }
+
+        self.buffer_garbage_collection();
+        Ok(())
+    }
+
+    fn try_parse_timestamp(&mut self) -> Result<bool> {
+        let buf_cursor_pos_bookmark = self.buf_cursor_pos;
+        if buf_cursor_pos_bookmark != self.last_tokenized_pos {
+            return Err(LexerInternalErr("Timestamp parsing corrupted"));
+        }
+        let mut curr_dfa_state = self.ts_dfa.get_root();
+
+        // (Timestamp schema ID, position)
+        let mut last_matched: Option<(usize, usize)> = None;
+
+        loop {
+            let optional_c = self.get_next_char_from_buffer()?;
+            if optional_c.is_none() {
+                break;
+            }
+
+            let c = optional_c.unwrap();
+            if false == c.is_ascii() {
+                break;
+            }
+
+            let optional_next_state = self.ts_dfa.get_next_state(curr_dfa_state.clone(), c as u8);
+            if optional_next_state.is_none() {
+                break;
+            }
+            curr_dfa_state = optional_next_state.unwrap();
+
+            match self.ts_dfa.is_accept_state(curr_dfa_state.clone()) {
+                Some(ts_schema_id) => last_matched = Some((ts_schema_id, self.buf_cursor_pos)),
+                None => {}
+            }
+        }
+
+        match last_matched {
+            Some((ts_schema_id, pos)) => {
+                self.generate_token(pos, TokenType::Timestamp(ts_schema_id))?;
+                self.buf_cursor_pos = pos;
+                Ok(true)
+            }
+            None => {
+                self.buf_cursor_pos = buf_cursor_pos_bookmark;
+                Ok(false)
+            }
+        }
+    }
+
+    fn get_next_char_from_buffer(&mut self) -> Result<Option<char>> {
+        let pos = self.buf_cursor_pos;
+        if pos == self.buf.len() {
+            match self
+                .input_stream
+                .as_mut()
+                .unwrap()
+                .as_mut()
+                .get_next_char()?
+            {
+                Some(c) => self.buf.push(c),
+                None => return Ok(None),
+            }
+        }
+        let pos = self.get_and_increment_buf_cursor_pos();
+        Ok(Some(self.buf[pos]))
+    }
+
+    fn capture_delimiter(&mut self, c: char) -> bool {
+        if self.schema_config.has_delimiter(c) {
+            self.last_delimiter = Some(c);
+            return true;
+        }
+        false
+    }
+
+    fn simulate_var_dfa_and_set_lexer_state(&mut self, c: char, delimiter_dst_state: LexerState) {
+        if false == c.is_ascii() {
+            self.state = LexerState::SeekingToTheNextDelimiter;
+            return;
+        }
+        match self.var_dfa.get_next_state(self.dfa_state.clone(), c as u8) {
+            Some(next_dfa_state) => {
+                self.dfa_state = next_dfa_state;
+                match self.var_dfa.is_accept_state(self.dfa_state.clone()) {
+                    Some(_) => self.state = LexerState::DFAAccepted,
+                    None => self.state = LexerState::DFANotAccepted,
+                }
+            }
+            None => {
+                self.state = if self.capture_delimiter(c) {
+                    delimiter_dst_state
+                } else {
+                    LexerState::SeekingToTheNextDelimiter
+                };
+            }
+        }
+    }
+
+    fn proceed_to_var_dfa_simulation(&mut self) {
+        self.match_start_pos = self.buf_cursor_pos;
+        self.dfa_state = self.var_dfa.get_root();
+        self.state = LexerState::DFANotAccepted;
+    }
+
+    fn generate_token(&mut self, end_pos: usize, token_type: TokenType) -> Result<()> {
+        if end_pos <= self.last_tokenized_pos {
+            return Err(LexerInternalErr("Tokenization end position corrupted"));
+        }
+        self.token_queue.push_back(Token {
+            val: self.buf[self.last_tokenized_pos..end_pos].iter().collect(),
+            line_num: self.line_num,
+            token_type,
+        });
+        self.last_tokenized_pos = end_pos;
+        Ok(())
+    }
+
+    fn get_and_increment_buf_cursor_pos(&mut self) -> usize {
+        let curr_pos = self.buf_cursor_pos;
+        self.buf_cursor_pos += 1;
+        curr_pos
+    }
+
+    fn set_buf_cursor_pos(&mut self, pos: usize) {
+        self.buf_cursor_pos = pos;
+    }
+
+    fn buffer_garbage_collection(&mut self) {
+        if self.last_tokenized_pos <= self.buf.len() / 2
+            || self.last_tokenized_pos <= Self::MIN_BUF_GARBAGE_COLLECTION_SIZE
+        {
+            return;
+        }
+
+        let mut dst_idx = 0usize;
+        let mut src_idx = self.last_tokenized_pos;
+        while src_idx < self.buf.len() {
+            self.buf[dst_idx] = self.buf[src_idx];
+            dst_idx += 1;
+            src_idx += 1;
+        }
+        self.buf.resize(dst_idx, 0 as char);
+        self.buf_cursor_pos -= self.last_tokenized_pos;
+        self.last_tokenized_pos = 0;
+        // No need to reset match_start/end
+    }
+}
diff --git a/src/lexer/lexer_stream.rs b/src/lexer/lexer_stream.rs
new file mode 100644
index 0000000..f7ab114
--- /dev/null
+++ b/src/lexer/lexer_stream.rs
@@ -0,0 +1,5 @@
+use crate::error_handling::Result;
+
+pub trait LexerStream {
+    fn get_next_char(&mut self) -> Result<Option<char>>;
+}
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
new file mode 100644
index 0000000..598dde8
--- /dev/null
+++ b/src/lexer/mod.rs
@@ -0,0 +1,9 @@
+mod lexer;
+mod lexer_stream;
+mod streams;
+
+pub use lexer::Lexer;
+pub use lexer::Token;
+pub use lexer::TokenType;
+pub use lexer_stream::LexerStream;
+pub use streams::BufferedFileStream;
diff --git a/src/lexer/streams.rs b/src/lexer/streams.rs
new file mode 100644
index 0000000..f1e66d1
--- /dev/null
+++ b/src/lexer/streams.rs
@@ -0,0 +1,49 @@
+use super::lexer_stream::LexerStream;
+use crate::error_handling::Error::IOError;
+use crate::error_handling::Result;
+use std::io::BufRead;
+
+pub struct BufferedFileStream {
+    line_it: std::io::Lines<std::io::BufReader<std::fs::File>>,
+    line: Option<Vec<char>>,
+    pos: usize,
+}
+
+impl BufferedFileStream {
+    pub fn new(path: &str) -> Result<Self> {
+        match std::fs::File::open(path) {
+            Ok(file) => Ok(Self {
+                line_it: std::io::BufReader::new(file).lines(),
+                line: None,
+                pos: 0,
+            }),
+            Err(e) => Err(IOError(e)),
+        }
+    }
+}
+
+impl LexerStream for BufferedFileStream {
+    fn get_next_char(&mut self) -> Result<Option<char>> {
+        if self.line.is_none() {
+            let next_line = self.line_it.next();
+            if next_line.is_none() {
+                return Ok(None);
+            }
+            match next_line.unwrap() {
+                Ok(line) => {
+                    self.line = Some(line.chars().collect());
+                    self.line.as_mut().unwrap().push('\n');
+                    self.pos = 0;
+                }
+                Err(e) => return Err(IOError(e)),
+            }
+        }
+
+        let c = self.line.as_ref().unwrap()[self.pos];
+        self.pos += 1;
+        if self.pos == self.line.as_ref().unwrap().len() {
+            self.line = None;
+        }
+        Ok(Some(c))
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 6321be9..295bfe4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,7 @@
 mod dfa;
-mod error_handling;
+pub mod error_handling;
+pub mod lexer;
+pub mod log_parser;
 mod nfa;
 
 pub mod parser;
diff --git a/src/log_parser/log_parser.rs b/src/log_parser/log_parser.rs
new file mode 100644
index 0000000..805226c
--- /dev/null
+++ b/src/log_parser/log_parser.rs
@@ -0,0 +1,145 @@
+use std::fmt::Debug;
+use crate::error_handling::Error::LogParserInternalErr;
+use crate::error_handling::Result;
+use crate::lexer::BufferedFileStream;
+use crate::lexer::LexerStream;
+use crate::lexer::{Lexer, Token, TokenType};
+use crate::parser::SchemaConfig;
+use std::rc::Rc;
+
+pub struct LogParser {
+    lexer: Lexer,
+    schema_config: Rc<SchemaConfig>,
+    tokens: Option<Vec<Token>>,
+}
+
+pub struct LogEvent {
+    tokens: Vec<Token>,
+    line_range: (usize, usize),
+    has_timestamp: bool,
+    schema_config: Rc<SchemaConfig>,
+}
+
+impl LogParser {
+    pub fn new(schema_config: Rc<SchemaConfig>) -> Result<Self> {
+        let lexer = Lexer::new(schema_config.clone())?;
+        Ok(Self {
+            lexer,
+            schema_config,
+            tokens: Some(Vec::new()),
+        })
+    }
+
+    pub fn set_input_file(&mut self, path: &str) -> Result<()> {
+        self.tokens = Some(Vec::new());
+        let buffered_file_stream = Box::new(BufferedFileStream::new(path)?);
+        self.set_input_stream(buffered_file_stream)
+    }
+
+    pub fn set_input_stream(&mut self, input_stream: Box<dyn LexerStream>) -> Result<()> {
+        self.lexer.set_input_stream(input_stream);
+        Ok(())
+    }
+
+    pub fn parse_next_log_event(&mut self) -> Result<Option<LogEvent>> {
+        loop {
+            match self.lexer.get_next_token()? {
+                Some(token) => match token.get_token_type() {
+                    TokenType::Timestamp(_) => {
+                        if self.tokens.is_none() {
+                            self.buffer_token(token);
+                            continue;
+                        }
+                        let log_event = self.emit_buffered_tokens_as_log_event()?;
+                        self.buffer_token(token);
+                        // If the buffer was empty (e.g. the stream begins with a
+                        // timestamp), there is no event to emit yet; keep lexing.
+                        if log_event.is_some() {
+                            return Ok(log_event);
+                        }
+                    }
+                    _ => self.buffer_token(token),
+                },
+                None => break,
+            }
+        }
+        self.emit_buffered_tokens_as_log_event()
+    }
+
+    fn buffer_token(&mut self, token: Token) {
+        if self.tokens.is_none() {
+            self.tokens = Some(Vec::new());
+        }
+        self.tokens.as_mut().unwrap().push(token);
+    }
+
+    fn emit_buffered_tokens_as_log_event(&mut self) -> Result<Option<LogEvent>> {
+        match &self.tokens {
+            // An empty buffer yields no event rather than an internal error.
+            Some(tokens) if tokens.is_empty() => Ok(None),
+            Some(_) => {
+                let tokens = self.tokens.take().unwrap();
+                LogEvent::new(self.schema_config.clone(), tokens)
+            }
+            None => Ok(None),
+        }
+    }
+}
+
+impl LogEvent {
+    fn new(schema_config: Rc<SchemaConfig>, tokens: Vec<Token>) -> Result<Option<Self>> {
+        if tokens.is_empty() {
+            return Err(LogParserInternalErr("The given token vector is empty"));
+        }
+        let has_timestamp = match tokens.first().unwrap().get_token_type() {
+            TokenType::Timestamp(_) => true,
+            _ => false,
+        };
+        let line_range = (
+            tokens.first().unwrap().get_line_num(),
+            tokens.last().unwrap().get_line_num(),
+        );
+        Ok(Some(Self {
+            tokens,
+            line_range,
+            has_timestamp,
+            schema_config,
+        }))
+    }
+
+    pub fn get_timestamp_token(&self) -> Option<&Token> {
+        match self.has_timestamp {
+            true => Some(&self.tokens[0]),
+            false => None,
+        }
+    }
+
+    pub fn get_line_range(&self) -> (usize, usize) {
+        self.line_range
+    }
+
+    pub fn get_log_message_tokens(&self) -> &[Token] {
+        match self.has_timestamp {
+            true => &self.tokens[1..],
+            false => &self.tokens[..],
+        }
+    }
+}
+
+impl Debug for LogEvent {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let mut result = String::new();
+        match self.get_timestamp_token() {
+            Some(ts_token) => result += format!("Timestamp:\n\t{:?}\n", ts_token).as_str(),
+            None => result += "Timestamp:\n\tNONE\n",
+        }
+
+        let (mut curr_line_num, _) = self.get_line_range();
+        result += format!("Line {}:\n", curr_line_num).as_str();
+        for token in self.get_log_message_tokens() {
+            if token.get_line_num() != curr_line_num {
+                curr_line_num = token.get_line_num();
+                result += format!("Line {}:\n", curr_line_num).as_str();
+            }
+            result += format!("\t{:?}\n", token).as_str();
+        }
+
+        write!(f, "{}", result)
+    }
+}
diff --git a/src/log_parser/mod.rs b/src/log_parser/mod.rs
new file mode 100644
index 0000000..08947fb
--- /dev/null
+++ b/src/log_parser/mod.rs
@@ -0,0 +1,4 @@
+mod log_parser;
+
+pub use log_parser::LogParser;
+pub use log_parser::LogEvent;
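
Beyond the Debug output used in examples/src/main.rs, downstream code can dispatch on `TokenType` to separate variables from static text in a parsed `LogEvent`. A sketch against the API above (the function itself is illustrative, not part of this diff):

```rust
use log_surgeon::error_handling::Result;
use log_surgeon::lexer::TokenType;
use log_surgeon::log_parser::LogParser;
use log_surgeon::parser::SchemaConfig;
use std::rc::Rc;

/// Counts how many variable tokens appear in a log file.
fn count_variables(schema_path: &str, log_path: &str) -> Result<usize> {
    let schema = Rc::new(SchemaConfig::parse_from_file(schema_path)?);
    let mut parser = LogParser::new(schema)?;
    parser.set_input_file(log_path)?;

    let mut num_vars = 0usize;
    while let Some(log_event) = parser.parse_next_log_event()? {
        for token in log_event.get_log_message_tokens() {
            // `Variable` carries the index of the matching variable schema.
            if let TokenType::Variable(_schema_id) = token.get_token_type() {
                num_vars += 1;
            }
        }
    }
    Ok(num_vars)
}
```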
diff --git a/src/nfa/nfa.rs b/src/nfa/nfa.rs
index 4c96f39..056c4c7 100644
--- a/src/nfa/nfa.rs
+++ b/src/nfa/nfa.rs
@@ -35,10 +35,21 @@ pub struct Transition {
 
 impl Debug for Transition {
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        if 0 == self.symbol_onehot_encoding {
+            return write!(f, "{:?} -> {:?}, symbol: {}", self.from, self.to, "epsilon");
+        }
+
+        let mut char_vec: Vec<char> = Vec::new();
+        for i in 0..128u8 {
+            let mask = 1u128 << i;
+            if mask & self.symbol_onehot_encoding == mask {
+                char_vec.push(i as char);
+            }
+        }
         write!(
             f,
             "{:?} -> {:?}, symbol: {:?}",
-            self.from, self.to, self.symbol_onehot_encoding
+            self.from, self.to, char_vec
         )
     }
 }
@@ -364,12 +375,17 @@ impl Debug for NFA {
             "NFA( start: {:?}, accept: {:?}, states: {:?}, transitions: {{\n",
             self.start, self.accept, self.states
         )?;
-        for (state, transitions) in &self.transitions {
+
+        for state in &self.states {
+            if false == self.transitions.contains_key(state) {
+                continue;
+            }
             write!(f, "\t{:?}:\n", state)?;
-            for transition in transitions {
+            for transition in self.transitions.get(state).unwrap() {
                 write!(f, "\t\t{:?}\n", transition)?;
             }
         }
+
         write!(f, "}} )")
     }
 }
@@ -1071,7 +1087,7 @@ mod tests {
     #[test]
     fn test_floating_point_regex() -> Result<()> {
         let mut parser = RegexParser::new();
-        let parsed_ast = parser.parse_into_ast(r"\-{0,1}[0-9]+\.[0-9]+")?;
+        let parsed_ast = parser.parse_into_ast(r"\-{0,1}[0-9]+\.\d+")?;
 
         let mut nfa = NFA::new();
         nfa.add_ast_to_nfa(&parsed_ast, NFA::START_STATE, NFA::ACCEPT_STATE)?;
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 146fa7a..4dfa70a 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -1 +1,7 @@
 pub(crate) mod regex_parser;
+
+mod schema_parser;
+
+pub use schema_parser::parser::SchemaConfig;
+pub use schema_parser::parser::TimestampSchema;
+pub use schema_parser::parser::VarSchema;
diff --git a/src/parser/schema_parser/mod.rs b/src/parser/schema_parser/mod.rs
new file mode 100644
index 0000000..67c567f
--- /dev/null
+++ b/src/parser/schema_parser/mod.rs
@@ -0,0 +1 @@
+pub mod parser;
diff --git a/src/parser/schema_parser/parser.rs b/src/parser/schema_parser/parser.rs
new file mode 100644
index 0000000..851e907
--- /dev/null
+++ b/src/parser/schema_parser/parser.rs
@@ -0,0 +1,198 @@
+use crate::error_handling::Error::{
+    IOError, InvalidSchema, MissingSchemaKey, NoneASCIICharacters, YamlParsingError,
+};
+use crate::error_handling::Result;
+use crate::parser::regex_parser::parser::RegexParser;
+use regex_syntax::ast::Ast;
+use serde_yaml::Value;
+use std::collections::HashMap;
+use std::io::Read;
+
+pub struct TimestampSchema {
+    regex: String,
+    ast: Ast,
+}
+
+impl TimestampSchema {
+    pub fn new(regex: String) -> Result<Self> {
+        let mut regex_parser = RegexParser::new();
+        let ast = regex_parser.parse_into_ast(regex.as_str())?;
+        Ok(Self { regex, ast })
+    }
+
+    pub fn get_regex(&self) -> &str {
+        &self.regex
+    }
+
+    pub fn get_ast(&self) -> &Ast {
+        &self.ast
+    }
+}
+
+pub struct VarSchema {
+    pub name: String,
+    pub regex: String,
+    pub ast: Ast,
+}
+
+impl VarSchema {
+    pub fn new(name: String, regex: String) -> Result<Self> {
+        let mut regex_parser = RegexParser::new();
+        let ast = regex_parser.parse_into_ast(regex.as_str())?;
+        Ok(Self { name, regex, ast })
+    }
+
+    pub fn get_name(&self) -> &str {
+        &self.name
+    }
+
+    pub fn get_regex(&self) -> &str {
+        &self.regex
+    }
+
+    pub fn get_ast(&self) -> &Ast {
+        &self.ast
+    }
+}
+
+pub struct SchemaConfig {
+    ts_schemas: Vec<TimestampSchema>,
+    var_schemas: Vec<VarSchema>,
+    delimiters: [bool; 128],
+}
+
+impl SchemaConfig {
+    pub fn get_ts_schemas(&self) -> &Vec<TimestampSchema> {
+        &self.ts_schemas
+    }
+
+    pub fn get_var_schemas(&self) -> &Vec<VarSchema> {
+        &self.var_schemas
+    }
+
+    pub fn has_delimiter(&self, delimiter: char) -> bool {
+        if false == delimiter.is_ascii() {
+            return false;
+        }
+        self.delimiters[delimiter as usize]
+    }
+}
+
+impl SchemaConfig {
+    const TIMESTAMP_KEY: &'static str = "timestamp";
+    const VAR_KEY: &'static str = "variables";
+    const DELIMITER_KEY: &'static str = "delimiters";
+
+    pub fn parse_from_str(yaml_content: &str) -> Result<Self> {
+        match Self::load_kv_pairs_from_yaml_content(yaml_content) {
+            Ok(kv_pairs) => Self::load_from_kv_pairs(kv_pairs),
+            Err(e) => Err(YamlParsingError(e)),
+        }
+    }
+
+    pub fn parse_from_file(yaml_file_path: &str) -> Result<Self> {
+        match std::fs::File::open(yaml_file_path) {
+            Ok(mut file) => {
+                let mut contents = String::new();
+                if let Err(e) = file.read_to_string(&mut contents) {
+                    return Err(IOError(e));
+                }
+                Self::parse_from_str(contents.as_str())
+            }
+            Err(e) => Err(IOError(e)),
+        }
+    }
+
+    fn get_key_value<'a>(
+        kv_map: &'a HashMap<String, Value>,
+        key: &'static str,
+    ) -> Result<&'a Value> {
+        kv_map.get(key).ok_or_else(|| MissingSchemaKey(key))
+    }
+
+    fn load_kv_pairs_from_yaml_content(
+        yaml_content: &str,
+    ) -> serde_yaml::Result<HashMap<String, Value>> {
+        let kv_map_result: HashMap<String, Value> = serde_yaml::from_str(yaml_content)?;
+        Ok(kv_map_result)
+    }
+
+    fn load_from_kv_pairs(kv_pairs: HashMap<String, Value>) -> Result<Self> {
+        // Handle timestamps
+        let mut ts_schemas: Vec<TimestampSchema> = Vec::new();
+        let timestamps = Self::get_key_value(&kv_pairs, Self::TIMESTAMP_KEY)?;
+        if let Value::Sequence(sequence) = timestamps {
+            sequence.iter().try_for_each(|val| {
+                if let Value::String(s) = val {
+                    ts_schemas.push(TimestampSchema::new(s.clone())?);
+                    Ok(())
+                } else {
+                    Err(InvalidSchema)
+                }
+            })?;
+        } else {
+            return Err(InvalidSchema);
+        }
+
+        // Handle variables
+        let mut var_schemas: Vec<VarSchema> = Vec::new();
+        let vars = Self::get_key_value(&kv_pairs, Self::VAR_KEY)?;
+        if let Value::Mapping(map) = vars {
+            for (key, value) in map {
+                match (key, value) {
+                    (Value::String(name), Value::String(regex)) => {
+                        var_schemas.push(VarSchema::new(name.clone(), regex.clone())?);
+                    }
+                    _ => return Err(InvalidSchema),
+                }
+            }
+        } else {
+            return Err(InvalidSchema);
+        }
+
+        // Handle delimiters
+        let mut delimiters = [false; 128];
+        let delimiter = Self::get_key_value(&kv_pairs, Self::DELIMITER_KEY)?;
+        if let Value::String(delimiter_str) = delimiter {
+            for c in delimiter_str.chars() {
+                if false == c.is_ascii() {
+                    return Err(NoneASCIICharacters);
+                }
+                delimiters[c as usize] = true;
+            }
+        } else {
+            return Err(InvalidSchema);
+        }
+        delimiters['\n' as usize] = true;
+
+        Ok(Self {
+            ts_schemas,
+            var_schemas,
+            delimiters,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_read_example_schema_file() -> Result<()> {
+        let project_root = env!("CARGO_MANIFEST_DIR");
+        let schema_path = std::path::Path::new(project_root)
+            .join("examples")
+            .join("schema.yaml");
+        let parsed_schema = SchemaConfig::parse_from_file(schema_path.to_str().unwrap())?;
+
+        assert_eq!(parsed_schema.get_ts_schemas().len(), 3);
+        assert_eq!(parsed_schema.get_var_schemas().len(), 4);
+
+        let delimiters: Vec<char> = vec![' ', '\t', '\n', '\r', ':', ',', '!', ';', '%'];
+        for delimiter in delimiters {
+            assert!(parsed_schema.has_delimiter(delimiter));
+        }
+
+        Ok(())
+    }
+}
diff --git a/tests/lexer_test.rs b/tests/lexer_test.rs
new file mode 100644
index 0000000..f0232e4
--- /dev/null
+++ b/tests/lexer_test.rs
@@ -0,0 +1,57 @@
+use log_surgeon::error_handling::Result;
+use log_surgeon::lexer::BufferedFileStream;
+use log_surgeon::lexer::Lexer;
+use log_surgeon::parser::SchemaConfig;
+
+use std::rc::Rc;
+use std::fs::File;
+use std::io::{self, BufRead};
+
+#[test]
+fn test_lexer_simple() -> Result<()> {
+    let project_root = env!("CARGO_MANIFEST_DIR");
+    let schema_path = std::path::Path::new(project_root)
+        .join("examples")
+        .join("schema_simple.yaml");
+    let log_path = std::path::Path::new(project_root)
+        .join("examples")
+        .join("logs")
+        .join("simple.log");
+
+    let parsed_schema = Rc::new(SchemaConfig::parse_from_file(schema_path.to_str().unwrap())?);
+    let mut lexer = Lexer::new(parsed_schema)?;
+    let buffered_file_stream = Box::new(BufferedFileStream::new(log_path.to_str().unwrap())?);
+    lexer.set_input_stream(buffered_file_stream);
+
+    let mut tokens = Vec::new();
+    while let Some(token) = lexer.get_next_token()? {
+        tokens.push(token);
+    }
+    assert_eq!(false, tokens.is_empty());
+
+    let mut parsed_lines = Vec::new();
+    let mut parsed_line = String::new();
+    let mut curr_line_num = 0usize;
+    for token in &tokens {
+        if curr_line_num != token.get_line_num() {
+            parsed_lines.push(parsed_line.clone());
+            parsed_line.clear();
+            curr_line_num += 1;
+        }
+        parsed_line += token.get_val();
+    }
+    parsed_lines.push(parsed_line.clone());
+
+    let mut expected_lines = Vec::new();
+    let reader = io::BufReader::new(File::open(log_path).expect("failed to open log file"));
+    for line in reader.lines() {
+        let line = line.expect("failed to read line");
+        expected_lines.push(line + "\n");
+    }
+
+    assert_eq!(parsed_lines.len(), expected_lines.len());
+    assert_eq!(false, parsed_line.is_empty());
+    assert_eq!(parsed_lines, expected_lines);
+
+    Ok(())
+}