diff --git a/examples/Cargo.lock b/examples/Cargo.lock
index bcdec4f..6cb9d1e 100644
--- a/examples/Cargo.lock
+++ b/examples/Cargo.lock
@@ -30,10 +30,17 @@ version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
 
+[[package]]
+name = "log"
+version = "0.4.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
+
 [[package]]
 name = "log-surgeon"
 version = "0.0.1"
 dependencies = [
+ "log",
  "regex-syntax",
  "serde_yaml",
 ]
diff --git a/examples/logs/simple.log b/examples/logs/simple.log
index 269c5f5..904049f 100644
--- a/examples/logs/simple.log
+++ b/examples/logs/simple.log
@@ -1,8 +1,8 @@
 This log event doesn't have a timestamp
-2015-01-31T15:50:45.392 Id: 3190; This is a
+TIMESTAMP Id: 3190; This is a
 multi-line log event with unicode: 这是一个有多行的日志
-2015-01-31T15:50:45.393 Id: 0; This is a multi-line log event. I will pay
+TIMESTAMP Id: 0; This is a multi-line log event. I will pay
 you 1000 dollars to test this file.
-2015-01-31T15:50:45.392 Id: 0; This is a variable=0
-2015-01-31T15:50:45.392 Id: 0; But this is:0
-2015-01-31T15:50:45.392 Variable with delimiter: a b a b a a a a
+TIMESTAMP Id: 0; This is a variable=0
+TIMESTAMP Id: 0; But this is:0
+TIMESTAMP Variable with delimiter: a b a b a a a a
diff --git a/examples/schema.yaml b/examples/schema.yaml
index d8a0bb3..42b415f 100644
--- a/examples/schema.yaml
+++ b/examples/schema.yaml
@@ -1,6 +1,6 @@
 timestamp:
   # E.g. 2015-01-31T15:50:45.392
-  - '\d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}.\d{3}'
+  - '\d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}'
   # E.g. 2015-01-31T15:50:45,392
   - '\d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2},\d{3}'
   # E.g. 2015-01-31 15:50:45
diff --git a/examples/schema_simple.yaml b/examples/schema_simple.yaml
index 97a29e4..81ae6a9 100644
--- a/examples/schema_simple.yaml
+++ b/examples/schema_simple.yaml
@@ -1,6 +1,5 @@
 timestamp:
-  # E.g. 2015-01-31T15:50:45.392
-  - '\d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}.\d{3}'
+  - 'TIMESTAMP'
 
 delimiters: " \t\r\n:,!;%"
diff --git a/examples/src/main.rs b/examples/src/main.rs
index ee23cb9..6a6dc0d 100644
--- a/examples/src/main.rs
+++ b/examples/src/main.rs
@@ -1,7 +1,9 @@
 use log_surgeon::error_handling::Result;
-use log_surgeon::lexer::BufferedFileStream;
-use log_surgeon::lexer::Lexer;
 use log_surgeon::parser::SchemaConfig;
+use log_surgeon::log_parser::LogEvent;
+use log_surgeon::log_parser::LogParser;
+
+use std::rc::Rc;
 
 fn main() -> Result<()> {
     let project_root = env!("CARGO_MANIFEST_DIR");
@@ -10,19 +12,12 @@ fn main() -> Result<()> {
         .join("logs")
         .join("simple.log");
 
-    let parsed_schema = SchemaConfig::parse_from_file(schema_path.to_str().unwrap())?;
-    let mut lexer = Lexer::new(&parsed_schema)?;
-    let buffered_file_stream = Box::new(BufferedFileStream::new(log_path.to_str().unwrap())?);
-    lexer.set_input_stream(buffered_file_stream);
-
-    let mut tokens = Vec::new();
-    while let Some(token) = lexer.get_next_token()? {
-        tokens.push(token);
-    }
-    assert_eq!(false, tokens.is_empty());
+    let parsed_schema = Rc::new(SchemaConfig::parse_from_file(schema_path.to_str().unwrap())?);
+    let mut log_parser = LogParser::new(parsed_schema.clone())?;
+    log_parser.set_input_file(log_path.to_str().unwrap())?;
 
-    for token in &tokens {
-        println!("{:?}", token);
+    while let Some(log_event) = log_parser.parse_next_log_event()? {
+        println!("{:?}", log_event);
     }
 
     Ok(())
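Note: a minimal sketch of driving the new `LogParser` API introduced by this patch. The function name and paths below are placeholders; the accessors (`get_line_range`, `get_timestamp_token`, `get_log_message_tokens`, `get_val`) are the ones added in `src/log_parser` and `src/lexer` further down.

```rust
use log_surgeon::error_handling::Result;
use log_surgeon::log_parser::LogParser;
use log_surgeon::parser::SchemaConfig;
use std::rc::Rc;

// Parse a log file into events and print a summary of each one.
fn print_events(schema_path: &str, log_path: &str) -> Result<()> {
    // The schema is shared between the parser and its internal lexer via Rc.
    let schema = Rc::new(SchemaConfig::parse_from_file(schema_path)?);
    let mut parser = LogParser::new(schema)?;
    parser.set_input_file(log_path)?;

    // Each event is a timestamped token group, or a leading group without
    // a timestamp (as in the first line of simple.log).
    while let Some(event) = parser.parse_next_log_event()? {
        let (first_line, last_line) = event.get_line_range();
        println!("event on lines {}..={}", first_line, last_line);
        if let Some(ts) = event.get_timestamp_token() {
            println!("  timestamp: {}", ts.get_val());
        }
        for token in event.get_log_message_tokens() {
            print!("{}", token.get_val());
        }
        println!();
    }
    Ok(())
}
```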
diff --git a/src/dfa/dfa.rs b/src/dfa/dfa.rs
index 21fe54e..0cf8986 100644
--- a/src/dfa/dfa.rs
+++ b/src/dfa/dfa.rs
@@ -727,4 +727,38 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_timestamp() -> Result<()> {
+        let mut parser = RegexParser::new();
+        let parsed_ast = parser.parse_into_ast(r"\d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}")?;
+
+        let mut nfa = NFA::new();
+        nfa.add_ast_to_nfa(&parsed_ast, NFA::START_STATE, NFA::ACCEPT_STATE)?;
+        println!("{:?}", nfa);
+
+        let dfa = DFA::from_multiple_nfas(vec![nfa]);
+        println!("{:?}", dfa);
+
+        assert_eq!(dfa.simulate("2015-01-31T15:50:45.392"), (Some(0usize), true));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_static_text() -> Result<()> {
+        let mut parser = RegexParser::new();
+        let parsed_ast = parser.parse_into_ast(r"TIMESTAMP")?;
+
+        let mut nfa = NFA::new();
+        nfa.add_ast_to_nfa(&parsed_ast, NFA::START_STATE, NFA::ACCEPT_STATE)?;
+        println!("{:?}", nfa);
+
+        let dfa = DFA::from_multiple_nfas(vec![nfa]);
+        println!("{:?}", dfa);
+
+        assert_eq!(dfa.simulate("TIMESTAMP"), (Some(0usize), true));
+
+        Ok(())
+    }
 }
diff --git a/src/error_handling/error.rs b/src/error_handling/error.rs
index 1a7c35d..36163d5 100644
--- a/src/error_handling/error.rs
+++ b/src/error_handling/error.rs
@@ -16,6 +16,7 @@ pub enum Error {
     LexerInputStreamNotSet,
     LexerStateUnknown,
     LexerInternalErr(&'static str),
+    LogParserInternalErr(&'static str),
     InvalidSchema,
 }
diff --git a/src/lexer/lexer.rs b/src/lexer/lexer.rs
index b7dbab8..02f79d9 100644
--- a/src/lexer/lexer.rs
+++ b/src/lexer/lexer.rs
@@ -5,7 +5,7 @@ use crate::lexer::LexerStream;
 use crate::nfa::nfa::NFA;
 use crate::parser::SchemaConfig;
 use std::collections::VecDeque;
-use std::ffi::c_int;
+use std::fmt::Debug;
 use std::rc::Rc;
 
 enum LexerState {
@@ -18,8 +18,8 @@ enum LexerState {
     EndOfStream,
 }
 
-pub struct Lexer<'a> {
-    schema_config: &'a SchemaConfig,
+pub struct Lexer {
+    schema_config: Rc<SchemaConfig>,
 
     ts_dfa: DFA,
     var_dfa: DFA,
@@ -38,8 +38,8 @@ pub struct Lexer<'a> {
     line_num: usize,
 }
 
-#[derive(Debug)]
-enum TokenType {
+#[derive(Clone, Debug)]
+pub enum TokenType {
     Timestamp(usize),
     Variable(usize),
     StaticText,
@@ -47,17 +47,36 @@ enum TokenType {
     End,
 }
 
-#[derive(Debug)]
 pub struct Token {
-    pub val: String,
-    pub token_type: TokenType,
-    pub line_num: usize,
+    val: String,
+    token_type: TokenType,
+    line_num: usize,
+}
+
+impl Debug for Token {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "[{:?}|{}]: \"{}\"", self.token_type, self.line_num, self.val.escape_default())
+    }
+}
+
+impl Token {
+    pub fn get_val(&self) -> &str {
+        self.val.as_str()
+    }
+
+    pub fn get_token_type(&self) -> TokenType {
+        self.token_type.clone()
+    }
+
+    pub fn get_line_num(&self) -> usize {
+        self.line_num
+    }
 }
 
-impl<'a> Lexer<'a> {
+impl Lexer {
     const MIN_BUF_GARBAGE_COLLECTION_SIZE: usize = 4096;
 
-    pub fn new(schema_mgr: &'a SchemaConfig) -> Result<Self> {
+    pub fn new(schema_mgr: Rc<SchemaConfig>) -> Result<Self> {
         let mut ts_nfas: Vec<NFA> = Vec::new();
         for schema in schema_mgr.get_ts_schemas() {
             let mut nfa = NFA::new();
@@ -270,7 +289,7 @@
             }
             curr_dfa_state = optional_next_state.unwrap();
 
-            match self.ts_dfa.is_accept_state(self.dfa_state.clone()) {
+            match self.ts_dfa.is_accept_state(curr_dfa_state.clone()) {
                 Some(ts_schema_id) => last_matched = Some((ts_schema_id, self.buf_cursor_pos)),
                 None => {}
             }
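Note: with `TokenType` now public and `Token` exposing accessors, the lexer remains usable on its own. A hedged sketch (the helper below is illustrative, not part of the patch):

```rust
use log_surgeon::error_handling::Result;
use log_surgeon::lexer::{BufferedFileStream, Lexer, TokenType};
use log_surgeon::parser::SchemaConfig;
use std::rc::Rc;

// Count the timestamp tokens the lexer produces for one input file.
fn count_timestamps(schema_path: &str, log_path: &str) -> Result<usize> {
    let schema = Rc::new(SchemaConfig::parse_from_file(schema_path)?);
    let mut lexer = Lexer::new(schema)?;
    lexer.set_input_stream(Box::new(BufferedFileStream::new(log_path)?));

    let mut count = 0;
    while let Some(token) = lexer.get_next_token()? {
        // get_token_type() hands out a copy, since TokenType is Clone.
        if let TokenType::Timestamp(_) = token.get_token_type() {
            count += 1;
        }
    }
    Ok(count)
}
```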
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 0c74aa7..598dde8 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -3,5 +3,7 @@ mod lexer_stream;
 mod streams;
 
 pub use lexer::Lexer;
+pub use lexer::Token;
+pub use lexer::TokenType;
 pub use lexer_stream::LexerStream;
 pub use streams::BufferedFileStream;
diff --git a/src/lib.rs b/src/lib.rs
index 8557bbb..295bfe4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,7 @@
 mod dfa;
 pub mod error_handling;
 pub mod lexer;
+pub mod log_parser;
 mod nfa;
 pub mod parser;
diff --git a/src/log_parser/log_parser.rs b/src/log_parser/log_parser.rs
new file mode 100644
index 0000000..805226c
--- /dev/null
+++ b/src/log_parser/log_parser.rs
@@ -0,0 +1,145 @@
+use crate::error_handling::Error::LogParserInternalErr;
+use crate::error_handling::Result;
+use crate::lexer::BufferedFileStream;
+use crate::lexer::LexerStream;
+use crate::lexer::{Lexer, Token, TokenType};
+use crate::parser::SchemaConfig;
+use std::fmt::Debug;
+use std::rc::Rc;
+
+pub struct LogParser {
+    lexer: Lexer,
+    schema_config: Rc<SchemaConfig>,
+    tokens: Option<Vec<Token>>,
+}
+
+pub struct LogEvent {
+    tokens: Vec<Token>,
+    line_range: (usize, usize),
+    has_timestamp: bool,
+    schema_config: Rc<SchemaConfig>,
+}
+
+impl LogParser {
+    pub fn new(schema_config: Rc<SchemaConfig>) -> Result<Self> {
+        let lexer = Lexer::new(schema_config.clone())?;
+        Ok(Self {
+            lexer,
+            schema_config,
+            tokens: Some(Vec::new()),
+        })
+    }
+
+    pub fn set_input_file(&mut self, path: &str) -> Result<()> {
+        self.tokens = Some(Vec::new());
+        let buffered_file_stream = Box::new(BufferedFileStream::new(path)?);
+        self.set_input_stream(buffered_file_stream)
+    }
+
+    pub fn set_input_stream(&mut self, input_stream: Box<dyn LexerStream>) -> Result<()> {
+        self.lexer.set_input_stream(input_stream);
+        Ok(())
+    }
+
+    pub fn parse_next_log_event(&mut self) -> Result<Option<LogEvent>> {
+        loop {
+            match self.lexer.get_next_token()? {
+                Some(token) => match token.get_token_type() {
+                    TokenType::Timestamp(_) => {
+                        // A timestamp opens a new event; flush whatever is
+                        // buffered first. An empty buffer (e.g. right after
+                        // the stream starts) has nothing to flush yet.
+                        if self.tokens.as_ref().map_or(true, |tokens| tokens.is_empty()) {
+                            self.buffer_token(token);
+                            continue;
+                        }
+                        let log_event = self.emit_buffered_tokens_as_log_event()?;
+                        self.buffer_token(token);
+                        return Ok(log_event);
+                    }
+                    _ => self.buffer_token(token),
+                },
+                None => break,
+            }
+        }
+        self.emit_buffered_tokens_as_log_event()
+    }
+
+    fn buffer_token(&mut self, token: Token) {
+        if self.tokens.is_none() {
+            self.tokens = Some(Vec::new());
+        }
+        self.tokens.as_mut().unwrap().push(token);
+    }
+
+    fn emit_buffered_tokens_as_log_event(&mut self) -> Result<Option<LogEvent>> {
+        // An empty (or absent) buffer yields no event rather than an error.
+        match self.tokens.take() {
+            Some(tokens) if !tokens.is_empty() => {
+                LogEvent::new(self.schema_config.clone(), tokens)
+            }
+            _ => Ok(None),
+        }
+    }
+}
+
+impl LogEvent {
+    fn new(schema_config: Rc<SchemaConfig>, tokens: Vec<Token>) -> Result<Option<Self>> {
+        if tokens.is_empty() {
+            return Err(LogParserInternalErr("The given token vector is empty"));
+        }
+        let has_timestamp = match tokens.first().unwrap().get_token_type() {
+            TokenType::Timestamp(_) => true,
+            _ => false,
+        };
+        let line_range = (
+            tokens.first().unwrap().get_line_num(),
+            tokens.last().unwrap().get_line_num(),
+        );
+        Ok(Some(Self {
+            tokens,
+            line_range,
+            has_timestamp,
+            schema_config,
+        }))
+    }
+
+    pub fn get_timestamp_token(&self) -> Option<&Token> {
+        match self.has_timestamp {
+            true => Some(&self.tokens[0]),
+            false => None,
+        }
+    }
+
+    pub fn get_line_range(&self) -> (usize, usize) {
+        self.line_range
+    }
+
+    pub fn get_log_message_tokens(&self) -> &[Token] {
+        match self.has_timestamp {
+            true => &self.tokens[1..],
+            false => &self.tokens[..],
+        }
+    }
+}
+
+impl Debug for LogEvent {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let mut result = String::new();
+        match self.get_timestamp_token() {
+            Some(ts_token) => result += format!("Timestamp:\n\t{:?}\n", ts_token).as_str(),
+            None => result += "Timestamp:\n\tNONE\n",
+        }
+
+        let (mut curr_line_num, _) = self.get_line_range();
+        result += format!("Line {}:\n", curr_line_num).as_str();
+        for token in self.get_log_message_tokens() {
+            if token.get_line_num() != curr_line_num {
+                curr_line_num = token.get_line_num();
+                result += format!("Line {}:\n", curr_line_num).as_str();
+            }
+            result += format!("\t{:?}\n", token).as_str();
+        }
+
+        write!(f, "{}", result)
+    }
+}
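Note: under the splitting rule above, every `Timestamp` token flushes the buffered event and starts the next one, so the patched `simple.log` should group into six events: the leading line without a timestamp, then one event per `TIMESTAMP` token, with continuation lines folded into the preceding event. A test-style sketch, assuming paths relative to the crate root; not part of the patch:

```rust
use log_surgeon::error_handling::Result;
use log_surgeon::log_parser::LogParser;
use log_surgeon::parser::SchemaConfig;
use std::rc::Rc;

#[test]
fn test_log_parser_event_grouping() -> Result<()> {
    // Paths are illustrative; the real tests resolve them via CARGO_MANIFEST_DIR.
    let schema = Rc::new(SchemaConfig::parse_from_file("examples/schema_simple.yaml")?);
    let mut parser = LogParser::new(schema)?;
    parser.set_input_file("examples/logs/simple.log")?;

    let mut events = Vec::new();
    while let Some(event) = parser.parse_next_log_event()? {
        events.push(event);
    }

    // One event per TIMESTAMP line, plus the leading line without one.
    assert_eq!(6, events.len());
    assert!(events[0].get_timestamp_token().is_none());
    assert!(events[1..].iter().all(|e| e.get_timestamp_token().is_some()));
    Ok(())
}
```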
diff --git a/src/log_parser/mod.rs b/src/log_parser/mod.rs
new file mode 100644
index 0000000..08947fb
--- /dev/null
+++ b/src/log_parser/mod.rs
@@ -0,0 +1,4 @@
+mod log_parser;
+
+pub use log_parser::LogParser;
+pub use log_parser::LogEvent;
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index dec559b..4dfa70a 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -1,6 +1,7 @@
 pub(crate) mod regex_parser;
 mod schema_parser;
+
 pub use schema_parser::parser::SchemaConfig;
 pub use schema_parser::parser::TimestampSchema;
 pub use schema_parser::parser::VarSchema;
diff --git a/tests/lexer_test.rs b/tests/lexer_test.rs
index 561bede..f0232e4 100644
--- a/tests/lexer_test.rs
+++ b/tests/lexer_test.rs
@@ -3,6 +3,7 @@ use log_surgeon::lexer::BufferedFileStream;
 use log_surgeon::lexer::Lexer;
 use log_surgeon::parser::SchemaConfig;
 
+use std::rc::Rc;
 use std::fs::File;
 use std::io::{self, BufRead};
@@ -17,8 +18,8 @@ fn test_lexer_simple() -> Result<()> {
         .join("logs")
         .join("simple.log");
 
-    let parsed_schema = SchemaConfig::parse_from_file(schema_path.to_str().unwrap())?;
-    let mut lexer = Lexer::new(&parsed_schema)?;
+    let parsed_schema = Rc::new(SchemaConfig::parse_from_file(schema_path.to_str().unwrap())?);
+    let mut lexer = Lexer::new(parsed_schema)?;
     let buffered_file_stream = Box::new(BufferedFileStream::new(log_path.to_str().unwrap())?);
     lexer.set_input_stream(buffered_file_stream);
@@ -32,12 +33,12 @@ fn test_lexer_simple() -> Result<()> {
     let mut parsed_line = String::new();
     let mut curr_line_num = 0usize;
     for token in &tokens {
-        if curr_line_num != token.line_num {
+        if curr_line_num != token.get_line_num() {
             parsed_lines.push(parsed_line.clone());
             parsed_line.clear();
             curr_line_num += 1;
         }
-        parsed_line += &token.val.to_string();
+        parsed_line += token.get_val();
     }
     parsed_lines.push(parsed_line.clone());
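Note: the updated assertions rebuild each input line by concatenating token values grouped by `get_line_num()`. A standalone helper with the same shape, for illustration only (it tolerates line-number jumps, which the test above does not need to):

```rust
use log_surgeon::lexer::Token;

// Reassemble per-line text from a token stream, mirroring the test above.
// Assumes token line numbers start at 0 and are non-decreasing.
fn lines_from_tokens(tokens: &[Token]) -> Vec<String> {
    let mut lines = Vec::new();
    let mut current = String::new();
    let mut current_line = 0usize;
    for token in tokens {
        if token.get_line_num() != current_line {
            lines.push(std::mem::take(&mut current));
            current_line = token.get_line_num();
        }
        current += token.get_val();
    }
    lines.push(current);
    lines
}
```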