Skip to content

Commit

Permalink
Implement log parser
Browse files Browse the repository at this point in the history
  • Loading branch information
LinZhihao-723 committed Dec 14, 2024
1 parent 299c5ee commit 8452f2d
Show file tree
Hide file tree
Showing 14 changed files with 247 additions and 38 deletions.
7 changes: 7 additions & 0 deletions examples/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 5 additions & 5 deletions examples/logs/simple.log
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
This log event doesn't have a timestamp
2015-01-31T15:50:45.392 Id: 3190; This is a
TIMESTAMP Id: 3190; This is a
multi-line log event with unicode: 这是一个有多行的日志
2015-01-31T15:50:45.393 Id: 0; This is a multi-line log event. I will pay
TIMESTAMP Id: 0; This is a multi-line log event. I will pay
you 1000 dollars to test this file.
2015-01-31T15:50:45.392 Id: 0; This is a variable=0
2015-01-31T15:50:45.392 Id: 0; But this is:0
2015-01-31T15:50:45.392 Variable with delimiter: a b a b a a a a
TIMESTAMP Id: 0; This is a variable=0
TIMESTAMP Id: 0; But this is:0
TIMESTAMP Variable with delimiter: a b a b a a a a
2 changes: 1 addition & 1 deletion examples/schema.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
timestamp:
# E.g. 2015-01-31T15:50:45.392
- '\d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}.\d{3}'
- '\d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}'
# E.g. 2015-01-31T15:50:45,392
- '\d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2},\d{3}'
# E.g. 2015-01-31 15:50:45
Expand Down
3 changes: 1 addition & 2 deletions examples/schema_simple.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
timestamp:
# E.g. 2015-01-31T15:50:45.392
- '\d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}.\d{3}'
- 'TIMESTAMP'

delimiters: " \t\r\n:,!;%"

Expand Down
23 changes: 9 additions & 14 deletions examples/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
use log_surgeon::error_handling::Result;
use log_surgeon::lexer::BufferedFileStream;
use log_surgeon::lexer::Lexer;
use log_surgeon::parser::SchemaConfig;
use log_surgeon::log_parser::LogEvent;
use log_surgeon::log_parser::LogParser;

use std::rc::Rc;

fn main() -> Result<()> {
let project_root = env!("CARGO_MANIFEST_DIR");
Expand All @@ -10,19 +12,12 @@ fn main() -> Result<()> {
.join("logs")
.join("simple.log");

let parsed_schema = SchemaConfig::parse_from_file(schema_path.to_str().unwrap())?;
let mut lexer = Lexer::new(&parsed_schema)?;
let buffered_file_stream = Box::new(BufferedFileStream::new(log_path.to_str().unwrap())?);
lexer.set_input_stream(buffered_file_stream);

let mut tokens = Vec::new();
while let Some(token) = lexer.get_next_token()? {
tokens.push(token);
}
assert_eq!(false, tokens.is_empty());
let parsed_schema = Rc::new(SchemaConfig::parse_from_file(schema_path.to_str().unwrap())?);
let mut log_parser = LogParser::new(parsed_schema.clone())?;
log_parser.set_input_file(log_path.to_str().unwrap())?;

for token in &tokens {
println!("{:?}", token);
while let Some(log_event) = log_parser.parse_next_log_event()? {
println!("{:?}", log_event);
}

Ok(())
Expand Down
34 changes: 34 additions & 0 deletions src/dfa/dfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -727,4 +727,38 @@ mod tests {

Ok(())
}

#[test]
fn test_timestamp() -> Result<()> {
// Compiles the millisecond-precision timestamp pattern
// (e.g. "2015-01-31T15:50:45.392") through the regex -> NFA -> DFA
// pipeline and simulates the DFA on a sample input.
let mut parser = RegexParser::new();
let parsed_ast = parser.parse_into_ast(r"\d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}")?;

let mut nfa = NFA::new();
nfa.add_ast_to_nfa(&parsed_ast, NFA::START_STATE, NFA::ACCEPT_STATE)?;
println!("{:?}", nfa);

let dfa = DFA::from_multiple_nfas(vec![nfa]);
println!("{:?}", dfa);

// NOTE(review): the input ends in ".39" (two digits) while the pattern
// requires three (`\.\d{3}`), yet the expected result is (Some(0), true).
// Presumably `simulate` returns the matched NFA/schema id plus a flag for
// a partial/prefix match -- confirm the intended tuple semantics.
assert_eq!(dfa.simulate("2015-01-31T15:50:45.39"), (Some(0usize), true));

Ok(())
}

#[test]
fn test_static_text() -> Result<()> {
    // Verify that a literal (variable-free) pattern survives the
    // regex -> NFA -> DFA pipeline and matches itself exactly.
    let mut regex_parser = RegexParser::new();
    let ast = regex_parser.parse_into_ast(r"TIMESTAMP")?;

    let mut literal_nfa = NFA::new();
    literal_nfa.add_ast_to_nfa(&ast, NFA::START_STATE, NFA::ACCEPT_STATE)?;
    println!("{:?}", literal_nfa);

    let dfa = DFA::from_multiple_nfas(vec![literal_nfa]);
    println!("{:?}", dfa);

    assert_eq!((Some(0usize), true), dfa.simulate("TIMESTAMP"));

    Ok(())
}
}
1 change: 1 addition & 0 deletions src/error_handling/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ pub enum Error {
LexerInputStreamNotSet,
LexerStateUnknown,
LexerInternalErr(&'static str),
LogParserInternalErr(&'static str),
InvalidSchema,
}

Expand Down
43 changes: 31 additions & 12 deletions src/lexer/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::lexer::LexerStream;
use crate::nfa::nfa::NFA;
use crate::parser::SchemaConfig;
use std::collections::VecDeque;
use std::ffi::c_int;
use std::fmt::Debug;
use std::rc::Rc;

enum LexerState {
Expand All @@ -18,8 +18,8 @@ enum LexerState {
EndOfStream,
}

pub struct Lexer<'a> {
schema_config: &'a SchemaConfig,
pub struct Lexer {
schema_config: Rc<SchemaConfig>,
ts_dfa: DFA,
var_dfa: DFA,

Expand All @@ -38,26 +38,45 @@ pub struct Lexer<'a> {
line_num: usize,
}

#[derive(Debug)]
enum TokenType {
#[derive(Clone, Debug)]
pub enum TokenType {
// A matched timestamp; the usize is presumably the index of the matching
// timestamp schema -- TODO confirm against the lexer's timestamp DFA.
Timestamp(usize),
// A matched variable; the usize is presumably the matching variable
// schema's id -- TODO confirm against the lexer's variable DFA.
Variable(usize),
// Static (non-variable) text.
StaticText,
// Static text that ends a line.
StaticTextWithEndLine,
// NOTE(review): looks like an end-of-stream marker -- confirm.
End,
}

#[derive(Debug)]
pub struct Token {
pub val: String,
pub token_type: TokenType,
pub line_num: usize,
val: String,
token_type: TokenType,
line_num: usize,
}

impl Debug for Token {
    /// Renders a token as `[<type>|<line>]: "<escaped value>"`, escaping
    /// non-printable characters in the token text.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(
            f,
            "[{token_type:?}|{line_num}]: \"{val}\"",
            token_type = self.token_type,
            line_num = self.line_num,
            val = self.val.escape_default()
        )
    }
}

impl Token {
    /// Returns the token's raw text.
    pub fn get_val(&self) -> &str {
        &self.val
    }

    /// Returns a copy of the token's type tag.
    pub fn get_token_type(&self) -> TokenType {
        self.token_type.clone()
    }

    /// Returns the line number this token was lexed from.
    pub fn get_line_num(&self) -> usize {
        self.line_num
    }
}

impl<'a> Lexer<'a> {
impl Lexer {
const MIN_BUF_GARBAGE_COLLECTION_SIZE: usize = 4096;

pub fn new(schema_mgr: &'a SchemaConfig) -> Result<Self> {
pub fn new(schema_mgr: Rc<SchemaConfig>) -> Result<Self> {
let mut ts_nfas: Vec<NFA> = Vec::new();
for schema in schema_mgr.get_ts_schemas() {
let mut nfa = NFA::new();
Expand Down Expand Up @@ -270,7 +289,7 @@ impl<'a> Lexer<'a> {
}
curr_dfa_state = optional_next_state.unwrap();

match self.ts_dfa.is_accept_state(self.dfa_state.clone()) {
match self.ts_dfa.is_accept_state(curr_dfa_state.clone()) {
Some(ts_schema_id) => last_matched = Some((ts_schema_id, self.buf_cursor_pos)),
None => {}
}
Expand Down
2 changes: 2 additions & 0 deletions src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,7 @@ mod lexer_stream;
mod streams;

pub use lexer::Lexer;
pub use lexer::Token;
pub use lexer::TokenType;
pub use lexer_stream::LexerStream;
pub use streams::BufferedFileStream;
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
mod dfa;
pub mod error_handling;
pub mod lexer;
pub mod log_parser;
mod nfa;
pub mod parser;

Expand Down
145 changes: 145 additions & 0 deletions src/log_parser/log_parser.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
use std::fmt::Debug;
use crate::error_handling::Error::LogParserInternalErr;
use crate::error_handling::Result;
use crate::lexer::BufferedFileStream;
use crate::lexer::LexerStream;
use crate::lexer::{Lexer, Token, TokenType};
use crate::parser::SchemaConfig;
use std::rc::Rc;

// Groups the lexer's token stream into per-log-event batches, splitting on
// timestamp tokens.
pub struct LogParser {
// Underlying lexer that turns the input stream into tokens.
lexer: Lexer,
// Schema configuration shared with the lexer and each emitted `LogEvent`.
schema_config: Rc<SchemaConfig>,
// Tokens buffered for the log event currently being assembled; `None`
// after the buffer has been emitted as a `LogEvent`.
tokens: Option<Vec<Token>>,
}

// One parsed log event: an optional leading timestamp token followed by the
// message tokens, plus the line span it covers.
pub struct LogEvent {
// All tokens of the event; when `has_timestamp` is true the first token
// is the timestamp.
tokens: Vec<Token>,
// (first line, last line) covered by the tokens, inclusive.
line_range: (usize, usize),
// Whether the first token is a `TokenType::Timestamp`.
has_timestamp: bool,
// Schema configuration the tokens were lexed against.
schema_config: Rc<SchemaConfig>,
}

impl LogParser {
pub fn new(schema_config: Rc<SchemaConfig>) -> Result<Self> {
let lexer = Lexer::new(schema_config.clone())?;
Ok((Self {
lexer,
schema_config,
tokens: Some(Vec::new()),
}))
}

pub fn set_input_file(&mut self, path: &str) -> Result<()> {
self.tokens = Some(Vec::new());
let buffered_file_stream = Box::new(BufferedFileStream::new(path)?);
self.set_input_stream(buffered_file_stream)
}

pub fn set_input_stream(&mut self, input_stream: Box<dyn LexerStream>) -> Result<()> {
self.lexer.set_input_stream(input_stream);
Ok(())
}

pub fn parse_next_log_event(&mut self) -> Result<Option<LogEvent>> {
loop {
match self.lexer.get_next_token()? {
Some(token) => match token.get_token_type() {
TokenType::Timestamp(_) => {
if self.tokens.is_none() {
self.buffer_token(token);
continue;
}
let log_event = self.emit_buffered_tokens_as_log_event()?;
self.buffer_token(token);
return Ok(log_event);
}
_ => self.buffer_token(token),
},
None => break,
}
}
self.emit_buffered_tokens_as_log_event()
}

fn buffer_token(&mut self, token: Token) {
if self.tokens.is_none() {
self.tokens = Some(Vec::new());
}
self.tokens.as_mut().unwrap().push(token);
}

fn emit_buffered_tokens_as_log_event(&mut self) -> Result<Option<LogEvent>> {
match &self.tokens {
Some(_) => {
let tokens = self.tokens.take().unwrap();
LogEvent::new(self.schema_config.clone(), tokens)
}
None => Ok(None),
}
}
}

impl LogEvent {
fn new(schema_config: Rc<SchemaConfig>, tokens: Vec<Token>) -> Result<Option<Self>> {
if tokens.is_empty() {
return Err(LogParserInternalErr("The given token vector is empty"));
}
let has_timestamp = match tokens.first().unwrap().get_token_type() {
TokenType::Timestamp(_) => true,
_ => false,
};
let line_range = (
tokens.first().unwrap().get_line_num(),
tokens.last().unwrap().get_line_num(),
);
Ok(Some(
(Self {
tokens,
line_range,
has_timestamp,
schema_config,
}),
))
}

pub fn get_timestamp_token(&self) -> Option<&Token> {
match self.has_timestamp {
true => Some(&self.tokens[0]),
false => None,
}
}

pub fn get_line_range(&self) -> (usize, usize) {
self.line_range
}

pub fn get_log_message_tokens(&self) -> &[Token] {
match self.has_timestamp {
true => &self.tokens[1..],
false => &self.tokens[..],
}
}
}

impl Debug for LogEvent {
    /// Renders the event as a "Timestamp:" header followed by the message
    /// tokens grouped under "Line N:" headers.
    ///
    /// Writes directly to the formatter instead of accumulating an
    /// intermediate `String` (the original allocated a fresh `format!`
    /// string per token); the emitted text is unchanged.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self.get_timestamp_token() {
            Some(ts_token) => write!(f, "Timestamp:\n\t{:?}\n", ts_token)?,
            None => write!(f, "Timestamp:\n\tNONE\n")?,
        }

        let (mut curr_line_num, _) = self.get_line_range();
        write!(f, "Line {}:\n", curr_line_num)?;
        for token in self.get_log_message_tokens() {
            // Start a new header whenever the token moves to a new line.
            if token.get_line_num() != curr_line_num {
                curr_line_num = token.get_line_num();
                write!(f, "Line {}:\n", curr_line_num)?;
            }
            write!(f, "\t{:?}\n", token)?;
        }

        Ok(())
    }
}
4 changes: 4 additions & 0 deletions src/log_parser/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
//! Log-event parsing on top of the lexer's token stream.

mod log_parser;

pub use log_parser::{LogEvent, LogParser};
1 change: 1 addition & 0 deletions src/parser/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
pub(crate) mod regex_parser;

mod schema_parser;

pub use schema_parser::parser::SchemaConfig;
pub use schema_parser::parser::TimestampSchema;
pub use schema_parser::parser::VarSchema;
Loading

0 comments on commit 8452f2d

Please sign in to comment.