feat: Add Lexer and LogParser for tokenizing log events and structure parsed tokens from a given log stream. #11

Merged · 19 commits · Dec 15, 2024

Changes from 1 commit
Add meaningful unit test
LinZhihao-723 committed Dec 14, 2024
commit c26017d1c981d65287543e11d83e58251539c5da
30 changes: 26 additions & 4 deletions src/lexer/lexer.rs
@@ -49,12 +49,14 @@ enum TokenType {

#[derive(Debug)]
pub struct Token {
-    val: String,
-    token_type: TokenType,
-    line_num: usize,
+    pub val: String,
+    pub token_type: TokenType,
+    pub line_num: usize,
}

impl<'a> Lexer<'a> {
+    const MIN_BUF_GARBAGE_COLLECTION_SIZE: usize = 4096;
+
    pub fn new(schema_mgr: &'a ParsedSchema) -> Result<Self> {
        let mut ts_nfas: Vec<NFA> = Vec::new();
        for schema in schema_mgr.get_ts_schemas() {
@@ -296,11 +298,11 @@ impl<'a> Lexer<'a> {
            }

            if false == self.token_queue.is_empty() {
-                // TODO: Add garbage collection
                break;
            }
        }

+        self.buffer_garbage_collection();
        Ok(())
    }

@@ -373,4 +375,24 @@ impl<'a> Lexer<'a> {
    fn increment_buffer_cursor_pos(&mut self) {
        self.buf_cursor_pos += 1
    }
+
+    fn buffer_garbage_collection(&mut self) {
+        if self.last_tokenized_pos <= self.buf.len() / 2
+            || self.last_tokenized_pos <= Self::MIN_BUF_GARBAGE_COLLECTION_SIZE
+        {
+            return;
+        }
+
+        let mut dst_idx = 0usize;
+        let mut src_idx = self.last_tokenized_pos;
+        while src_idx < self.buf.len() {
+            self.buf[dst_idx] = self.buf[src_idx];
+            dst_idx += 1;
+            src_idx += 1;
+        }
+        self.buf.resize(dst_idx, 0 as char);
+        self.buf_cursor_pos -= self.last_tokenized_pos;
+        self.last_tokenized_pos = 0;
+        // No need to reset match_start/end
+    }
}
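
Note on the new `buffer_garbage_collection` step: once more than `MIN_BUF_GARBAGE_COLLECTION_SIZE` characters and more than half of the buffer have been tokenized, the copy loop shifts every remaining character down to index 0, truncates the vector, and rebases `buf_cursor_pos`. A minimal standalone sketch of the same compaction, assuming `buf` is a `Vec<char>` (the free-function form and its parameter names are illustrative, not part of this commit):

```rust
// Illustrative sketch only: `buf` and `last_tokenized_pos` stand in for the
// lexer's fields. Vec::drain removes the already-tokenized prefix and shifts
// the tail to the front, matching the manual dst/src copy loop above.
fn compact(buf: &mut Vec<char>, last_tokenized_pos: usize) {
    buf.drain(..last_tokenized_pos);
}
```

Either way, `buf_cursor_pos` still has to be reduced by `last_tokenized_pos` and `last_tokenized_pos` reset to 0 afterwards, exactly as the committed code does.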
26 changes: 24 additions & 2 deletions tests/lexer_test.rs
@@ -3,6 +3,9 @@ use log_surgeon::lexer::BufferedFileStream;
use log_surgeon::lexer::Lexer;
use log_surgeon::parser::ParsedSchema;

+use std::fs::File;
+use std::io::{self, BufRead};
+
#[test]
fn test_lexer_simple() -> Result<()> {
    let project_root = env!("CARGO_MANIFEST_DIR");
@@ -25,10 +28,29 @@ fn test_lexer_simple() -> Result<()> {
    }
    assert_eq!(false, tokens.is_empty());

+    let mut parsed_lines = Vec::new();
+    let mut parsed_line = String::new();
+    let mut curr_line_num = 0usize;
    for token in &tokens {
-        // TODO: Add meaningful assertion when DFA bug is fixed
-        println!("{:?}", token);
+        if curr_line_num != token.line_num {
+            parsed_lines.push(parsed_line.clone());
+            parsed_line.clear();
+            curr_line_num += 1;
+        }
+        parsed_line += &token.val.to_string();
    }
+    parsed_lines.push(parsed_line.clone());
+
+    let mut expected_lines = Vec::new();
+    let reader = io::BufReader::new(File::open(log_path).expect("failed to open log file"));
+    for line in reader.lines() {
+        let line = line.expect("failed to read line");
+        expected_lines.push(line + "\n");
+    }
+
+    assert_eq!(parsed_lines.len(), expected_lines.len());
+    assert_eq!(false, parsed_line.is_empty());
+    assert_eq!(parsed_lines, expected_lines);

    Ok(())
}
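
The strengthened test encodes a round-trip invariant: concatenating token values grouped by `line_num` must reproduce the input log file line for line. A hedged sketch of that grouping as a standalone helper (the function name and the `(line_num, value)` pair representation are illustrative, not part of this commit):

```rust
// Illustrative helper: rebuild source lines from (line_num, value) pairs.
// Unlike the test's running counter, this variant also tolerates tokens
// whose line numbers jump ahead by more than one.
fn rebuild_lines(tokens: &[(usize, String)]) -> Vec<String> {
    let mut lines: Vec<String> = Vec::new();
    for (line_num, val) in tokens {
        if *line_num >= lines.len() {
            // Grow with empty lines until the token's line exists.
            lines.resize(line_num + 1, String::new());
        }
        lines[*line_num].push_str(val);
    }
    lines
}
```

With tokens collected as such pairs, `assert_eq!(rebuild_lines(&pairs), expected_lines);` would express the same check as the committed assertions.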