From 0434976f939604e4b663b9dafde29ea6294b5cca Mon Sep 17 00:00:00 2001 From: Artemis Rosman Date: Fri, 10 May 2024 19:34:41 +1000 Subject: [PATCH] Implement parsing to tokens --- src/main.rs | 45 ++++++++---------------------- src/parse.rs | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/token.rs | 9 ++++++ 3 files changed, 97 insertions(+), 34 deletions(-) create mode 100644 src/parse.rs create mode 100644 src/token.rs diff --git a/src/main.rs b/src/main.rs index 5d2db7c..3685e92 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,15 +1,15 @@ #![allow(unused)] // Remove later -use std::{ - fs::File, - io::{BufRead, BufReader}, -}; - use clap::{Parser, Subcommand}; use colored::Colorize; +use parse::FileAndPath; mod command; +mod ops; +mod parse; mod state; +mod symbol; +mod token; /// Lace is a complete compiler and interpreter toolchain for the LC3 assembly language. #[derive(Parser)] @@ -76,35 +76,12 @@ fn main() { match command { Subcommands::Run { os, name } => todo!(), Subcommands::Compile { name, dest } => { - // Parse file into a buffer and symbol table - let file = File::open(&name).unwrap_or_else(|err| { - eprintln!( - "Failed to open file with path {}: {}", - name.bold(), - err.to_string().red() - ); - std::process::exit(1) - }); - - // Process lines and check for wrong file type - let lines = BufReader::new(file) - .lines() - .enumerate() - .map(|(i, line)| { - line.unwrap_or_else(|err| { - eprintln!("Failed to read line {}: {}", i, err.to_string().red()); - eprintln!(" --> {}:{}", name, i); - eprintln!( - "Check that you are providing a valid {} file.", - ".asm".bold() - ); - std::process::exit(1) - }) - }) - .collect::>(); - - // Start parsing lines into symbol table and IR - todo!() + let toks = FileAndPath::open(&name).tokenize_asm(); + for line in toks { + println!("\nFirst token: {}", line[0].val); + if line.len() < 2 {continue}; + println!("Second token: {} line {} column {}", line[1].val, line[1].line, line[1].col); + } } 
Subcommands::Clean { name } => todo!(), Subcommands::Watch { name } => todo!(), diff --git a/src/parse.rs b/src/parse.rs new file mode 100644 index 0000000..0844d13 --- /dev/null +++ b/src/parse.rs @@ -0,0 +1,77 @@ +use crate::token::Token; +use colored::Colorize; +use std::{ + fs::File, + io::{BufRead, BufReader, Read}, + path::PathBuf, +}; + +pub struct FileAndPath { + file: File, + path: PathBuf, +} + +impl FileAndPath { + pub fn open(path: &str) -> FileAndPath { + let path = PathBuf::from(&path); + let file = File::open(&path).unwrap_or_else(|err| { + eprintln!( + "Failed to open file with path {}: {}", + path.display(), + err.to_string().red() + ); + std::process::exit(1) + }); + + FileAndPath { file, path } + } + + pub fn tokenize_asm(&self) -> Vec> { + // Process lines and check for wrong file type + let contents = BufReader::new(&self.file) + .lines() + .enumerate() + .map(|(i, line)| { + line.unwrap_or_else(|err| { + eprintln!("Failed to read line {}: {}", i, err.to_string().red()); + eprintln!(" --> {}:{}", self.path.display(), i); + eprintln!( + "Check that you are providing a valid {} file.", + ".asm".bold() + ); + std::process::exit(1) + }) + }) + .collect::>(); + + // Turn lines into a vector that contains a list of tokens for each line + contents + .iter() + .enumerate() + .map(|(i, line)| { + // Get line without comment & whitespace + println!("{}", line); + let sc_idx = if let Some(idx) = line.find(';') { + idx + } else { + line.len() + }; + let clean_str = &line[..sc_idx]; + + // Split on commas and spaces -> vec + let arr_list: Vec = clean_str + .split(|c| c == ' ' || c == ',') + .filter(|word| !word.is_empty()) + .map(|word| Token { + val: word.into(), + line: (i + 1) as u16, + col: unsafe { word.as_ptr().offset_from(line.as_ptr()) + 1 } as u16, + }) + .collect::>(); + arr_list + }) + // Filter after iteration to preserve correct line numbers in tokens + .filter(|line| !line.is_empty()) + .collect() + } +} diff --git a/src/token.rs 
/// Represents a single "word" inside the parsed representation of source code.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// Value contained inside the token.
    pub val: String,
    /// 1-based line number inside the file.
    pub line: u16,
    /// 1-based column number inside the line.
    pub col: u16,
}