From 52e7511c8f6c6207878cc90e4dc18003f413a8dd Mon Sep 17 00:00:00 2001 From: Jacob Johannsen Date: Wed, 20 Sep 2023 19:22:46 +0200 Subject: [PATCH] Lexer, parser and AST done. --- Cargo.lock | 232 +++++- Cargo.toml | 1 + hir-parser/Cargo.toml | 26 + hir-parser/README.md | 39 ++ hir-parser/build.rs | 5 + hir-parser/src/ast/block.rs | 92 +++ hir-parser/src/ast/functions.rs | 217 ++++++ hir-parser/src/ast/globals.rs | 89 +++ hir-parser/src/ast/instruction.rs | 609 ++++++++++++++++ hir-parser/src/ast/mod.rs | 146 ++++ hir-parser/src/ast/types.rs | 90 +++ hir-parser/src/lexer/mod.rs | 973 ++++++++++++++++++++++++++ hir-parser/src/lib.rs | 107 +++ hir-parser/src/parser/grammar.lalrpop | 757 ++++++++++++++++++++ hir-parser/src/parser/mod.rs | 260 +++++++ hir-parser/src/symbols.rs | 163 +++++ hir/src/write.rs | 21 +- 17 files changed, 3812 insertions(+), 15 deletions(-) create mode 100644 hir-parser/Cargo.toml create mode 100644 hir-parser/README.md create mode 100644 hir-parser/build.rs create mode 100644 hir-parser/src/ast/block.rs create mode 100644 hir-parser/src/ast/functions.rs create mode 100644 hir-parser/src/ast/globals.rs create mode 100644 hir-parser/src/ast/instruction.rs create mode 100644 hir-parser/src/ast/mod.rs create mode 100644 hir-parser/src/ast/types.rs create mode 100644 hir-parser/src/lexer/mod.rs create mode 100644 hir-parser/src/lib.rs create mode 100644 hir-parser/src/parser/grammar.lalrpop create mode 100644 hir-parser/src/parser/mod.rs create mode 100644 hir-parser/src/symbols.rs diff --git a/Cargo.lock b/Cargo.lock index 77f9d00b0..70d14101a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -100,6 +100,15 @@ version = "1.0.75" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" +[[package]] +name = "ascii-canvas" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8824ecca2e851cec16968d54a01dd372ef8f95b244fb84b84e70128be347c3c6" +dependencies = [ + "term", +] + [[package]] name = "atty" version = "0.2.14" @@ -132,6 +141,21 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + [[package]] name = "bitflags" version = "1.3.2" @@ -254,6 +278,18 @@ version = "0.95.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40099d38061b37e505e63f89bab52199037a72b931ad4868d9089ff7268660b0" +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "diff" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" + [[package]] name = "dirs-next" version = "2.0.0" @@ -281,6 +317,15 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +[[package]] +name = "ena" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c533630cf40e9caa44bd91aadc88a75d75a4c3a12b4cfde353cbed41daa1e1f1" +dependencies = [ + "log", +] + [[package]] name = "env_logger" version = "0.9.3" @@ -346,6 +391,12 @@ dependencies = [ "lit", ] +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + [[package]] name = "flurry" version = "0.4.0" @@ -443,6 +494,17 @@ dependencies = [ "memoffset", ] +[[package]] +name = "is-terminal" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" +dependencies = [ + "hermit-abi 0.3.2", + "rustix", + "windows-sys", +] + [[package]] name = "itertools" version = "0.10.5" @@ -452,6 +514,34 @@ dependencies = [ "either", ] +[[package]] +name = "lalrpop" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da4081d44f4611b66c6dd725e6de3169f9f63905421e8626fcb86b6a898998b8" +dependencies = [ + "ascii-canvas", + "bit-set", + "diff", + "ena", + "is-terminal", + "itertools", + "lalrpop-util", + "petgraph", + "regex", + "regex-syntax", + "string_cache", + "term", + "tiny-keccak", + "unicode-xid", +] + +[[package]] +name = "lalrpop-util" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f35c735096c0293d313e8f2a641627472b83d01b937177fe76e5e2708d31e0d" + [[package]] name = "lazy_static" version = "1.4.0" @@ -517,6 +607,22 @@ dependencies = [ "autocfg", ] +[[package]] +name = "miden-diagnostics" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f3a82597c2a9babcff4c9283a95130a96aaf8e339954a083bb6582fc2520cf1" +dependencies = [ + "atty", + "codespan", + "codespan-reporting", + "flurry", + "miden-diagnostics-macros 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "parking_lot", + "rustc-hash", + "unicode-width", +] + [[package]] name = "miden-diagnostics" version = "0.1.0" @@ -526,12 +632,23 @@ dependencies = [ "codespan", "codespan-reporting", "flurry", - "miden-diagnostics-macros", + "miden-diagnostics-macros 0.1.0 (git+https://github.com/0xpolygonmiden/miden-diagnostics)", "parking_lot", "rustc-hash", 
"unicode-width", ] +[[package]] +name = "miden-diagnostics-macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "491d10b0eb201ba767ccdf69bf77f9d5662caf55e9ef468264cccb7129edff62" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "miden-diagnostics-macros" version = "0.1.0" @@ -549,7 +666,7 @@ dependencies = [ "anyhow", "cranelift-entity", "intrusive-collections", - "miden-diagnostics", + "miden-diagnostics 0.1.0 (git+https://github.com/0xpolygonmiden/miden-diagnostics)", "miden-hir-symbol", "miden-hir-type", "paste", @@ -567,7 +684,7 @@ dependencies = [ "anyhow", "cranelift-bforest", "cranelift-entity", - "miden-diagnostics", + "miden-diagnostics 0.1.0 (git+https://github.com/0xpolygonmiden/miden-diagnostics)", "miden-hir", "rustc-hash", "smallvec", @@ -593,7 +710,7 @@ version = "0.1.0" dependencies = [ "anyhow", "cranelift-entity", - "miden-diagnostics", + "miden-diagnostics 0.1.0 (git+https://github.com/0xpolygonmiden/miden-diagnostics)", "miden-hir", "miden-hir-analysis", "miden-hir-pass", @@ -605,6 +722,30 @@ dependencies = [ name = "miden-hir-type" version = "0.1.0" +[[package]] +name = "miden-ir-parser" +version = "0.4.0" +dependencies = [ + "lalrpop", + "lalrpop-util", + "lazy_static", + "miden-diagnostics 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "miden-parsing", + "pretty_assertions", + "regex", + "thiserror", +] + +[[package]] +name = "miden-parsing" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d36dfec2c0319b3773a83627318f92a0212077bed80148c86e8b09af60cd1a88" +dependencies = [ + "miden-diagnostics 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "thiserror", +] + [[package]] name = "midenc" version = "0.1.0" @@ -615,7 +756,7 @@ dependencies = [ "env_logger", "human-panic", "log", - "miden-diagnostics", + "miden-diagnostics 0.1.0 
(git+https://github.com/0xpolygonmiden/miden-diagnostics)", ] [[package]] @@ -627,6 +768,12 @@ dependencies = [ "adler", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" + [[package]] name = "num_cpus" version = "1.16.0" @@ -692,6 +839,41 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" +[[package]] +name = "petgraph" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" +dependencies = [ + "fixedbitset", + "indexmap", +] + +[[package]] +name = "phf_shared" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher", +] + +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + +[[package]] +name = "pretty_assertions" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66" +dependencies = [ + "diff", + "yansi", +] + [[package]] name = "proc-macro2" version = "1.0.66" @@ -853,12 +1035,31 @@ dependencies = [ "serde", ] +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + [[package]] name = "smallvec" version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +[[package]] +name = "string_cache" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" +dependencies = [ + "new_debug_unreachable", + "once_cell", + "parking_lot", + "phf_shared", + "precomputed-hash", +] + [[package]] name = "strsim" version = "0.8.0" @@ -955,6 +1156,15 @@ dependencies = [ "syn 2.0.31", ] +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "toml" version = "0.7.6" @@ -1007,6 +1217,12 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" +[[package]] +name = "unicode-xid" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" + [[package]] name = "utf8parse" version = "0.2.1" @@ -1170,3 +1386,9 @@ name = "winter-utils" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b903fda6a50cce2aa5a172a9269aca0f09b25df20afb1faa427db76d40779671" + +[[package]] +name = "yansi" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" diff --git a/Cargo.toml b/Cargo.toml index 2b3697dd7..14d121082 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ members = [ "hir", + "hir-parser", "hir-analysis", "hir-pass", "hir-symbol", diff --git a/hir-parser/Cargo.toml b/hir-parser/Cargo.toml new file mode 100644 index 000000000..65e5e6e06 --- /dev/null +++ b/hir-parser/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = 
"miden-ir-parser" +version = "0.4.0" +description = "Parser for the Miden IR language" +authors = ["miden contributors"] +readme = "README.md" +license = "MIT" +repository = "https://github.com/0xPolygonMiden/miden-ir" +categories = ["Compilers"] +keywords = ["compiler", "miden"] +edition = "2021" +rust-version = "1.67" + +[build-dependencies] +lalrpop = { version = "0.20", default-features = false } + +[dependencies] +miden-diagnostics = "0.1" +miden-parsing = "0.1" +lalrpop-util="0.20" +lazy_static = "1.4" +regex = "1" +thiserror = "1.0" + +[dev-dependencies] +pretty_assertions = "1.0" diff --git a/hir-parser/README.md b/hir-parser/README.md new file mode 100644 index 000000000..a6d7558cc --- /dev/null +++ b/hir-parser/README.md @@ -0,0 +1,39 @@ +# Parser + +This crate contains the parser for Miden IR. + +The purpose of the parser is to parse the Miden IR language into an Abstract Syntax Tree. + +## Generating the Abstract Syntax Tree (AST) + +The parser uses [Logos](https://github.com/maciejhirsz/logos/) to generate a custom lexer, which is then fed into the parser generated by [LALRPOP](https://github.com/lalrpop/lalrpop/). + +To create an AST from a given Miden IR program or module, pass your source to the public `parse` function, which will return the AST or an `Error` of type `ScanError` or `ParseError`. + +The `parse` function will first tokenize the source using the lexer, then map the resulting tokens to new tokens accepted by the parser, which are of type `(usize, Token, usize)`. Each invalid token will be stored as `ScanError`. Finally, if no `ScanError` occurred, `parse` feeds the tokens to the parser to generate a Result with the corresponding AST (or `ParseError`). 
+ +Example usage: + +```Rust +// parse the source string to a Result containing the AST or an Error +let ast = parse(source.as_str()); +``` + +## AST + +The Miden IR AST (`Source`) contains a vector of `SourceSection`, each of which contains the result of parsing a section in a Miden IR program or module. + + + +TODO: + +The `SourceSection` types are: + +- `AirDef`, which holds the name of the AIR. +- `Constant`, which holds a named constant to be used to write constraints. +- `Trace`, which contains the parsed trace column information for the main and auxiliary execution traces. Each column or group of columns is represented by its identifier and can be accessed in constraints using this identifier. These columns can also be accessed using the built-in variables `$main[idx]` and `$aux[idx]`, where `idx` is the index of the column in the trace. +- `PublicInputs`, which is a vector of all of the public inputs defined in the module. Each public input is represented by its identifier and a fixed size. +- `PeriodicColumns`, which is a vector of all of the periodic columns defined in the module. Each periodic column is represented by its identifier and a vector containing the pattern of its repeated (periodic) values. +- `RandomValues`, which is a vector of all of the random values provided by the verifier. Each random value or group of random values is represented by its identifier and can be accessed in constraints using this identifier. These random values can also be accessed using `$rand[idx]`, where `rand` is the name of the random values array and `idx` is the index of the random value in that array. +- `BoundaryConstraints`, which contains a vector of `BoundaryStmt` statements, each of which can be either a boundary constraint or an intermediate variable. Each boundary constraint is represented as an expression tree. Variables can be scalars, vectors or matrices containing expression trees. 
+- `IntegrityConstraints`, which contains a vector of `IntegrityStmt` statements, each of which can be either an integrity constraint or an intermediate variable. Each integrity constraint is represented as an expression tree. Variables can be scalars, vectors or matrices containing expression trees. \ No newline at end of file diff --git a/hir-parser/build.rs b/hir-parser/build.rs new file mode 100644 index 000000000..23c7d3f80 --- /dev/null +++ b/hir-parser/build.rs @@ -0,0 +1,5 @@ +extern crate lalrpop; + +fn main() { + lalrpop::process_root().unwrap(); +} diff --git a/hir-parser/src/ast/block.rs b/hir-parser/src/ast/block.rs new file mode 100644 index 000000000..a7da7d592 --- /dev/null +++ b/hir-parser/src/ast/block.rs @@ -0,0 +1,92 @@ +use super::*; + +const INDENT: &str = " "; + +/// Represents the label at the start of a basic block. +/// +/// Labels must be unique within each function. +pub struct Label { + pub name: Identifier, +} +impl Label { + pub fn new(name: Identifier) -> Self { + Self { name } + } +} +impl fmt::Display for Label { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.name) + } +} + +/// Represents an argument for a basic block +pub struct BlockArgument { + pub value: Value, + pub ty: Type, +} +impl BlockArgument { + pub fn new(value: Value, ty: Type) -> Self { + Self { value, ty } + } +} +impl fmt::Display for BlockArgument { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{} : {} ", self.value, self.ty) + } +} + +/// Represents the label and the arguments of a basic block +pub struct BlockHeader { + pub label: Label, + pub args: Vec<BlockArgument>, +} +impl BlockHeader { + pub fn new(label: Label, args: Vec<BlockArgument>) -> Self { + Self { label, args } + } +} +impl fmt::Display for BlockHeader { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{} ", self.label)?; + if self.args.is_empty() { + f.write_str(":\n") + } else { + f.write_str("(")?; + for (i, arg) in self.args.iter().enumerate() {
+ if i != 0 { + write!(f, ", {}", arg)?; + } else { + write!(f, "{}", arg)?; + } + } + f.write_str(") :\n") + } + } +} + +/// Represents a basic block of instructions +#[derive(Spanned)] +pub struct Block { + #[span] + pub span: SourceSpan, + pub header: BlockHeader, + pub instructions: Vec<Instruction>, +} +impl Block { + pub fn new(span: SourceSpan, header: BlockHeader, instructions: Vec<Instruction>) -> Self { + Self { + span, + header, + instructions, + } + } +} +impl fmt::Display for Block { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.header)?; + for inst in self.instructions.iter() { + writeln!(f, "{}{}", INDENT, inst)?; + } + Ok(()) + } +} diff --git a/hir-parser/src/ast/functions.rs b/hir-parser/src/ast/functions.rs new file mode 100644 index 000000000..44df73b97 --- /dev/null +++ b/hir-parser/src/ast/functions.rs @@ -0,0 +1,217 @@ +use super::*; + +/// Represents the (possibly module-qualified) name of a function. +/// +/// A function identifier is a non-empty sequence of identifiers, separated by double +/// colons ("::"). The last identifier in the sequence denotes the name of the function +/// itself. The other identifiers denote the module that the function can be found in. If +/// the function identifier only consists of a single identifier, then the function must +/// be found in the current module.
+#[derive(Spanned)] +pub struct FunctionIdentifier { + #[span] + span: SourceSpan, + names: Vec<Identifier>, +} +impl FunctionIdentifier { + pub fn new(span: SourceSpan, names: Vec<Identifier>) -> Self { + Self { span, names } + } + + pub fn id(&self) -> &Identifier { + self.names.last().expect("function identifier must contain at least one name") + } +} +impl fmt::Display for FunctionIdentifier { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + for (i, id) in self.names.iter().enumerate() { + if i > 0 { + f.write_str("::")?; + } + write!(f, "{}", id)?; + } + Ok(()) + } +} + +/// The possible visibilities of a function +pub enum Visibility { + /// (Module) private visibility + Private, + /// Public visibility + Public, +} + +/// The possible calling conventions of a function +pub enum CallConvention { + /// Default call convention + Default, + /// Kernel call convention + Kernel, + /// Fast call convention + Fast, +} + +/// The possible purposes of a function parameter +pub enum ParameterPurpose { + /// Standard parameter + Standard, + /// Parameter for struct return + Sret, +} + +/// The possible extensions of a function parameter when filling up a word +pub enum ParameterExtension { + /// No extension + None, + /// 0 extended + Zero, + /// Sign extended + Signed, +} + +/// A single parameter to a function. +/// Parameter names are defined in the entry block for the function.
+pub struct FunctionParameter { + /// The purpose of the parameter (standard or struct return) + pub purpose: ParameterPurpose, + /// The bit extension for the parameter + pub extension: ParameterExtension, + /// The type of the parameter + pub ty: Type, +} +impl FunctionParameter { + pub fn new(purpose: ParameterPurpose, extension: ParameterExtension, ty: Type) -> Self { + Self { + purpose, + extension, + ty, + } + } +} +impl fmt::Display for FunctionParameter { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.purpose { + ParameterPurpose::Standard => Ok(()), + ParameterPurpose::Sret => f.write_str("sret "), + }?; + match self.extension { + ParameterExtension::None => Ok(()), + ParameterExtension::Zero => f.write_str("zext "), + ParameterExtension::Signed => f.write_str("sext "), + }?; + write!(f, "{}", self.ty) + } +} + +/// A single return value from a function. +pub struct FunctionReturn { + /// The bit extension for the return value + pub extension: ParameterExtension, + /// The type of the return value + pub ty: Type, +} +impl FunctionReturn { + pub fn new(extension: ParameterExtension, ty: Type) -> Self { + Self { extension, ty } + } +} +impl fmt::Display for FunctionReturn { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.extension { + ParameterExtension::None => Ok(()), + ParameterExtension::Zero => f.write_str("zext "), + ParameterExtension::Signed => f.write_str("sext "), + }?; + write!(f, "{}", self.ty) + } +} + +/// Represents the type signature of a function +#[derive(Spanned)] +pub struct FunctionSignature { + #[span] + pub span: SourceSpan, + pub visibility: Visibility, + pub call_convention: CallConvention, + pub name: FunctionIdentifier, + pub params: Vec<FunctionParameter>, + pub returns: Vec<FunctionReturn>, +} +impl FunctionSignature { + pub fn new( + span: SourceSpan, + visibility: Visibility, + call_convention: CallConvention, + name: FunctionIdentifier, + params: Vec<FunctionParameter>, + returns: Vec<FunctionReturn>, + ) -> Self { + Self { + span, + visibility, +
call_convention, + name, + params, + returns, + } + } +} +impl fmt::Display for FunctionSignature { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.visibility { + Visibility::Private => Ok(()), + Visibility::Public => f.write_str("pub "), + }?; + match self.call_convention { + CallConvention::Default => Ok(()), + CallConvention::Kernel => f.write_str("kernel "), + CallConvention::Fast => f.write_str("fast "), + }?; + write!(f, "{}(", self.name)?; + for (i, param) in self.params.iter().enumerate() { + if i != 0 { + write!(f, ", {}", param)?; + } else { + write!(f, "{}", param)?; + } + } + f.write_str(")")?; + for (i, ret) in self.returns.iter().enumerate() { + if i != 0 { + write!(f, ", {}", ret)?; + } else { + write!(f, "{}", ret)?; + } + } + Ok(()) + } +} + +/// Represents the declaration of a function +#[derive(Spanned)] +pub struct FunctionDeclaration { + #[span] + pub span: SourceSpan, + pub signature: FunctionSignature, + pub blocks: Vec<Block>, +} +impl FunctionDeclaration { + pub fn new(span: SourceSpan, signature: FunctionSignature, blocks: Vec<Block>) -> Self { + Self { + span, + signature, + blocks, + } + } +} +impl fmt::Display for FunctionDeclaration { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.signature)?; + f.write_str("{\n")?; + for block in self.blocks.iter() { + write!(f, "{}", block)?; + } + f.write_str("}\n") + } +} diff --git a/hir-parser/src/ast/globals.rs b/hir-parser/src/ast/globals.rs new file mode 100644 index 000000000..b7d9d761b --- /dev/null +++ b/hir-parser/src/ast/globals.rs @@ -0,0 +1,89 @@ +use std::fmt; + +use miden_diagnostics::{SourceSpan, Spanned}; + +use super::*; + +/// This is a type alias used to clarify that an identifier refers to a global variable +pub type GlobalVarId = Identifier; + +/// A constant value in the form of a hexadecimal string +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Constant { + pub data: Vec<u8>, +} +impl Constant { + pub fn new(data: Vec<u8>) -> Self
{ + Self { data } + } +} +impl fmt::Display for Constant { + /// Print the constant data in hexadecimal format, e.g. 0x000102030405060708090a0b0c0d0e0f. + /// + /// The printed form of the constant renders the bytes in big-endian order, for readability. + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if !self.data.is_empty() { + write!(f, "0x")?; + for b in self.data.iter().rev() { + write!(f, "{:02x}", b)?; + } + } + Ok(()) + } +} + +/// This represents the declaration of a Miden IR global variable +#[derive(Spanned)] +pub struct GlobalVarDeclaration { + #[span] + pub span: SourceSpan, + pub name: GlobalVarId, + pub ty: Type, + pub linkage: Linkage, + pub init: Option<Constant>, +} +impl GlobalVarDeclaration { + /// Constructs a new global variable with the given span, name, type, and linkage. + /// The initializer is left unset; call `with_init` to attach one. + pub fn new(span: SourceSpan, name: GlobalVarId, ty: Type, linkage: Linkage) -> Self { + Self { + span, + name, + ty, + linkage, + init: None, + } + } + + pub fn with_init(&mut self, init: Constant) { + self.init = Some(init) + } +} +impl fmt::Display for GlobalVarDeclaration { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{} {} {}", self.name, self.ty, self.linkage)?; + if let Some(init) = &self.init { + write!(f, " = {}", init)?; + } + Ok(()) + } +} +/// Represents the intended linkage for a global variable.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub enum Linkage { + /// Global linkage + Internal, + /// "One definition rule" linkage + Odr, + /// External linkage + External, +} +impl fmt::Display for Linkage { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Internal => write!(f, "internal"), + Self::Odr => write!(f, "odr"), + Self::External => write!(f, "external"), + } + } +} diff --git a/hir-parser/src/ast/instruction.rs b/hir-parser/src/ast/instruction.rs new file mode 100644 index 000000000..72a837058 --- /dev/null +++ b/hir-parser/src/ast/instruction.rs @@ -0,0 +1,609 @@ +use super::*; + +/// Represents a value in Miden IR. +/// +/// All intermediate values are named, and have an associated [Value]. +/// Value identifiers must be globally unique. +pub struct Value { + pub name: Identifier, +} +impl Value { + pub fn new(name: Identifier) -> Self { + Self { name } + } +} +impl fmt::Display for Value { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.name) + } +} + +/// Immediates are converted at a later stage +pub enum Immediate { + Pos(u128), + Neg(u128), +} +impl fmt::Display for Immediate { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Pos(0) | Self::Neg(0) => f.write_str("0"), + Self::Pos(v) => write!(f, "{}", v), + Self::Neg(v) => write!(f, "-{}", v), + } + } +} + +/// Represents a single instruction. +/// +/// An instruction consists of a single operation, and a number of values that +/// represent the results of the operation. 
Additionally, the instruction contains +/// the types of the produced results +#[derive(Spanned)] +pub struct Instruction { + #[span] + pub span: SourceSpan, + pub values: Vec<Value>, + pub op: Operation, + pub types: Vec<Type>, +} +impl Instruction { + pub fn new(span: SourceSpan, values: Vec<Value>, op: Operation, types: Vec<Type>) -> Self { + Self { + span, + values, + op, + types, + } + } +} +impl fmt::Display for Instruction { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if self.values.is_empty() { + write!(f, "{}", self.op)?; + } else { + for (i, v) in self.values.iter().enumerate() { + if i != 0 { + write!(f, ", {}", v)?; + } else { + write!(f, "{}", v)?; + } + } + write!(f, " = {} : ", self.op)?; + for (i, t) in self.types.iter().enumerate() { + if i != 0 { + write!(f, ", {}", t)?; + } else { + write!(f, "{}", t)?; + } + } + } + Ok(()) + } +} + +/// Represents an operation and its arguments +pub enum Operation { + BinaryOp(BinaryOpCode, Value, Value), + BinaryImmOp(BinaryImmOpCode, Value, Immediate), + UnaryOp(UnaryOpCode, Value), + UnaryImmOp(UnaryImmOpCode, Immediate), + ReturnOp(Vec<Value>), + CallOp(CallOp, FunctionIdentifier, Vec<Value>), + CondOp(Value, Destination, Destination), + BranchOp(Destination), + SwitchOp(Value, Vec), + TestOp(Type, Value), + PrimOp(PrimOpCode, Vec<Value>), + LoadOp(Value), + MemCpyOp(Type, Value, Value, Value), + GlobalValueOp(GlobalValueOp), +} +impl fmt::Display for Operation { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::BinaryOp(op, v1, v2) => { + write!(f, "{} {} {}", op, v1, v2) + } + Self::BinaryImmOp(op, v, i) => { + write!(f, "{} {} {}", op, v, i) + } + Self::UnaryOp(op, v) => { + write!(f, "{} {}", op, v) + } + Self::UnaryImmOp(op, i) => { + write!(f, "{} {}", op, i) + } + Self::ReturnOp(vs) => { + f.write_str("ret")?; + for (i, v) in vs.iter().enumerate() { + if i > 0 { + f.write_str(",")?; + } + write!(f, " {}", v)?; + } + Ok(()) + } + Self::CallOp(op, id, vs) => { + write!(f, "{} {} (", op, id)?; + for (i, v)
in vs.iter().enumerate() { + if i > 0 { + f.write_str(", ")?; + } + write!(f, "{}", v)?; + } + f.write_str(")") + } + Self::CondOp(v, dest1, dest2) => { + write!(f, "cond {}, {}, {}", v, dest1, dest2) + } + Self::BranchOp(dest) => { + write!(f, "branch {}", dest) + } + Self::SwitchOp(v, branches) => { + writeln!(f, "switch {} {{", v)?; + for (i, b) in branches.iter().enumerate() { + if i > 0 { + f.write_str(",\n")?; + } + //TODO: Indentation + write!(f, "{}", b)?; + } + //TODO: Indentation + f.write_str("\n}") + } + Self::TestOp(t, v) => { + write!(f, "test.{} {}", t, v) + } + Self::PrimOp(op, vs) => { + write!(f, "{}", op)?; + for (i, v) in vs.iter().enumerate() { + if i > 0 { + f.write_str(",")?; + } + write!(f, " {}", v)?; + } + Ok(()) + } + Self::LoadOp(v) => { + write!(f, "load {}", v) + } + Self::MemCpyOp(t, v1, v2, v3) => { + write!(f, "memcpy.{} {}, {}, {}", t, v1, v2, v3) + } + Self::GlobalValueOp(op) => { + write!(f, "{}", op) + } + } + } +} + +/// Used to distinguish between user calls and kernel calls +pub enum CallOp { + Call, + SysCall, +} +impl fmt::Display for CallOp { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Call => f.write_str("call"), + Self::SysCall => f.write_str("syscall"), + } + } +} + +/// Used to distinguish between binary operations +pub enum BinaryOpCode { + Add(Overflow), + Sub(Overflow), + Mul(Overflow), + Div(Overflow), + Min(Overflow), + Max(Overflow), + Mod(Overflow), + DivMod(Overflow), + Exp(Overflow), + And, + BAnd(Overflow), + Or, + BOr(Overflow), + Xor, + BXor(Overflow), + Shl(Overflow), + Shr(Overflow), + Rotl(Overflow), + Rotr(Overflow), + Eq, + Neq, + Gt, + Gte, + Lt, + Lte, + Store, +} +impl fmt::Display for BinaryOpCode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Add(overflow) => { + write!(f, "add.{}", overflow) + } + Self::Sub(overflow) => { + write!(f, "sub.{}", overflow) + } + Self::Mul(overflow) => { + write!(f, "mul.{}", overflow) + } +
Self::Div(overflow) => { + write!(f, "div.{}", overflow) + } + Self::Min(overflow) => { + write!(f, "min.{}", overflow) + } + Self::Max(overflow) => { + write!(f, "max.{}", overflow) + } + Self::Mod(overflow) => { + write!(f, "mod.{}", overflow) + } + Self::DivMod(overflow) => { + write!(f, "divmod.{}", overflow) + } + Self::Exp(overflow) => { + write!(f, "exp.{}", overflow) + } + Self::And => f.write_str("and"), + Self::BAnd(overflow) => { + write!(f, "band.{}", overflow) + } + Self::Or => f.write_str("or"), + Self::BOr(overflow) => { + write!(f, "bor.{}", overflow) + } + Self::Xor => f.write_str("xor"), + Self::BXor(overflow) => { + write!(f, "bxor.{}", overflow) + } + Self::Shl(overflow) => { + write!(f, "shl.{}", overflow) + } + Self::Shr(overflow) => { + write!(f, "shr.{}", overflow) + } + Self::Rotl(overflow) => { + write!(f, "rotl.{}", overflow) + } + Self::Rotr(overflow) => { + write!(f, "rotr.{}", overflow) + } + Self::Eq => f.write_str("eq"), + Self::Neq => f.write_str("neq"), + Self::Gt => f.write_str("gt"), + Self::Gte => f.write_str("gte"), + Self::Lt => f.write_str("lt"), + Self::Lte => f.write_str("lte"), + Self::Store => f.write_str("store"), + } + } +} + +/// Used to distinguish between immediate binary operations +pub enum BinaryImmOpCode { + AddImm(Overflow), + SubImm(Overflow), + MulImm(Overflow), + DivImm(Overflow), + MinImm(Overflow), + MaxImm(Overflow), + ModImm(Overflow), + DivModImm(Overflow), + ExpImm(Overflow), + AndImm, + BAndImm(Overflow), + OrImm, + BOrImm(Overflow), + XorImm, + BXorImm(Overflow), + ShlImm(Overflow), + ShrImm(Overflow), + RotlImm(Overflow), + RotrImm(Overflow), +} +impl fmt::Display for BinaryImmOpCode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::AddImm(overflow) => { + write!(f, "add_imm.{}", overflow) + } + Self::SubImm(overflow) => { + write!(f, "sub_imm.{}", overflow) + } + Self::MulImm(overflow) => { + write!(f, "mul_imm.{}", overflow) + } + Self::DivImm(overflow) => { + 
write!(f, "div_imm.{}", overflow) + } + Self::MinImm(overflow) => { + write!(f, "min_imm.{}", overflow) + } + Self::MaxImm(overflow) => { + write!(f, "max_imm.{}", overflow) + } + Self::ModImm(overflow) => { + write!(f, "mod_imm.{}", overflow) + } + Self::DivModImm(overflow) => { + write!(f, "divmod_imm.{}", overflow) + } + Self::ExpImm(overflow) => { + write!(f, "exp_imm.{}", overflow) + } + // Boolean immediates carry the `_imm` suffix as well, mirroring the `and_imm`/`or_imm`/`xor_imm` lexer keywords. + Self::AndImm => f.write_str("and_imm"), + Self::BAndImm(overflow) => { + write!(f, "band_imm.{}", overflow) + } + Self::OrImm => f.write_str("or_imm"), + Self::BOrImm(overflow) => { + write!(f, "bor_imm.{}", overflow) + } + Self::XorImm => f.write_str("xor_imm"), + Self::BXorImm(overflow) => { + write!(f, "bxor_imm.{}", overflow) + } + Self::ShlImm(overflow) => { + write!(f, "shl_imm.{}", overflow) + } + Self::ShrImm(overflow) => { + write!(f, "shr_imm.{}", overflow) + } + Self::RotlImm(overflow) => { + write!(f, "rotl_imm.{}", overflow) + } + Self::RotrImm(overflow) => { + write!(f, "rotr_imm.{}", overflow) + } + } + } +} + +/// Used to distinguish between unary operations +pub enum UnaryOpCode { + Inv, + Incr, + Pow2, + Not, + BNot, + PopCnt, + IsOdd, + Cast, + PtrToInt, + IntToPtr, + TruncW, + Zext, + Sext, + Neg, +} +impl fmt::Display for UnaryOpCode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Inv => f.write_str("inv"), + Self::Incr => f.write_str("incr"), + Self::Pow2 => f.write_str("pow2"), + Self::Not => f.write_str("not"), + Self::BNot => f.write_str("bnot"), + Self::PopCnt => f.write_str("popcnt"), + Self::IsOdd => f.write_str("is_odd"), + Self::Cast => f.write_str("cast"), + Self::PtrToInt => f.write_str("ptrtoint"), + Self::IntToPtr => f.write_str("inttoptr"), + Self::TruncW => f.write_str("truncw"), + Self::Zext => f.write_str("zext"), + Self::Sext => f.write_str("sext"), + Self::Neg => f.write_str("neg"), + } + } +} + +/// Used to distinguish between immediate unary operations +pub enum UnaryImmOpCode { + I1, + I8, + I16, + I32, + I64, +
ISize, + Felt, + F64, +} +impl fmt::Display for UnaryImmOpCode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str("const.")?; + match self { + Self::I1 => f.write_str("i1"), + Self::I8 => f.write_str("i8"), + Self::I16 => f.write_str("i16"), + Self::I32 => f.write_str("i32"), + Self::I64 => f.write_str("i64"), + Self::ISize => f.write_str("isize"), + Self::Felt => f.write_str("felt"), + Self::F64 => f.write_str("f64"), + } + } +} + +/// Used to distinguish between primary operations +pub enum PrimOpCode { + Select, + Assert, + Assertz, + AssertEq, + Alloca, + Unreachable, +} +impl fmt::Display for PrimOpCode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Select => f.write_str("select"), + Self::Assert => f.write_str("assert"), + Self::Assertz => f.write_str("assertz"), + Self::AssertEq => f.write_str("asserteq"), + Self::Alloca => f.write_str("alloca"), + Self::Unreachable => f.write_str("unreachable"), + } + } +} + +/// Memory offset for global variable reads. +/// Conversion to i32 happens during transformation to hir. 
+pub enum Offset { + Pos(u128), + Neg(u128), +} +impl Offset { + pub fn is_zero(&self) -> bool { + match self { + Self::Pos(offset) | Self::Neg(offset) => offset == &0, + } + } +} +impl fmt::Display for Offset { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Pos(offset) => { + if offset > &0 { + write!(f, "+{}", offset)?; + } + } + Self::Neg(offset) => { + if offset > &0 { + write!(f, "-{}", offset)?; + } + } + } + Ok(()) + } +} + +/// Used to distinguish between nested global value operations +pub enum GlobalValueOpNested { + Symbol(Identifier, Offset), + Load(Box, Offset), + Cast(Box, Offset, Type), +} +impl fmt::Display for GlobalValueOpNested { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Symbol(id, offset) => { + write!(f, "@ {} {}", id, offset) + } + Self::Load(nested, offset) => { + f.write_str("* ")?; + if offset.is_zero() { + write!(f, "{}", nested)?; + } else { + write!(f, "({}){}", nested, offset)?; + } + Ok(()) + } + Self::Cast(nested, offset, ty) => { + write!(f, "* ({}) {} as {}", nested, offset, ty) + } + } + } +} + +/// Used to distinguish between top-level global value operations +pub enum GlobalValueOp { + Symbol(Identifier, Offset), + Load(GlobalValueOpNested, Offset), + Cast(GlobalValueOpNested, Offset, Type), + IAddImm(u128, Type, GlobalValueOpNested), +} +impl fmt::Display for GlobalValueOp { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str("global.")?; + match self { + Self::Symbol(id, offset) => { + write!(f, "symbol @ {} {}", id, offset) + } + Self::Load(nested, offset) => { + f.write_str("load ")?; + if offset.is_zero() { + write!(f, "{}", nested)?; + } else { + write!(f, "({}) {}", nested, offset)?; + } + Ok(()) + } + // NOTE(review): unlike GlobalValueOpNested::Cast, this prints the type without the `as` keyword and reuses the `load` mnemonic - confirm against the grammar. + Self::Cast(nested, offset, ty) => { + write!(f, "load ({}) {} {}", nested, offset, ty) + } + Self::IAddImm(i, ty, nested) => { + write!(f, "iadd.{}.{} {}", i, ty, nested) + } + } + } +} + +/// Used to distinguish between the overflow
behaviors of arithmetic operations +pub enum Overflow { + Checked, + Unchecked, + Overflowing, + Wrapping, +} +impl fmt::Display for Overflow { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Checked => f.write_str("checked"), + Self::Unchecked => f.write_str("unchecked"), + Self::Overflowing => f.write_str("overflowing"), + Self::Wrapping => f.write_str("wrapping"), + } + } +} + +/// The destination of a branch/jump +pub struct Destination { + pub label: Label, + pub args: Vec, +} +impl Destination { + pub fn new(label: Label, args: Vec) -> Self { + Self { label, args } + } +} +impl fmt::Display for Destination { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.label)?; + if self.args.len() > 0 { + f.write_str(" (")?; + for (i, arg) in self.args.iter().enumerate() { + if i > 0 { + write!(f, ", {}", arg)?; + } else { + write!(f, "{}", arg)?; + } + } + f.write_str(")")?; + } + Ok(()) + } +} + +/// A branch of a switch operation +pub enum SwitchBranch { + Test(u128, Label), + Default(Label), +} +impl fmt::Display for SwitchBranch { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Test(test, label) => { + write!(f, "{} => {}", test, label) + } + Self::Default(label) => { + write!(f, "{}", label) + } + } + } +} diff --git a/hir-parser/src/ast/mod.rs b/hir-parser/src/ast/mod.rs new file mode 100644 index 000000000..35160179b --- /dev/null +++ b/hir-parser/src/ast/mod.rs @@ -0,0 +1,146 @@ +mod block; +mod functions; +mod globals; +mod instruction; +mod types; + +pub use self::block::*; +pub use self::functions::*; +pub use self::globals::*; +pub use self::instruction::*; +pub use self::types::*; + +use std::fmt; + +use miden_diagnostics::{SourceSpan, Span, Spanned}; + +use crate::Symbol; + +/// This represents a fully parsed Miden IR program.
+#[derive(Spanned)] +pub struct Program { + #[span] + pub span: SourceSpan, + /// The set of modules in the program + pub modules: Vec, + /// The name of the function that acts as the entry point for the program + pub entry_point: Option, + /// The global variables declared in this program + pub global_vars: Vec, +} +impl Program { + /// Creates a new [Program] without an entry point. + pub fn new(span: SourceSpan, modules: Vec, globals: Vec) -> Self { + Self { + span, + modules, + entry_point: None, + global_vars: globals, + } + } + + /// Sets the entry point of this program to `name`, replacing any previous value. + pub fn with_entry_point(&mut self, name: FunctionIdentifier) { + self.entry_point = Some(name) + } +} +impl fmt::Display for Program { + /// Writes every module, a blank line, the entry point (if any) and then every global, one per line. + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + for module in self.modules.iter() { + writeln!(f, "{}", module)?; + } + writeln!(f)?; + if let Some(entry_point) = self.entry_point.as_ref() { + writeln!(f, "{}", entry_point)?; + } + for global in self.global_vars.iter() { + writeln!(f, "{}", global)?; + } + Ok(()) + } +} + +/// This is a type alias used to clarify that an identifier refers to a module +pub type ModuleId = Identifier; + +#[derive(Copy, Clone, PartialEq, Eq)] +pub enum ModuleType { + /// Kernel context module + Kernel, + /// User context module + Module, +} +impl fmt::Display for ModuleType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Kernel => f.write_str("kernel"), + Self::Module => f.write_str("module"), + } + } +} + +/// This represents the parsed contents of a single Miden IR module +/// +#[derive(Spanned)] +pub struct Module { + #[span] + pub span: SourceSpan, + pub name: ModuleId, + pub ty: ModuleType, + pub functions: Vec, + pub externals: Vec, +} +impl Module { + /// Constructs a new module of the specified type, with the given span, name, functions and exports (externals).
+ /// + pub fn new( + span: SourceSpan, + ty: ModuleType, + name: ModuleId, + functions: Vec, + externals: Vec, + ) -> Self { + Self { + span, + name, + ty, + functions, + externals, + } + } +} +impl fmt::Display for Module { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + writeln!(f, "{} {}", self.ty, self.name)?; + for func in self.functions.iter() { + writeln!(f, "{}", func)?; + } + for ext in self.externals.iter() { + writeln!(f, "{};", ext)?; + } + Ok(()) + } +} + +/// Represents any type of identifier in Miden IR. +#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Spanned)] +pub struct Identifier(Span); +impl Identifier { + /// Creates a new identifier that wraps `name` together with its source `span`. + pub fn new(span: SourceSpan, name: Symbol) -> Self { + Self(Span::new(span, name)) + } + + /// Returns the underlying symbol of the identifier. + pub fn name(&self) -> Symbol { + self.0.item + } + + /// Returns the identifier as a string slice, as provided by the wrapped value. + #[inline] + pub fn as_str(&self) -> &str { + self.0.as_str() + } +} +impl fmt::Display for Identifier { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", &self.0) + } +} diff --git a/hir-parser/src/ast/types.rs b/hir-parser/src/ast/types.rs new file mode 100644 index 000000000..1c6d6d02c --- /dev/null +++ b/hir-parser/src/ast/types.rs @@ -0,0 +1,90 @@ +use super::*; + +/// The types of values which can be represented in a Miden IR program +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Type { + /// The singleton type + Unit, + /// The empty type + Never, + /// The type of a single bit, i.e., the boolean type + I1, + /// Signed 8-bit integers + I8, + /// Unsigned 8-bit integers + U8, + /// Signed 16-bit integers + I16, + /// Unsigned 16-bit integers + U16, + /// Signed 32-bit integers + I32, + /// Unsigned 32-bit integers + U32, + /// Signed 64-bit integers + I64, + /// Unsigned 64-bit integers + U64, + /// Signed 128-bit integers + I128, + /// Unsigned 128-bit integers + U128, + /// Unsigned 256-bit integers + U256, + /// Signed integers of size equal to the native architecture word size +
ISize, + /// Unsigned integers of size equal to the native architecture word size + USize, + /// 64-bit floats + F64, + /// Field elements + Felt, + /// Pointers to values of the inner type + Ptr(Box), + /// Native pointers to values of the inner type + NativePtr(Box), + /// Structs containing field values of the specified types in the specified order. + /// The empty struct is a legal type. + Struct(Vec), + /// Arrays of the specified length, containing values of the specified type. + Array(Box, u128), +} +impl fmt::Display for Type { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Unit => f.write_str("()"), + Self::Never => f.write_str("!"), + Self::I1 => f.write_str("i1"), + Self::I8 => f.write_str("i8"), + Self::U8 => f.write_str("u8"), + Self::I16 => f.write_str("i16"), + Self::U16 => f.write_str("u16"), + Self::I32 => f.write_str("i32"), + Self::U32 => f.write_str("u32"), + Self::I64 => f.write_str("i64"), + Self::U64 => f.write_str("u64"), + Self::I128 => f.write_str("i128"), + Self::U128 => f.write_str("u128"), + Self::U256 => f.write_str("u256"), + Self::ISize => f.write_str("isize"), + Self::USize => f.write_str("usize"), + Self::F64 => f.write_str("f64"), + Self::Felt => f.write_str("felt"), + Self::Ptr(inner) => write!(f, "*mut {}", inner), + Self::NativePtr(inner) => write!(f, "&mut {}", inner), + Self::Struct(types) => { + f.write_str("{ ")?; + for (i, t) in types.iter().enumerate() { + if i != 0 { + write!(f, ", {}", t)?; + } else { + write!(f, "{}", t)?; + } + } + // Close the brace exactly once, after all fields have been written; this also covers the empty struct. + f.write_str(" }") + } + Self::Array(inner, length) => write!(f, "[ {} ; {} ]", inner, length), + } + } +} diff --git a/hir-parser/src/lexer/mod.rs b/hir-parser/src/lexer/mod.rs new file mode 100644 index 000000000..aef532376 --- /dev/null +++ b/hir-parser/src/lexer/mod.rs @@ -0,0 +1,973 @@ +#[cfg(test)] +mod tests; + +use core::{fmt, mem, num::IntErrorKind}; + +use miden_diagnostics::{Diagnostic, SourceIndex, SourceSpan, ToDiagnostic}; +use
miden_parsing::{Scanner, Source}; + +use crate::{parser::ParseError, Symbol}; + +/// The value produced by the Lexer when iterated +pub type Lexed = Result<(SourceIndex, Token, SourceIndex), ParseError>; + +/// Errors that may occur during lexing of the source +#[derive(Clone, Debug, thiserror::Error)] +pub enum LexicalError { + #[error("invalid integer value: {}", DisplayIntErrorKind(reason))] + InvalidInt { + span: SourceSpan, + reason: IntErrorKind, + }, + #[error("encountered unexpected character '{found}'")] + UnexpectedCharacter { start: SourceIndex, found: char }, +} +impl PartialEq for LexicalError { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::InvalidInt { reason: lhs, .. }, Self::InvalidInt { reason: rhs, .. }) => { + lhs == rhs + } + ( + Self::UnexpectedCharacter { found: lhs, .. }, + Self::UnexpectedCharacter { found: rhs, .. }, + ) => lhs == rhs, + _ => false, + } + } +} +impl ToDiagnostic for LexicalError { + fn to_diagnostic(self) -> Diagnostic { + use miden_diagnostics::Label; + + match self { + Self::InvalidInt { span, ref reason } => Diagnostic::error() + .with_message("invalid integer literal") + .with_labels(vec![Label::primary(span.source_id(), span) + .with_message(format!("{}", DisplayIntErrorKind(reason)))]), + Self::UnexpectedCharacter { start, .. 
} => Diagnostic::error() + .with_message("unexpected character") + .with_labels(vec![Label::primary( + start.source_id(), + SourceSpan::new(start, start), + )]), + } + } +} + +/// Adapter that renders an [IntErrorKind] as a human-readable message. +struct DisplayIntErrorKind<'a>(&'a IntErrorKind); +impl<'a> fmt::Display for DisplayIntErrorKind<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.0 { + IntErrorKind::Empty => write!(f, "unable to parse empty string as integer"), + IntErrorKind::InvalidDigit => write!(f, "invalid digit"), + IntErrorKind::PosOverflow => write!(f, "value is too big"), + // NOTE(review): NegOverflow reuses the "too big" wording of PosOverflow - confirm this is intended. + IntErrorKind::NegOverflow => write!(f, "value is too big"), + IntErrorKind::Zero => write!(f, "zero is not a valid value here"), + other => write!(f, "unable to parse integer value: {:?}", other), + } + } +} + +#[derive(Debug, Clone)] +pub enum Token { + Eof, + Error(LexicalError), + Comment, + // PRIMITIVES + // -------------------------------------------------------------------------------------------- + /// Identifiers should start with an alphabetic character followed by one or more alphanumeric characters + /// or an underscore. + Ident(Symbol), + /// Integers should only contain numeric characters. + Num(u128), + /// Hex strings are used to initialize global variables + Hex(Vec), + + // DECLARATION KEYWORDS + // -------------------------------------------------------------------------------------------- + /// Used to declare kernel modules. Also used to declare a function with kernel calling convention. + Kernel, + /// Used to declare normal modules. + Module, + /// Used to declare a global variable with internal linkage. + Internal, + /// Used to declare a global variable with "one definition rule" linkage. + Odr, + /// Used to declare a global variable with external linkage. + External, + /// Keyword to declare that a function is publicly visible. + Pub, + /// Keyword to introduce a function declaration. + Fn, + /// Keyword to declare a function's calling convention.
+ Cc, + /// Keyword to declare that a function uses fast calling convention. + Fast, + /// Keyword to declare that a function parameter is a struct return + Sret, + /// Keyword to declare that a function parameter is extended with 0s when filling up a word + /// Also used as an operation to pad a value with 0s. + Zext, + /// Keyword to declare that a function parameter is extended with sign bits when filling up a word + /// Also used as an operation to pad a value with sign bits. + Sext, + + // OPERATION KEYWORDS + // -------------------------------------------------------------------------------------------- + /// Keyword to return from a function + Ret, + /// Keyword to call a function in user space + Call, + /// Keyword to call a function in kernel space + SysCall, + /// Keyword to perform a conditional jump + Cond, + /// Keyword to perform an unconditional jump + Branch, + /// Keyword to perform a multi-branch conditional jump + Switch, + /// Keyword to test whether a value has a specific type + Test, + /// Keyword to load a value from memory. 
Also used for the load operation on globals + Load, + /// Keyword to copy data from one memory location to another + MemCpy, + /// Keyword to indicate a sequence of assembly instructions + InlineAsm, + /// Keyword to indicate a memory management operation + Memory, + /// Keyword to indicate that the currently assigned amount of memory should grow + Grow, + /// Keyword to perform an addition + Add, + /// Keyword to perform a subtraction + Sub, + /// Keyword to perform a multiplication + Mul, + /// Keyword to perform a division + Div, + /// Keyword to determine a minimum value + Min, + /// Keyword to determine a maximum value + Max, + /// Keyword to perform a modulo + Mod, + /// Keyword to perform a division modulo 2^32 + DivMod, + /// Keyword to perform an exponentiation + Exp, + /// Keyword to perform a boolean and + And, + /// Keyword to perform a bitwise and + BAnd, + /// Keyword to perform a boolean or + Or, + /// Keyword to perform a bitwise or + BOr, + /// Keyword to perform a boolean xor + Xor, + /// Keyword to perform a bitwise xor + BXor, + /// Keyword to perform a left shift + Shl, + /// Keyword to perform a right shift + Shr, + /// Keyword to perform a left rotation + Rotl, + /// Keyword to perform a right rotation + Rotr, + /// Keyword to test for equality + Eq, + /// Keyword to test for inequality + Neq, + /// Keyword to test for greater-than + Gt, + /// Keyword to test for greater-than-or-equal + Gte, + /// Keyword to test for less-than + Lt, + /// Keyword to test for less-than-or-equal + Lte, + /// Keyword to perform a store + Store, + /// Keyword to perform an addition with an immediate parameter + AddImm, + /// Keyword to perform a subtraction with an immediate parameter + SubImm, + /// Keyword to perform a multiplication with an immediate parameter + MulImm, + /// Keyword to perform a division with an immediate parameter + DivImm, + /// Keyword to determine a minimum with an immediate parameter + MinImm, + /// Keyword to determine a maximum with an 
immediate parameter + MaxImm, + /// Keyword to perform a modulo with an immediate parameter + ModImm, + /// Keyword to perform a division modulo 2^32 with an immediate parameter + DivModImm, + /// Keyword to perform an exponentiation with an immediate parameter + ExpImm, + /// Keyword to perform a boolean and with an immediate parameter + AndImm, + /// Keyword to perform a bitwise and with an immediate parameter + BAndImm, + /// Keyword to perform a boolean or with an immediate parameter + OrImm, + /// Keyword to perform a bitwise or with an immediate parameter + BOrImm, + /// Keyword to perform a boolean xor with an immediate parameter + XorImm, + /// Keyword to perform a bitwise xor with an immediate parameter + BXorImm, + /// Keyword to perform a left shift with an immediate parameter + ShlImm, + /// Keyword to perform a right shift with an immediate parameter + ShrImm, + /// Keyword to perform a left rotation with an immediate parameter + RotlImm, + /// Keyword to perform a right rotation with an immediate parameter + RotrImm, + /// Keyword to perform an inversion within the field + Inv, + /// Keyword to perform an increment + Incr, + /// Keyword to perform a power-of-2 operation + Pow2, + /// Keyword to perform a boolean negation + Not, + /// Keyword to perform a bitwise negation + BNot, + /// Keyword to count the number of set bits in a value + PopCnt, + /// Keyword to check if a value is odd + IsOdd, + /// Keyword to perform a type cast + Cast, + /// Keyword to cast a pointer to an integer + PtrToInt, + /// Keyword to cast an integer to a pointer + IntToPtr, + /// Keyword to truncate a word + TruncW, + /// Keyword to perform a numerical negation + Neg, + /// Keyword to indicate an immediate unary operation (used alongside the type of the immediate) + Const, + /// Keyword to access elements in a struct + Select, + /// Keyword to perform an assertion + Assert, + /// Keyword to perform a 0-check assertion + Assertz, + /// Keyword to allocate an array + Alloca,
+ /// Keyword to indicate an unreachable part of the code + Unreachable, + /// Keyword used to indicate a global variable operation + Global, + /// Keyword used for type casts in global variable operations + As, + /// Keyword used to indicate the symbol global variable operation + Symbol, + /// Keyword used to indicate the iadd global variable operation + IAdd, + /// Keyword to indicate an unchecked arithmetic operation + Unchecked, + /// Keyword to indicate a checked arithmetic operation + Checked, + /// Keyword to indicate a wrapping arithmetic operation + Wrapping, + /// Keyword to indicate an overflowing arithmetic operation + Overflowing, + + // TYPES + // -------------------------------------------------------------------------------------------- + I1, + I8, + U8, + I16, + U16, + I32, + U32, + I64, + U64, + I128, + U128, + U256, + ISize, + USize, + F64, + Felt, + Mut, + + // PUNCTUATION + // -------------------------------------------------------------------------------------------- + DoubleQuote, + Colon, + ColonColon, + Semicolon, + Comma, + Dot, + LParen, + RParen, + LBracket, + RBracket, + LBrace, + RBrace, + Equal, + RDoubleArrow, + Plus, + Minus, + RArrow, + Star, + Ampersand, + Bang, + At, +} +impl Token { + /// Maps `s` to its keyword token; any other string is interned and returned as an identifier token. + pub fn from_keyword_or_ident(s: &str) -> Self { + match s { + "kernel" => Self::Kernel, + "module" => Self::Module, + "internal" => Self::Internal, + "odr" => Self::Odr, + "external" => Self::External, + "pub" => Self::Pub, + "fn" => Self::Fn, + "cc" => Self::Cc, + "fast" => Self::Fast, + "sret" => Self::Sret, + "zext" => Self::Zext, + "sext" => Self::Sext, + "ret" => Self::Ret, + "call" => Self::Call, + "syscall" => Self::SysCall, + "cond" => Self::Cond, + "branch" => Self::Branch, + "switch" => Self::Switch, + "test" => Self::Test, + "load" => Self::Load, + "memcpy" => Self::MemCpy, + "inlineasm" => Self::InlineAsm, + "memory" => Self::Memory, + "grow" => Self::Grow, + "add" => Self::Add, + "sub" => Self::Sub, + "mul" => Self::Mul, + "div" => Self::Div, + "min" =>
Self::Min, + "max" => Self::Max, + "mod" => Self::Mod, + "divmod" => Self::DivMod, + "exp" => Self::Exp, + "and" => Self::And, + "band" => Self::BAnd, + "or" => Self::Or, + "bor" => Self::BOr, + "xor" => Self::Xor, + "bxor" => Self::BXor, + "shl" => Self::Shl, + "shr" => Self::Shr, + "rotl" => Self::Rotl, + "rotr" => Self::Rotr, + "eq" => Self::Eq, + "neq" => Self::Neq, + "gt" => Self::Gt, + "gte" => Self::Gte, + "lt" => Self::Lt, + "lte" => Self::Lte, + "store" => Self::Store, + "add_imm" => Self::AddImm, + "sub_imm" => Self::SubImm, + "mul_imm" => Self::MulImm, + "div_imm" => Self::DivImm, + "min_imm" => Self::MinImm, + "max_imm" => Self::MaxImm, + "mod_imm" => Self::ModImm, + "divmod_imm" => Self::DivModImm, + "exp_imm" => Self::ExpImm, + "and_imm" => Self::AndImm, + "band_imm" => Self::BAndImm, + "or_imm" => Self::OrImm, + "bor_imm" => Self::BOrImm, + "xor_imm" => Self::XorImm, + "bxor_imm" => Self::BXorImm, + "shl_imm" => Self::ShlImm, + "shr_imm" => Self::ShrImm, + "rotl_imm" => Self::RotlImm, + "rotr_imm" => Self::RotrImm, + "inv" => Self::Inv, + "incr" => Self::Incr, + "pow2" => Self::Pow2, + "not" => Self::Not, + "bnot" => Self::BNot, + "popcnt" => Self::PopCnt, + "is_odd" => Self::IsOdd, + "cast" => Self::Cast, + "ptrtoint" => Self::PtrToInt, + "inttoptr" => Self::IntToPtr, + "truncw" => Self::TruncW, + "neg" => Self::Neg, + "const" => Self::Const, + "select" => Self::Select, + "assert" => Self::Assert, + "assertz" => Self::Assertz, + "alloca" => Self::Alloca, + "unreachable" => Self::Unreachable, + "as" => Self::As, + "global" => Self::Global, + "symbol" => Self::Symbol, + "iadd" => Self::IAdd, + "unchecked" => Self::Unchecked, + "checked" => Self::Checked, + "wrapping" => Self::Wrapping, + "overflowing" => Self::Overflowing, + "i1" => Self::I1, + "i8" => Self::I8, + "u8" => Self::U8, + "i16" => Self::I16, + "u16" => Self::U16, + "i32" => Self::I32, + "u32" => Self::U32, + "i64" => Self::I64, + "u64" => Self::U64, + "i128" => Self::I128, + "u128" =>
Self::U128, + "u256" => Self::U256, + "isize" => Self::ISize, + "usize" => Self::USize, + "f64" => Self::F64, + "felt" => Self::Felt, + "mut" => Self::Mut, + other => Self::Ident(Symbol::intern(other)), + } + } +} +impl Eq for Token {} +impl PartialEq for Token { + /// Tokens compare by kind; `Num`, `Hex` and `Ident` additionally compare their payload. + fn eq(&self, other: &Token) -> bool { + match (self, other) { + (Self::Num(lhs), Self::Num(rhs)) => lhs == rhs, + (Self::Hex(lhs), Self::Hex(rhs)) => lhs == rhs, + (Self::Ident(lhs), Self::Ident(rhs)) => lhs == rhs, + // Any two error tokens are equal, whatever error they carry (unchanged behavior). + (Self::Error(_), Self::Error(_)) => true, + _ => mem::discriminant(self) == mem::discriminant(other), + } + } +} +impl fmt::Display for Token { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Eof => write!(f, "EOF"), + Self::Error(_) => write!(f, "ERROR"), + Self::Comment => write!(f, "COMMENT"), + Self::Ident(ref id) => write!(f, "{}", id), + Self::Num(ref i) => write!(f, "{}", i), + Self::Hex(ref data) => { + write!(f, "0x")?; + for i in data.iter().rev() { + write!(f, "{:02x}", i)?; + } + Ok(()) + } + Self::Kernel => write!(f, "kernel"), + Self::Module => write!(f, "module"), + Self::Internal => write!(f, "internal"), + Self::Odr => write!(f, "odr"), + Self::External => write!(f, "external"), + Self::Pub => write!(f, "pub"), + Self::Fn => write!(f, "fn"), + Self::Cc => write!(f, "cc"), + Self::Fast => write!(f, "fast"), + Self::Sret => write!(f, "sret"), + Self::Zext => write!(f, "zext"), + Self::Sext => write!(f, "sext"), + Self::Ret => write!(f, "ret"), + Self::Call => write!(f, "call"), + Self::SysCall => write!(f, "syscall"), + Self::Cond => write!(f, "cond"), + Self::Branch => write!(f, "branch"), + Self::Switch => write!(f, "switch"), + Self::Test => write!(f, "test"), + Self::Load => write!(f, "load"), + Self::MemCpy => write!(f, "memcpy"), + Self::InlineAsm => write!(f, "inlineasm"), + Self::Memory => write!(f, "memory"), + Self::Grow => write!(f, "grow"), + Self::Add => write!(f, "add"), +
Self::Sub => write!(f, "sub"), + Self::Mul => write!(f, "mul"), + Self::Div => write!(f, "div"), + Self::Min => write!(f, "min"), + Self::Max => write!(f, "max"), + Self::Mod => write!(f, "mod"), + Self::DivMod => write!(f, "divmod"), + Self::Exp => write!(f, "exp"), + Self::And => write!(f, "and"), + Self::BAnd => write!(f, "band"), + Self::Or => write!(f, "or"), + Self::BOr => write!(f, "bor"), + Self::Xor => write!(f, "xor"), + Self::BXor => write!(f, "bxor"), + Self::Shl => write!(f, "shl"), + Self::Shr => write!(f, "shr"), + Self::Rotl => write!(f, "rotl"), + Self::Rotr => write!(f, "rotr"), + Self::Eq => write!(f, "eq"), + Self::Neq => write!(f, "neq"), + Self::Gt => write!(f, "gt"), + Self::Gte => write!(f, "gte"), + Self::Lt => write!(f, "lt"), + Self::Lte => write!(f, "lte"), + Self::Store => write!(f, "store"), + Self::AddImm => write!(f, "add_imm"), + Self::SubImm => write!(f, "sub_imm"), + Self::MulImm => write!(f, "mul_imm"), + Self::DivImm => write!(f, "div_imm"), + Self::MinImm => write!(f, "min_imm"), + Self::MaxImm => write!(f, "max_imm"), + Self::ModImm => write!(f, "mod_imm"), + Self::DivModImm => write!(f, "divmod_imm"), + Self::ExpImm => write!(f, "exp_imm"), + Self::AndImm => write!(f, "and_imm"), + Self::BAndImm => write!(f, "band_imm"), + Self::OrImm => write!(f, "or_imm"), + Self::BOrImm => write!(f, "bor_imm"), + Self::XorImm => write!(f, "xor_imm"), + Self::BXorImm => write!(f, "bxor_imm"), + Self::ShlImm => write!(f, "shl_imm"), + Self::ShrImm => write!(f, "shr_imm"), + Self::RotlImm => write!(f, "rotl_imm"), + Self::RotrImm => write!(f, "rotr_imm"), + Self::Inv => write!(f, "inv"), + Self::Incr => write!(f, "incr"), + Self::Pow2 => write!(f, "pow2"), + Self::Not => write!(f, "not"), + Self::BNot => write!(f, "bnot"), + Self::PopCnt => write!(f, "popcnt"), + Self::IsOdd => write!(f, "is_odd"), + Self::Cast => write!(f, "cast"), + Self::PtrToInt => write!(f, "ptrtoint"), + Self::IntToPtr => write!(f, "inttoptr"), + Self::TruncW => 
write!(f, "truncw"), + Self::Neg => write!(f, "neg"), + Self::Const => write!(f, "const"), + Self::Select => write!(f, "select"), + Self::Assert => write!(f, "assert"), + Self::Assertz => write!(f, "assertz"), + Self::Alloca => write!(f, "alloca"), + Self::Unreachable => write!(f, "unreachable"), + Self::Global => write!(f, "global"), + Self::As => write!(f, "as"), + Self::Symbol => write!(f, "symbol"), + Self::IAdd => write!(f, "iadd"), + Self::Unchecked => write!(f, "unchecked"), + Self::Checked => write!(f, "checked"), + Self::Wrapping => write!(f, "wrapping"), + Self::Overflowing => write!(f, "overflowing"), + Self::I1 => write!(f, "i1"), + Self::I8 => write!(f, "i8"), + Self::U8 => write!(f, "u8"), + Self::I16 => write!(f, "i16"), + Self::U16 => write!(f, "u16"), + Self::I32 => write!(f, "i32"), + Self::U32 => write!(f, "u32"), + Self::I64 => write!(f, "i64"), + Self::U64 => write!(f, "u64"), + Self::I128 => write!(f, "i128"), + Self::U128 => write!(f, "u128"), + Self::U256 => write!(f, "u256"), + Self::ISize => write!(f, "isize"), + Self::USize => write!(f, "usize"), + Self::F64 => write!(f, "f64"), + Self::Felt => write!(f, "felt"), + Self::Mut => write!(f, "mut"), + Self::DoubleQuote => write!(f, "\""), + Self::Colon => write!(f, ":"), + Self::ColonColon => write!(f, "::"), + Self::Semicolon => write!(f, ";"), + Self::Comma => write!(f, ","), + Self::Dot => write!(f, "."), + Self::LParen => write!(f, "("), + Self::RParen => write!(f, ")"), + Self::LBracket => write!(f, "["), + Self::RBracket => write!(f, "]"), + Self::LBrace => write!(f, "{{"), + Self::RBrace => write!(f, "}}"), + Self::Equal => write!(f, "="), + Self::RDoubleArrow => write!(f, "=>"), + Self::Plus => write!(f, "+"), + Self::Minus => write!(f, "-"), + Self::RArrow => write!(f, "->"), + Self::Star => write!(f, "*"), + Self::Ampersand => write!(f, "&"), + Self::Bang => write!(f, "!"), + Self::At => write!(f, "@"), + } + } +} + +macro_rules! 
pop { + ($lex:ident) => {{ + $lex.skip(); + }}; + ($lex:ident, $code:expr) => {{ + $lex.skip(); + $code + }}; +} + +macro_rules! pop2 { + ($lex:ident) => {{ + $lex.skip(); + $lex.skip(); + }}; + ($lex:ident, $code:expr) => {{ + $lex.skip(); + $lex.skip(); + $code + }}; +} + +/// The lexer that is used to perform lexical analysis on the Miden IR grammar. The lexer implements +/// the `Iterator` trait, so in order to retrieve the tokens, you simply have to iterate over it. +/// +/// # Errors +/// +/// Because the lexer is implemented as an iterator over tokens, this means that you can continue +/// to get tokens even if a lexical error occurs. The lexer will attempt to recover from an error +/// by injecting tokens it expects. +/// +/// If an error is unrecoverable, the lexer will continue to produce tokens, but there is no +/// guarantee that parsing them will produce meaningful results, it is primarily to assist in +/// gathering as many errors as possible. +pub struct Lexer { + /// The scanner produces a sequence of chars + location, and can be controlled + /// The location type is SourceIndex + scanner: Scanner, + + /// The most recent token to be lexed. + /// At the start and end, this should be Token::Eof + token: Token, + + /// The position in the input where the current token starts + /// At the start this will be the byte index of the beginning of the input + token_start: SourceIndex, + + /// The position in the input where the current token ends + /// At the start this will be the byte index of the beginning of the input + token_end: SourceIndex, + + /// When we have reached true Eof, this gets set to true, and the only token + /// produced after that point is Token::Eof, or None, depending on how you are + /// consuming the lexer + eof: bool, +} +impl Lexer +where + S: Source, +{ + /// Produces an instance of the lexer with the lexical analysis to be performed on the `input` + /// string. 
Note that no lexical analysis occurs until the lexer has been iterated over. + pub fn new(scanner: Scanner) -> Self { + use miden_diagnostics::ByteOffset; + + let start = scanner.start(); + let mut lexer = Lexer { + scanner, + token: Token::Eof, + token_start: start + ByteOffset(0), + token_end: start + ByteOffset(0), + eof: false, + }; + lexer.advance(); + lexer + } + + pub fn lex(&mut self) -> Option<::Item> { + if self.eof && self.token == Token::Eof { + return None; + } + + let token = std::mem::replace(&mut self.token, Token::Eof); + let start = self.token_start; + let end = self.token_end; + self.advance(); + match token { + Token::Error(err) => Some(Err(err.into())), + token => Some(Ok((start, token, end))), + } + } + + fn advance(&mut self) { + self.advance_start(); + self.token = self.tokenize(); + } + + #[inline] + fn advance_start(&mut self) { + let mut position: SourceIndex; + loop { + let (pos, c) = self.scanner.read(); + + position = pos; + + if c == '\0' { + self.eof = true; + return; + } + + if c.is_whitespace() { + self.scanner.advance(); + continue; + } + + break; + } + + self.token_start = position; + } + + #[inline] + fn pop(&mut self) -> char { + use miden_diagnostics::ByteOffset; + + let (pos, c) = self.scanner.pop(); + self.token_end = pos + ByteOffset::from_char_len(c); + c + } + + #[inline] + fn peek(&mut self) -> char { + let (_, c) = self.scanner.peek(); + c + } + + #[inline] + fn read(&mut self) -> char { + let (_, c) = self.scanner.read(); + c + } + + #[inline] + fn skip(&mut self) { + self.pop(); + } + + /// Get the span for the current token in `Source`. + #[inline] + fn span(&self) -> SourceSpan { + SourceSpan::new(self.token_start, self.token_end) + } + + /// Get a string slice of the current token. 
+    #[inline]
+    fn slice(&self) -> &str {
+        self.scanner.slice(self.span())
+    }
+
+    /// Consumes characters until the current character is not whitespace.
+    #[inline]
+    fn skip_whitespace(&mut self) {
+        while self.read().is_whitespace() {
+            self.skip();
+        }
+    }
+
+    /// Consumes the current character and reports it as unexpected.
+    ///
+    /// The character is consumed so that a caller which keeps iterating after a lexical
+    /// error makes forward progress instead of receiving the same error repeatedly.
+    fn unexpected_character(&mut self) -> Token {
+        // Capture the start before popping, so the reported location is that of the
+        // token being lexed.
+        let start = self.span().start();
+        let found = self.pop();
+        Token::Error(LexicalError::UnexpectedCharacter { start, found })
+    }
+
+    /// Lexes the token that starts at the scanner's current character.
+    ///
+    /// Returns `Token::Eof` (and sets `self.eof`) when the input is exhausted,
+    /// `Token::Comment` for a comment, and `Token::Error` for any character that cannot
+    /// start a token.
+    fn tokenize(&mut self) -> Token {
+        let c = self.read();
+
+        // A comment is introduced by two consecutive backslashes and runs to end of line.
+        // NOTE(review): `'\\'` is a single backslash character; confirm that `\\` rather
+        // than `//` is the intended comment introducer.
+        if c == '\\' {
+            // The result must be returned. It used to be evaluated and then discarded, so
+            // comments never yielded `Token::Comment` and lexing fell through to the
+            // catch-all error arm below.
+            return match self.peek() {
+                '\\' => {
+                    self.skip();
+                    self.skip();
+                    self.lex_comment()
+                }
+                _ => self.unexpected_character(),
+            };
+        }
+
+        if c == '\0' {
+            self.eof = true;
+            return Token::Eof;
+        }
+
+        if c.is_whitespace() {
+            self.skip_whitespace();
+        }
+
+        match self.read() {
+            // Only reachable when the whitespace skipped above ran up to the end of input.
+            '\0' => {
+                self.eof = true;
+                Token::Eof
+            }
+            ',' => pop!(self, Token::Comma),
+            '.' => pop!(self, Token::Dot),
+            ':' => match self.peek() {
+                ':' => pop2!(self, Token::ColonColon),
+                _ => pop!(self, Token::Colon),
+            },
+            // `Token::Semicolon` is used by the grammar but previously had no arm here.
+            ';' => pop!(self, Token::Semicolon),
+            '"' => pop!(self, Token::DoubleQuote),
+            '(' => pop!(self, Token::LParen),
+            ')' => pop!(self, Token::RParen),
+            '[' => pop!(self, Token::LBracket),
+            ']' => pop!(self, Token::RBracket),
+            '{' => pop!(self, Token::LBrace),
+            '}' => pop!(self, Token::RBrace),
+            '=' => match self.peek() {
+                '>' => pop2!(self, Token::RDoubleArrow),
+                _ => pop!(self, Token::Equal),
+            },
+            '+' => pop!(self, Token::Plus),
+            '-' => match self.peek() {
+                '>' => pop2!(self, Token::RArrow),
+                _ => pop!(self, Token::Minus),
+            },
+            '*' => pop!(self, Token::Star),
+            '&' => pop!(self, Token::Ampersand),
+            '!' => pop!(self, Token::Bang),
+            '@' => pop!(self, Token::At),
+            '0' => match self.peek() {
+                'x' => {
+                    self.skip();
+                    self.skip();
+                    self.lex_hex()
+                }
+                // A lone `0` (e.g. `0,` or `0)`) is a valid number; it used to be rejected
+                // because only `0x...` and `0<digit>` were accepted.
+                _ => self.lex_number(),
+            },
+            '1'..='9' => self.lex_number(),
+            'a'..='z' => self.lex_keyword_or_ident(),
+            'A'..='Z' => self.lex_identifier(),
+            _ => self.unexpected_character(),
+        }
+    }
+
+    /// Skips to the end of the current line (or the end of input) and yields
+    /// `Token::Comment`. The terminating newline is not consumed.
+    fn lex_comment(&mut self) -> Token {
+        let mut c;
+        loop {
+            c = self.read();
+
+            if c == '\n' {
+                break;
+            }
+
+            if c == '\0' {
+                self.eof = true;
+                break;
+            }
+
+            self.skip();
+        }
+
+        Token::Comment
+    }
+
+    /// Lexes a word starting with a lowercase ASCII letter, which may be a keyword.
+    #[inline]
+    fn lex_keyword_or_ident(&mut self) -> Token {
+        let c = self.pop();
+        debug_assert!(c.is_ascii_alphabetic() && c.is_lowercase());
+
+        self.skip_ident();
+
+        Token::from_keyword_or_ident(self.slice())
+    }
+
+    /// Lexes an identifier starting with an uppercase ASCII letter.
+    #[inline]
+    fn lex_identifier(&mut self) -> Token {
+        let c = self.pop();
+        debug_assert!(c.is_ascii_alphabetic());
+
+        self.skip_ident();
+        Token::Ident(Symbol::intern(self.slice()))
+    }
+
+    /// Consumes the remaining identifier characters: `_`, ASCII digits and ASCII letters.
+    fn skip_ident(&mut self) {
+        loop {
+            match self.read() {
+                '_' => self.skip(),
+                '0'..='9' => self.skip(),
+                c if c.is_ascii_alphabetic() => self.skip(),
+                _ => break,
+            }
+        }
+    }
+
+    /// Lexes a run of decimal digits into `Token::Num`, or `Token::Error` if the value does
+    /// not fit the integer type.
+    #[inline]
+    fn lex_number(&mut self) -> Token {
+        let mut num = String::new();
+
+        // Expect the first character to be a digit
+        debug_assert!(self.read().is_ascii_digit());
+
+        while let '0'..='9' = self.read() {
+            num.push(self.pop());
+        }
+
+        match num.parse::<u128>() {
+            Ok(i) => Token::Num(i),
+            Err(err) => Token::Error(LexicalError::InvalidInt {
+                span: self.span(),
+                reason: err.kind().clone(),
+            }),
+        }
+    }
+
+    /// Lexes the hex digits that follow a `0x` prefix into `Token::Hex`.
+    ///
+    /// NOTE(review): each digit is stored as its ASCII byte, not as a decoded nibble or
+    /// byte value; confirm this is what consumers of `Token::Hex` expect.
+    #[inline]
+    fn lex_hex(&mut self) -> Token {
+        let mut res: Vec<u8> = Vec::new();
+
+        loop {
+            match self.read() {
+                '0'..='9' | 'a'..='f' | 'A'..='F' => {
+                    res.push(self.pop() as u8);
+                }
+                _ => {
+                    break;
+                }
+            }
+        }
+
+        Token::Hex(res)
+    }
+}
+
+impl
Iterator for Lexer +where + S: Source, +{ + type Item = Lexed; + + fn next(&mut self) -> Option { + let mut res = self.lex(); + while let Some(Ok((_, Token::Comment, _))) = res { + res = self.lex(); + } + res + } +} diff --git a/hir-parser/src/lib.rs b/hir-parser/src/lib.rs new file mode 100644 index 000000000..a78424e14 --- /dev/null +++ b/hir-parser/src/lib.rs @@ -0,0 +1,107 @@ +#[macro_use] +extern crate lalrpop_util; + +pub mod ast; +mod lexer; +mod parser; +pub mod symbols; + +pub use self::parser::{ParseError, Parser}; +pub use self::symbols::Symbol; + +use std::path::Path; +use std::sync::Arc; + +use miden_diagnostics::{CodeMap, DiagnosticsHandler}; + +/// Parses the provided source and returns the AST. +pub fn parse( + diagnostics: &DiagnosticsHandler, + codemap: Arc, + source: &str, +) -> Result { + let parser = Parser::new((), codemap); + match parser.parse_string::(diagnostics, source) { + Ok(ast) => Ok(ast), + Err(ParseError::Lexer(err)) => { + diagnostics.emit(err); + Err(ParseError::Failed) + } + Err(err) => Err(err), + } +} + +/// Parses the provided source and returns the AST. +pub fn parse_file>( + diagnostics: &DiagnosticsHandler, + codemap: Arc, + source: P, +) -> Result { + let parser = Parser::new((), codemap); + match parser.parse_file::(diagnostics, source) { + Ok(ast) => Ok(ast), + Err(ParseError::Lexer(err)) => { + diagnostics.emit(err); + Err(ParseError::Failed) + } + Err(err) => Err(err), + } +} + +/// Parses the provided source string with a default [CodeMap] and [DiagnosticsHandler]. 
+/// +/// This is primarily provided for use in tests, you should generally prefer [parse] +pub fn parse_str(source: &str) -> Result { + use miden_diagnostics::{ + term::termcolor::ColorChoice, DefaultEmitter, DiagnosticsConfig, Verbosity, + }; + + let codemap = Arc::new(CodeMap::new()); + let emitter = Arc::new(DefaultEmitter::new(ColorChoice::Auto)); + let config = DiagnosticsConfig { + verbosity: Verbosity::Warning, + warnings_as_errors: true, + no_warn: false, + display: Default::default(), + }; + let diagnostics = DiagnosticsHandler::new(config, codemap.clone(), emitter); + parse(&diagnostics, codemap, source) +} + +/// Parses a [Module] from the given path. +/// +/// This is primarily intended for use in the import resolution phase. +pub(crate) fn parse_module_from_file>( + diagnostics: &DiagnosticsHandler, + codemap: Arc, + path: P, +) -> Result { + let parser = Parser::new((), codemap); + match parser.parse_file::(diagnostics, path) { + ok @ Ok(_) => ok, + Err(ParseError::Lexer(err)) => { + diagnostics.emit(err); + Err(ParseError::Failed) + } + err @ Err(_) => err, + } +} + +/// Parses a [Module] from a file already in the codemap +/// +/// This is primarily intended for use in the import resolution phase. 
+pub(crate) fn parse_module( + diagnostics: &DiagnosticsHandler, + codemap: Arc, + source: Arc, +) -> Result { + let parser = Parser::new((), codemap); + match parser.parse::(diagnostics, source) { + ok @ Ok(_) => ok, + Err(ParseError::Lexer(err)) => { + diagnostics.emit(err); + Err(ParseError::Failed) + } + err @ Err(_) => err, + } +} diff --git a/hir-parser/src/parser/grammar.lalrpop b/hir-parser/src/parser/grammar.lalrpop new file mode 100644 index 000000000..c1f50bb03 --- /dev/null +++ b/hir-parser/src/parser/grammar.lalrpop @@ -0,0 +1,757 @@ +use std::sync::Arc; + +use miden_diagnostics::{CodeMap, DiagnosticsHandler}; + +use crate::{ + ast::*, + lexer::Token, + parser::ParseError, + Symbol +}; + +grammar(diagnostics: &DiagnosticsHandler, codemap: &Arc, next_var: &mut usize); + +// MACROS +// ================================================================================================ + +// Comma-delimited with at least one element +Comma: Vec = { + ",")*> => { + let mut v = v; + v.push(e); + v + } +}; + +// ColonColon-delimited with at least one element +ColonColon: Vec = { + "::")*> => { + let mut v = v; + v.push(e); + v + } +}; + +// AST NODE +// ================================================================================================ + +pub Program: Program = { + => { + let mut p = Program::new(span!(l, r), modules, globals); + if let Some(name) = entry { + p.with_entry_point(name); + } + p + } +} + +pub Module: Module = { + "kernel" => { + Module::new(span!(l, r), ModuleType::Kernel, name, functions, externals) + }, + "module" => { + Module::new(span!(l, r), ModuleType::Module, name, functions, externals) + }, +} + +// GLOBALS +// ================================================================================================ + +GlobalVarDeclaration: GlobalVarDeclaration = { + => { + let mut v = GlobalVarDeclaration::new(span!(l, r), name, ty, linkage); + if let Some(e) = init { + v.with_init(e); + } + v + } +} + +Linkage: Linkage = { + 
"internal" => Linkage::Internal, + "odr" => Linkage::Odr, + "external" => Linkage::Internal, +} + +GlobalVarInitializer: Constant = { + "=" => { + Constant::new(e) + } +} + +// TYPES +// ============================================================================================== + +Type: Type = { + "(" ")" => Type::Unit, + "!" => Type::Never, + "i1" => Type::I1, + "i8" => Type::I8, + "u8" => Type::U8, + "i16" => Type::I16, + "u16" => Type::U16, + "i32" => Type::I32, + "u32" => Type::U32, + "i64" => Type::I64, + "u64" => Type::U64, + "i128" => Type::I128, + "u128" => Type::U128, + "u256" => Type::U256, + "isize" => Type::ISize, + "usize" => Type::USize, + "f64" => Type::F64, + "felt" => Type::Felt, + "*" "mut" => Type::Ptr(Box::new(inner)), + "&" "mut" => Type::NativePtr(Box::new(inner)), + "{" > "}" => Type::Struct(field_types), + "{" "}" => Type::Struct(Vec::new()), + "[" ";" "]" => Type::Array(Box::new(inner), length), +} + +// FUNCTIONS +// ============================================================================================== + +ExternalFunction: FunctionSignature = { + ";" => signature +} + +CallConvention: CallConvention = { + "cc" "(" "fast" ")" => CallConvention::Fast, + "cc" "(" "kernel" ")" => CallConvention::Kernel, +} + +ParamPurpose: ParameterPurpose = { + => ParameterPurpose::Standard, + "sret" => ParameterPurpose::Sret +} + +ParamExtension: ParameterExtension = { + "zext" => ParameterExtension::Zero, + "sext" => ParameterExtension::Signed, +} + +FunctionReturn: FunctionReturn = { + => { + let ext = if let Some(e) = extension { e } else { ParameterExtension::None }; + FunctionReturn::new(ext, ty) + } +} + +FunctionReturnSignature: Vec = { + "->" > => returns +} + +FunctionParam: FunctionParameter = { + => { + let ext = if let Some(e) = extension { e } else { ParameterExtension::None }; + FunctionParameter::new(purpose, ext, ty) + } +} + +FunctionParams: Vec = { + "(" ")" => Vec::new(), + "(" > ")" => params, +} + +FunctionSignature: 
FunctionSignature = { + "pub" "fn" => { + let cc = if let Some(cc) = call_convention { cc } else { CallConvention::Default }; + let ret = if let Some(r) = returns { r } else { Vec::new() }; + FunctionSignature::new(span!(l, r), Visibility::Public, cc, name, params, ret) + }, + "fn" => { + let cc = if let Some(cc) = call_convention { cc } else { CallConvention::Default }; + let ret = if let Some(r) = returns { r } else { Vec::new() }; + FunctionSignature::new(span!(l, r), Visibility::Private, cc, name, params, ret) + }, +} + +FunctionDeclaration: FunctionDeclaration = { + "{" "}" => { + FunctionDeclaration::new(span!(l, r), signature, blocks) + } +} + +// BLOCKS +// ================================================================================================ + +Label: Label = { + => { + Label::new(id) + } +} + +BlockArg: BlockArgument = { + ":" => { + BlockArgument::new(value, ty) + } +} + +BlockArgs: Vec = { + "(" > ")" => args, +} + +BlockHeader: BlockHeader = { + ":" => { + let a = if let Some(args) = arguments { args } else { Vec::new() }; + BlockHeader::new(label, a) + }, +} + +Block: Block = { + "{" "}" => { + Block::new(span!(l, r), header, instructions) + } +} + +// INSTRUCTIONS +// ================================================================================================ + +Overflow: Overflow = { + "." "unchecked" => Overflow::Unchecked, + "." "checked" => Overflow::Checked, + "." "wrapping" => Overflow::Wrapping, + "." 
"overflowing" => Overflow::Overflowing, +} + +BinaryOpCode: BinaryOpCode = { + "add" => { + BinaryOpCode::Add(overflow) + }, + "sub" => { + BinaryOpCode::Sub(overflow) + }, + "mul" => { + BinaryOpCode::Mul(overflow) + }, + "div" => { + BinaryOpCode::Div(overflow) + }, + "min" => { + BinaryOpCode::Min(overflow) + }, + "max" => { + BinaryOpCode::Max(overflow) + }, + "mod" => { + BinaryOpCode::Mod(overflow) + }, + "divmod" => { + BinaryOpCode::DivMod(overflow) + }, + "exp" => { + BinaryOpCode::Exp(overflow) + }, + "and" => { + BinaryOpCode::And + }, + "band" => { + BinaryOpCode::BAnd(overflow) + }, + "or" => { + BinaryOpCode::Or + }, + "bor" => { + BinaryOpCode::BOr(overflow) + }, + "xor" => { + BinaryOpCode::Xor + }, + "bxor" => { + BinaryOpCode::BXor(overflow) + }, + "shl" => { + BinaryOpCode::Shl(overflow) + }, + "shr" => { + BinaryOpCode::Shr(overflow) + }, + "rotl" => { + BinaryOpCode::Rotl(overflow) + }, + "rotr" => { + BinaryOpCode::Rotr(overflow) + }, + "eq" => { + BinaryOpCode::Eq + }, + "neq" => { + BinaryOpCode::Neq + }, + "gt" => { + BinaryOpCode::Gt + }, + "gte" => { + BinaryOpCode::Gte + }, + "lt" => { + BinaryOpCode::Lt + }, + "lte" => { + BinaryOpCode::Lte + }, + "store" => { + BinaryOpCode::Store + }, +} + +BinaryImmOpCode: BinaryImmOpCode = { + "add_imm" => { + BinaryImmOpCode::AddImm(overflow) + }, + "sub_imm" => { + BinaryImmOpCode::SubImm(overflow) + }, + "mul_imm" => { + BinaryImmOpCode::MulImm(overflow) + }, + "div_imm" => { + BinaryImmOpCode::DivImm(overflow) + }, + "min_imm" => { + BinaryImmOpCode::MinImm(overflow) + }, + "max_imm" => { + BinaryImmOpCode::MaxImm(overflow) + }, + "mod_imm" => { + BinaryImmOpCode::ModImm(overflow) + }, + "divmod_imm" => { + BinaryImmOpCode::DivModImm(overflow) + }, + "exp_imm" => { + BinaryImmOpCode::ExpImm(overflow) + }, + "and_imm" => { + BinaryImmOpCode::AndImm + }, + "band_imm" => { + BinaryImmOpCode::BAndImm(overflow) + }, + "or_imm" => { + BinaryImmOpCode::OrImm + }, + "bor_imm" => { + 
BinaryImmOpCode::BOrImm(overflow) + }, + "xor_imm" => { + BinaryImmOpCode::XorImm + }, + "bxor_imm" => { + BinaryImmOpCode::BXorImm(overflow) + }, + "shl_imm" => { + BinaryImmOpCode::ShlImm(overflow) + }, + "shr_imm" => { + BinaryImmOpCode::ShrImm(overflow) + }, + "rotl_imm" => { + BinaryImmOpCode::RotlImm(overflow) + }, + "rotr_imm" => { + BinaryImmOpCode::RotrImm(overflow) + }, +} + +UnaryOpCode: UnaryOpCode = { + "inv" => { + UnaryOpCode::Inv + }, + "incr" => { + UnaryOpCode::Incr + }, + "pow2" => { + UnaryOpCode::Pow2 + }, + "not" => { + UnaryOpCode::Not + }, + "bnot" => { + UnaryOpCode::BNot + }, + "popcnt" => { + UnaryOpCode::PopCnt + }, + "is_odd" => { + UnaryOpCode::IsOdd + }, + "cast" => { + UnaryOpCode::Cast + }, + "ptrtoint" => { + UnaryOpCode::PtrToInt + }, + "inttoprt" => { + UnaryOpCode::IntToPtr + }, + "truncw" => { + UnaryOpCode::TruncW + }, + "zext" => { + UnaryOpCode::Zext + }, + "sext" => { + UnaryOpCode::Sext + }, + "neg" => { + UnaryOpCode::Neg + }, +} + +UnaryImmOpCode: UnaryImmOpCode = { + "const" "." "i1" => UnaryImmOpCode::I1, + "const" "." "i8" => UnaryImmOpCode::I8, + "const" "." "i16" => UnaryImmOpCode::I16, + "const" "." "i32" => UnaryImmOpCode::I32, + "const" "." "i64" => UnaryImmOpCode::I64, + "const" "." "isize" => UnaryImmOpCode::ISize, + "const" "." "felt" => UnaryImmOpCode::Felt, + "const" "." 
"f64" => UnaryImmOpCode::F64, +} + +Offset: Offset = { + "+" => { + Offset::Pos(val) + }, + "-" => { + Offset::Neg(val) + } +} + +GlobalValueOperationNested: GlobalValueOpNested = { + "@" => { + if let Some(o) = offset { + GlobalValueOpNested::Symbol(id, o) + } + else { + GlobalValueOpNested::Symbol(id, Offset::Pos(0)) + } + }, + "*" => { + GlobalValueOpNested::Load(Box::new(nested), Offset::Pos(0)) + }, + "*" "(" ")" => { + GlobalValueOpNested::Load(Box::new(nested), offset) + }, + "*" "(" ")" "as" => { + if let Some(o) = offset { + GlobalValueOpNested::Cast(Box::new(nested), o, ty) + } + else { + GlobalValueOpNested::Cast(Box::new(nested), Offset::Pos(0), ty) + } + }, +} + +GlobalValueOperation: GlobalValueOp = { + "global" "." "symbol" "@" => { + if let Some(o) = offset { + GlobalValueOp::Symbol(id, o) + } + else { + GlobalValueOp::Symbol(id, Offset::Pos(0)) + } + }, + "global" "." "load" => { + GlobalValueOp::Load(nested, Offset::Pos(0)) + }, + "global" "." "load" "(" ")" => { + GlobalValueOp::Load(nested, offset) + }, + "global" "." "load" "(" ")" "as" => { + if let Some(o) = offset { + GlobalValueOp::Cast(nested, o, ty) + } + else { + GlobalValueOp::Cast(nested, Offset::Pos(0), ty) + } + }, + "global" "." "iadd" "." "." 
=> { + GlobalValueOp::IAddImm(number, ty, nested) + }, +} + +CallOp: CallOp = { + "call" => CallOp::Call, + "syscall" => CallOp::SysCall, +} + +SwitchBranch: SwitchBranch = { + "=>" => { + SwitchBranch::Test(value, label) + }, + => { + SwitchBranch::Default(label) + } +} + +Destination: Destination = { + => { + let a = if let Some(args) = arguments { args } else { Vec::new() }; + Destination::new(label, a) + }, +} + +Operation: Operation = { + => { + Operation::BinaryOp(op, val1, val2) + }, + => { + Operation::BinaryImmOp(op, val, imm) + }, + => { + Operation::UnaryOp(op, val) + }, + => { + Operation::UnaryImmOp(op, imm) + }, + "ret" "(" > ")" => { + Operation::ReturnOp(vals) + }, + "ret" => { + Operation::ReturnOp(Vec::new()) + }, + "(" > ")" => { + Operation::CallOp(op, f, args) + }, + "(" ")" => { + Operation::CallOp(op, f, Vec::new()) + }, + "cond" "," "," => { + Operation::CondOp(val, dest1, dest2) + }, + "branch" => { + Operation::BranchOp(dest) + }, + "switch" "," > => { + Operation::SwitchOp(val, branches) + }, + "test" "." => { + Operation::TestOp(ty, val) + }, + "select" "," "," => { + let args = vec![cond, a, b]; + Operation::PrimOp(PrimOpCode::Select, args) + }, + "assert" => { + let args = vec![val]; + Operation::PrimOp(PrimOpCode::Assert, args) + }, + "assertz" => { + let args = vec![val]; + Operation::PrimOp(PrimOpCode::Assertz, args) + }, + "assert" "." "eq" "," => { + let args = vec![lhs, rhs]; + Operation::PrimOp(PrimOpCode::AssertEq, args) + }, + "alloca" => { + Operation::PrimOp(PrimOpCode::Alloca, Vec::new()) + }, + "unreachable" => { + Operation::PrimOp(PrimOpCode::Unreachable, Vec::new()) + }, + "load" => { + Operation::LoadOp(val) + }, + "memcpy" "." "," "," => { + Operation::MemCpyOp(ty, val1, val2, val3) + }, +// TODO: Inline assembly +// "inlineasm" "\"" "\"" "," > => ..., +// "inlineasm" "\"" "\"" "," => ..., + => { + Operation::GlobalValueOp(op) + } +// TODO: MemGrow +// "memory" "." 
"grow" => ..., +} + +Instruction: Instruction = { + > "=" ":" > => { + Instruction::new(span!(l, r), values, op, types) + }, + => { + Instruction::new(span!(l, r), Vec::new(), op, Vec::new()) + }, +} + + +// VALUES AND IDENTIFIERS +// ================================================================================================ + +HexString: Vec = { + hex, +} + +Number: u128 = { + int, +} + +Immediate: Immediate = { + => Immediate::Pos(val), + "-" => Immediate::Neg(val), +} + +Identifier: Identifier = { + => Identifier::new(span!(l, r), name) +} + +Value: Value = { + => Value::new(id) +} + +FunctionIdentifier: FunctionIdentifier = { + > => FunctionIdentifier::new(span!(l, r), names) +} + + + +// LEXER +// ================================================================================================ + +extern { + type Error = ParseError; + type Location = miden_diagnostics::SourceIndex; + + enum Token { + identifier => Token::Ident(), + int => Token::Num(), + hex => Token::Hex(>), + "kernel" => Token::Kernel, + "module" => Token::Module, + "internal" => Token::Internal, + "odr" => Token::Odr, + "external" => Token::External, + "pub" => Token::Pub, + "fn" => Token::Fn, + "cc" => Token::Cc, + "fast" => Token::Fast, + "sret" => Token::Sret, + "zext" => Token::Zext, + "sext" => Token::Sext, + "ret" => Token::Ret, + "call" => Token::Call, + "syscall" => Token::SysCall, + "cond" => Token::Cond, + "branch" => Token::Branch, + "switch" => Token::Switch, + "test" => Token::Test, + "load" => Token::Load, + "memcpy" => Token::MemCpy, + "inlineasm" => Token::InlineAsm, + "memory" => Token::Memory, + "grow" => Token::Grow, + "add" => Token::Add, + "sub" => Token::Sub, + "mul" => Token::Mul, + "div" => Token::Div, + "min" => Token::Min, + "max" => Token::Max, + "mod" => Token::Mod, + "divmod" => Token::DivMod, + "exp" => Token::Exp, + "and" => Token::And, + "band" => Token::BAnd, + "or" => Token::Or, + "bor" => Token::BOr, + "xor" => Token::Xor, + "bxor" => Token::BXor, + 
"shl" => Token::Shl, + "shr" => Token::Shr, + "rotl" => Token::Rotl, + "rotr" => Token::Rotr, + "eq" => Token::Eq, + "neq" => Token::Neq, + "gt" => Token::Gt, + "gte" => Token::Gte, + "lt" => Token::Lt, + "lte" => Token::Lte, + "store" => Token::Store, + "add_imm" => Token::AddImm, + "sub_imm" => Token::SubImm, + "mul_imm" => Token::MulImm, + "div_imm" => Token::DivImm, + "min_imm" => Token::MinImm, + "max_imm" => Token::MaxImm, + "mod_imm" => Token::ModImm, + "divmod_imm" => Token::DivModImm, + "exp_imm" => Token::ExpImm, + "and_imm" => Token::AndImm, + "band_imm" => Token::BAndImm, + "or_imm" => Token::OrImm, + "bor_imm" => Token::BOrImm, + "xor_imm" => Token::XorImm, + "bxor_imm" => Token::BXorImm, + "shl_imm" => Token::ShlImm, + "shr_imm" => Token::ShrImm, + "rotl_imm" => Token::RotlImm, + "rotr_imm" => Token::RotrImm, + "inv" => Token::Inv, + "incr" => Token::Incr, + "pow2" => Token::Pow2, + "not" => Token::Not, + "bnot" => Token::BNot, + "popcnt" => Token::PopCnt, + "is_odd" => Token::IsOdd, + "cast" => Token::Cast, + "ptrtoint" => Token::PtrToInt, + "inttoprt" => Token::IntToPtr, + "truncw" => Token::TruncW, + "neg" => Token::Neg, + "const" => Token::Const, + "select" => Token::Select, + "assert" => Token::Assert, + "assertz" => Token::Assertz, + "alloca" => Token::Alloca, + "unreachable" => Token::Unreachable, + "unchecked" => Token::Unchecked, + "checked" => Token::Checked, + "wrapping" => Token::Wrapping, + "overflowing" => Token::Overflowing, + "i1" => Token::I1, + "i8" => Token::I8, + "u8" => Token::U8, + "i16" => Token::I16, + "u16" => Token::U16, + "i32" => Token::I32, + "u32" => Token::U32, + "i64" => Token::I64, + "u64" => Token::U64, + "i128" => Token::I128, + "u128" => Token::U128, + "u256" => Token::U256, + "isize" => Token::ISize, + "usize" => Token::USize, + "f64" => Token::F64, + "felt" => Token::Felt, + "mut" => Token::Mut, + "as" => Token::As, + "global" => Token::Global, + "symbol" => Token::Symbol, + "iadd" => Token::IAdd, + "\"" => 
Token::DoubleQuote, + "=" => Token::Equal, + "=>" => Token::RDoubleArrow, + "+" => Token::Plus, + "-" => Token::Minus, + "->" => Token::RArrow, + "*" => Token::Star, + "&" => Token::Ampersand, + "!" => Token::Bang, + ":" => Token::Colon, + "::" => Token::ColonColon, + ";" => Token::Semicolon, + "," => Token::Comma, + "[" => Token::LBracket, + "]" => Token::RBracket, + "(" => Token::LParen, + ")" => Token::RParen, + "{" => Token::LBrace, + "}" => Token::RBrace, + "." => Token::Dot, + "@" => Token::At, + } +} diff --git a/hir-parser/src/parser/mod.rs b/hir-parser/src/parser/mod.rs new file mode 100644 index 000000000..9b53f7aa7 --- /dev/null +++ b/hir-parser/src/parser/mod.rs @@ -0,0 +1,260 @@ +// Simple macro used in the grammar definition for constructing spans +macro_rules! span { + ($l:expr, $r:expr) => { + miden_diagnostics::SourceSpan::new($l, $r) + }; + ($i:expr) => { + miden_diagnostics::SourceSpan::new($i, $i) + }; +} + +lalrpop_mod!( + #[allow(clippy::all)] + grammar, + "/parser/grammar.rs" +); + +use std::sync::Arc; + +use miden_diagnostics::{ + CodeMap, Diagnostic, DiagnosticsHandler, Label, SourceIndex, SourceSpan, ToDiagnostic, +}; +use miden_parsing::{Scanner, Source}; + +use crate::{ + ast, + lexer::{Lexed, Lexer, LexicalError, Token}, +}; + +pub type Parser = miden_parsing::Parser<()>; + +#[derive(Debug, thiserror::Error)] +pub enum ParseError { + #[error(transparent)] + Lexer(#[from] LexicalError), + #[error("error reading {path:?}: {source}")] + FileError { + source: std::io::Error, + path: std::path::PathBuf, + }, + #[error("invalid token")] + InvalidToken(SourceIndex), + #[error("unexpected end of file")] + UnexpectedEof { + at: SourceIndex, + expected: Vec, + }, + #[error("unrecognized token '{token}'")] + UnrecognizedToken { + span: SourceSpan, + token: Token, + expected: Vec, + }, + #[error("extraneous token '{token}'")] + ExtraToken { span: SourceSpan, token: Token }, + #[error("parsing failed, see diagnostics for details")] + Failed, +} 
+impl Eq for ParseError {} +impl PartialEq for ParseError { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::Lexer(l), Self::Lexer(r)) => l == r, + (Self::FileError { .. }, Self::FileError { .. }) => true, + (Self::InvalidToken(_), Self::InvalidToken(_)) => true, + ( + Self::UnexpectedEof { + expected: ref l, .. + }, + Self::UnexpectedEof { + expected: ref r, .. + }, + ) => l == r, + ( + Self::UnrecognizedToken { + token: lt, + expected: ref l, + .. + }, + Self::UnrecognizedToken { + token: rt, + expected: ref r, + .. + }, + ) => lt == rt && l == r, + (Self::ExtraToken { token: l, .. }, Self::ExtraToken { token: r, .. }) => l == r, + (Self::Failed, Self::Failed) => true, + _ => false, + } + } +} +impl From> for ParseError { + fn from(err: lalrpop_util::ParseError) -> Self { + use lalrpop_util::ParseError as LError; + + match err { + LError::InvalidToken { location } => Self::InvalidToken(location), + LError::UnrecognizedEof { + location: at, + expected, + } => Self::UnexpectedEof { at, expected }, + LError::UnrecognizedToken { + token: (l, token, r), + expected, + } => Self::UnrecognizedToken { + span: SourceSpan::new(l, r), + token, + expected, + }, + LError::ExtraToken { + token: (l, token, r), + } => Self::ExtraToken { + span: SourceSpan::new(l, r), + token, + }, + LError::User { error } => error, + } + } +} +impl ToDiagnostic for ParseError { + fn to_diagnostic(self) -> Diagnostic { + match self { + Self::Lexer(err) => err.to_diagnostic(), + Self::InvalidToken(start) => Diagnostic::error() + .with_message("invalid token") + .with_labels(vec![Label::primary( + start.source_id(), + SourceSpan::new(start, start), + )]), + Self::UnexpectedEof { at, ref expected } => { + let mut message = "expected one of: ".to_string(); + for (i, t) in expected.iter().enumerate() { + if i == 0 { + message.push_str(&format!("'{}'", t)); + } else { + message.push_str(&format!(", '{}'", t)); + } + } + + Diagnostic::error() + .with_message("unexpected eof") + 
.with_labels(vec![Label::primary( + at.source_id(), + SourceSpan::new(at, at), + ) + .with_message(message)]) + } + Self::UnrecognizedToken { + span, ref expected, .. + } => { + let mut message = "expected one of: ".to_string(); + for (i, t) in expected.iter().enumerate() { + if i == 0 { + message.push_str(&format!("'{}'", t)); + } else { + message.push_str(&format!(", '{}'", t)); + } + } + + Diagnostic::error() + .with_message("unexpected token") + .with_labels(vec![ + Label::primary(span.source_id(), span).with_message(message) + ]) + } + Self::ExtraToken { span, .. } => Diagnostic::error() + .with_message("extraneous token") + .with_labels(vec![Label::primary(span.source_id(), span)]), + err => Diagnostic::error().with_message(err.to_string()), + } + } +} + +impl miden_parsing::Parse for ast::Program { + type Parser = grammar::ProgramParser; + type Error = ParseError; + type Config = (); + type Token = Lexed; + + fn root_file_error(source: std::io::Error, path: std::path::PathBuf) -> Self::Error { + ParseError::FileError { source, path } + } + + fn parse( + parser: &Parser, + diagnostics: &DiagnosticsHandler, + source: S, + ) -> Result + where + S: Source, + { + let scanner = Scanner::new(source); + let lexer = Lexer::new(scanner); + Self::parse_tokens(diagnostics, parser.codemap.clone(), lexer) + } + + fn parse_tokens>( + diagnostics: &DiagnosticsHandler, + codemap: Arc, + tokens: S, + ) -> Result { + let mut next_var = 0; + let result = Self::Parser::new().parse(diagnostics, &codemap, &mut next_var, tokens); + match result { + Ok(ast) => { + if diagnostics.has_errors() { + return Err(ParseError::Failed); + } + Ok(ast) + } + Err(lalrpop_util::ParseError::User { error }) => Err(error), + Err(err) => Err(err.into()), + } + } +} + +impl miden_parsing::Parse for ast::Module { + type Parser = grammar::ModuleParser; + type Error = ParseError; + type Config = (); + type Token = Lexed; + + fn root_file_error(source: std::io::Error, path: std::path::PathBuf) -> 
Self::Error { + ParseError::FileError { source, path } + } + + fn parse( + parser: &Parser, + diagnostics: &DiagnosticsHandler, + source: S, + ) -> Result + where + S: Source, + { + let scanner = Scanner::new(source); + let lexer = Lexer::new(scanner); + Self::parse_tokens(diagnostics, parser.codemap.clone(), lexer) + } + + fn parse_tokens>( + diagnostics: &DiagnosticsHandler, + codemap: Arc, + tokens: S, + ) -> Result { + let mut next_var = 0; + let result = Self::Parser::new().parse(diagnostics, &codemap, &mut next_var, tokens); + match result { + Ok(ast) => { + if diagnostics.has_errors() { + return Err(ParseError::Failed); + } + Ok(ast) + } + Err(lalrpop_util::ParseError::User { error }) => Err(error), + Err(err) => Err(err.into()), + } + } +} + +#[cfg(test)] +mod tests; diff --git a/hir-parser/src/symbols.rs b/hir-parser/src/symbols.rs new file mode 100644 index 000000000..edccfe518 --- /dev/null +++ b/hir-parser/src/symbols.rs @@ -0,0 +1,163 @@ +use core::fmt; +use core::mem; +use core::ops::Deref; +use core::str; + +use std::collections::BTreeMap; +use std::sync::RwLock; + +lazy_static::lazy_static! { + static ref SYMBOL_TABLE: SymbolTable = SymbolTable::new(); +} + +struct SymbolTable { + interner: RwLock, +} +impl SymbolTable { + pub fn new() -> Self { + Self { + interner: RwLock::new(Interner::new()), + } + } +} +unsafe impl Sync for SymbolTable {} + +/// A symbol is an interned string. +#[derive(Clone, Copy, PartialEq, Eq, Hash)] +pub struct Symbol(SymbolIndex); + +impl Symbol { + #[inline] + pub const fn new(n: u32) -> Self { + Self(SymbolIndex::new(n)) + } + + /// Maps a string to its interned representation. 
+ pub fn intern>(string: S) -> Self { + let string = string.into(); + with_interner(|interner| interner.intern(string)) + } + + pub fn as_str(self) -> &'static str { + with_read_only_interner(|interner| unsafe { + // This is safe because the interned string will live for the + // lifetime of the program + mem::transmute::<&str, &'static str>(interner.get(self)) + }) + } + + #[inline] + pub fn as_u32(self) -> u32 { + self.0.as_u32() + } + + #[inline] + pub fn as_usize(self) -> usize { + self.0.as_usize() + } +} +impl fmt::Debug for Symbol { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}({:?})", self, self.0) + } +} +impl fmt::Display for Symbol { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(&self.as_str(), f) + } +} +impl PartialOrd for Symbol { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} +impl Ord for Symbol { + fn cmp(&self, other: &Self) -> core::cmp::Ordering { + self.as_str().cmp(other.as_str()) + } +} +impl> PartialEq for Symbol { + fn eq(&self, other: &T) -> bool { + self.as_str() == other.deref() + } +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +struct SymbolIndex(u32); +impl SymbolIndex { + // shave off 256 indices at the end to allow space for packing these indices into enums + pub const MAX_AS_U32: u32 = 0xFFFF_FF00; + + #[inline] + const fn new(n: u32) -> Self { + assert!(n <= Self::MAX_AS_U32, "out of range value used"); + + SymbolIndex(n) + } + + #[inline] + pub fn as_u32(self) -> u32 { + self.0 + } + + #[inline] + pub fn as_usize(self) -> usize { + self.0 as usize + } +} +impl From for u32 { + #[inline] + fn from(v: SymbolIndex) -> u32 { + v.as_u32() + } +} +impl From for usize { + #[inline] + fn from(v: SymbolIndex) -> usize { + v.as_usize() + } +} + +#[derive(Default)] +struct Interner { + pub names: BTreeMap<&'static str, Symbol>, + pub strings: Vec<&'static str>, +} + +impl Interner { + pub fn new() -> Self { + let this = 
Interner::default(); + this + } + + pub fn intern(&mut self, string: String) -> Symbol { + if let Some(&name) = self.names.get(string.as_str()) { + return name; + } + + let name = Symbol::new(self.strings.len() as u32); + + let string = string.into_boxed_str(); + let string: &'static str = Box::leak(string); + self.strings.push(string); + self.names.insert(string, name); + name + } + + pub fn get(&self, symbol: Symbol) -> &str { + self.strings[symbol.0.as_usize()] + } +} + +// If an interner exists, return it. Otherwise, prepare a fresh one. +#[inline] +fn with_interner T>(f: F) -> T { + let mut r = SYMBOL_TABLE.interner.write().unwrap(); + f(&mut r) +} + +#[inline] +fn with_read_only_interner T>(f: F) -> T { + let r = SYMBOL_TABLE.interner.read().unwrap(); + f(&r) +} diff --git a/hir/src/write.rs b/hir/src/write.rs index a062f1880..4cf9d0894 100644 --- a/hir/src/write.rs +++ b/hir/src/write.rs @@ -11,9 +11,11 @@ pub fn write_function(w: &mut dyn Write, func: &Function) -> fmt::Result { } write_block_header(w, func, block, 4)?; + writeln!(w, "{{")?; for inst in block_data.insts() { write_instruction(w, func, inst, 4)?; } + writeln!(w, "}}")?; } writeln!(w, "}}") } @@ -160,7 +162,14 @@ fn write_operands(w: &mut dyn Write, dfg: &DataFlowGraph, inst: Inst) -> fmt::Re Instruction::BinaryOpImm(BinaryOpImm { arg, imm, .. }) => write!(w, " {}, {}", arg, imm), Instruction::UnaryOp(UnaryOp { arg, .. }) => write!(w, " {}", arg), Instruction::UnaryOpImm(UnaryOpImm { imm, .. }) => write!(w, " {}", imm), - Instruction::Ret(Ret { args, .. }) => write!(w, " {}", DisplayValues(args.as_slice(pool))), + Instruction::Ret(Ret { args, .. }) => { + if args.len(pool) > 0 { + write!(w, " ({})", DisplayValues(args.as_slice(pool))) + } + else { + Ok(()) + } + }, Instruction::RetImm(RetImm { arg, .. }) => write!(w, " {arg}"), Instruction::Call(Call { callee, args, .. 
}) => { write!(w, " {}({})", callee, DisplayValues(args.as_slice(pool))) @@ -178,21 +187,13 @@ fn write_operands(w: &mut dyn Write, dfg: &DataFlowGraph, inst: Inst) -> fmt::Re write_block_args(w, else_dest.1.as_slice(pool)) } Instruction::Br(Br { - op, destination, args, .. - }) if *op == Opcode::Br => { + }) => { write!(w, " {}", destination)?; write_block_args(w, args.as_slice(pool)) } - Instruction::Br(Br { - destination, args, .. - }) => { - let args = args.as_slice(pool); - write!(w, " {}, {}", args[0], destination)?; - write_block_args(w, &args[1..]) - } Instruction::Switch(Switch { arg, arms, default, .. }) => {