Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix!: tokenize keywords and identifiers correctly #428

Merged
merged 8 commits into from
Jan 10, 2025
6 changes: 3 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions quil-rs/proptest-regressions/expression/mod.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@
# everyone who runs the test benefits from these saved cases.
cc 4c32128d724ed0f840715fae4e194c99262dc153c64be39d2acf45b8903b20f7 # shrinks to value = Complex { re: 0.0, im: -0.13530277317700273 }
cc 5cc95f2159ad7120bbaf296d3a9fb26fef30f57b61e76b3e0dc99f4759009fdb # shrinks to e = Number(Complex { re: 0.0, im: -2.772221265116396 })
cc de70a1853ccef983fac85a87761ba08bfb2d54b2d4e880d5d90e7b4a75ecafb5 # shrinks to e = Address(MemoryReference { name: "mut", index: 0 })
cc 9ad50859b68cb403ce1a67af0feef1f55d25587466878e364ba2810be5910b14 # shrinks to e = Address(MemoryReference { name: "iNf", index: 0 })
152 changes: 46 additions & 106 deletions quil-rs/src/parser/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ mod error;
mod quoted_strings;
mod wrapped_parsers;

use std::str::FromStr;

use nom::{
bytes::complete::{is_a, tag_no_case, take_till, take_while, take_while1},
bytes::complete::{is_a, take_till, take_while, take_while1},
character::complete::{digit1, one_of},
combinator::{all_consuming, map, recognize, value},
multi::many0,
Expand All @@ -28,7 +30,7 @@ use nom::{
use nom_locate::LocatedSpan;
use wrapped_parsers::{alt, tag};

pub use super::token::{Token, TokenWithLocation};
pub use super::token::{KeywordToken, Token, TokenWithLocation};
use crate::parser::lexer::wrapped_parsers::expecting;
use crate::parser::token::token_with_location;
pub(crate) use error::InternalLexError;
Expand Down Expand Up @@ -92,7 +94,7 @@ pub enum Command {
Xor,
}

#[derive(Debug, Clone, PartialEq, Eq, strum::Display)]
#[derive(Debug, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
#[strum(serialize_all = "UPPERCASE")]
pub enum DataType {
Bit,
Expand All @@ -101,7 +103,7 @@ pub enum DataType {
Integer,
}

#[derive(Debug, Clone, PartialEq, Eq, strum::Display)]
#[derive(Debug, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
#[strum(serialize_all = "UPPERCASE")]
pub enum Modifier {
Controlled,
Expand Down Expand Up @@ -163,30 +165,17 @@ fn lex_token(input: LexInput) -> InternalLexResult<TokenWithLocation> {
"a token",
(
token_with_location(lex_comment),
token_with_location(lex_data_type),
token_with_location(lex_modifier),
token_with_location(lex_punctuation),
token_with_location(lex_target),
token_with_location(lex_string),
// Operator must come before number (or it may be parsed as a prefix)
token_with_location(lex_operator),
token_with_location(lex_number),
token_with_location(lex_variable),
token_with_location(lex_non_blocking),
// This should come last because it's sort of a catch all
token_with_location(lex_command_or_identifier),
),
)(input)
}

fn lex_data_type(input: LexInput) -> InternalLexResult {
alt(
"a data type",
(
value(Token::DataType(DataType::Bit), tag("BIT")),
value(Token::DataType(DataType::Integer), tag("INTEGER")),
value(Token::DataType(DataType::Octet), tag("OCTET")),
value(Token::DataType(DataType::Real), tag("REAL")),
// Identifiers must come before numbers so that `NaN`, `Inf`, and `Infinity` aren't
// parsed as floats; Nom, as of version 7.1.1, will parse those strings,
// case-insensitively, as floats
token_with_location(lex_keyword_or_identifier),
token_with_location(lex_number),
),
)(input)
}
Expand All @@ -197,62 +186,16 @@ fn lex_comment(input: LexInput) -> InternalLexResult {
Ok((input, Token::Comment(content.to_string())))
}

/// If the given identifier string matches a command keyword, return the keyword;
/// otherwise, return the original identifier as a token.
fn recognize_command_or_identifier(identifier: String) -> Token {
use Command::*;

match identifier.as_str() {
"DEFGATE" => Token::Command(DefGate),
"ADD" => Token::Command(Add),
"AND" => Token::Command(And),
"CALL" => Token::Command(Call),
"CONVERT" => Token::Command(Convert),
"DIV" => Token::Command(Div),
"EQ" => Token::Command(Eq),
"EXCHANGE" => Token::Command(Exchange),
"GE" => Token::Command(GE),
"GT" => Token::Command(GT),
"IOR" => Token::Command(Ior),
"LE" => Token::Command(LE),
"LOAD" => Token::Command(Load),
"LT" => Token::Command(LT),
"MOVE" => Token::Command(Move),
"MUL" => Token::Command(Mul),
"NEG" => Token::Command(Neg),
"NOT" => Token::Command(Not),
"STORE" => Token::Command(Store),
"SUB" => Token::Command(Sub),
"XOR" => Token::Command(Xor),
"DEFCIRCUIT" => Token::Command(DefCircuit),
"MEASURE" => Token::Command(Measure),
"HALT" => Token::Command(Halt),
"WAIT" => Token::Command(Wait),
"JUMP-WHEN" => Token::Command(JumpWhen),
"JUMP-UNLESS" => Token::Command(JumpUnless),
"JUMP" => Token::Command(Jump),
"RESET" => Token::Command(Reset),
"NOP" => Token::Command(Nop),
"INCLUDE" => Token::Command(Include),
"PRAGMA" => Token::Command(Pragma),
"DECLARE" => Token::Command(Declare),
"CAPTURE" => Token::Command(Capture),
"DEFCAL" => Token::Command(DefCal),
"DEFFRAME" => Token::Command(DefFrame),
"DEFWAVEFORM" => Token::Command(DefWaveform),
"DELAY" => Token::Command(Delay),
"FENCE" => Token::Command(Fence),
"PULSE" => Token::Command(Pulse),
"RAW-CAPTURE" => Token::Command(RawCapture),
"SET-FREQUENCY" => Token::Command(SetFrequency),
"SET-PHASE" => Token::Command(SetPhase),
"SET-SCALE" => Token::Command(SetScale),
"SWAP-PHASES" => Token::Command(SwapPhases),
"SHIFT-FREQUENCY" => Token::Command(ShiftFrequency),
"SHIFT-PHASE" => Token::Command(ShiftPhase),
"LABEL" => Token::Command(Label),
_ => Token::Identifier(identifier),
/// Reclassify a raw `identifier`: if it spells a keyword, command, data type, or gate
/// modifier, return the corresponding token; otherwise wrap it unchanged as
/// [`Token::Identifier`].
fn keyword_or_identifier(identifier: String) -> Token {
    // Try to parse `identifier` as a `T`, wrapping a successful parse into a `Token` via `wrap`.
    fn try_as<T: FromStr>(identifier: &str, wrap: impl Fn(T) -> Token) -> Option<Token> {
        identifier.parse::<T>().ok().map(wrap)
    }

    // Keyword tokens take precedence, then commands, data types, and modifiers, in that order.
    try_as(&identifier, KeywordToken::into)
        .or_else(|| try_as(&identifier, Token::Command))
        .or_else(|| try_as(&identifier, Token::DataType))
        .or_else(|| try_as(&identifier, Token::Modifier))
        .unwrap_or(Token::Identifier(identifier))
}

fn is_valid_identifier_leading_character(chr: char) -> bool {
Expand Down Expand Up @@ -286,9 +229,9 @@ fn lex_identifier_raw(input: LexInput) -> InternalLexResult<String> {
)(input)
}

fn lex_command_or_identifier(input: LexInput) -> InternalLexResult {
fn lex_keyword_or_identifier(input: LexInput) -> InternalLexResult {
let (input, identifier) = lex_identifier_raw(input)?;
let token = recognize_command_or_identifier(identifier);
let token = keyword_or_identifier(identifier);
Ok((input, token))
}

Expand All @@ -298,10 +241,6 @@ fn lex_target(input: LexInput) -> InternalLexResult {
Ok((input, Token::Target(label)))
}

fn lex_non_blocking(input: LexInput) -> InternalLexResult {
value(Token::NonBlocking, tag("NONBLOCKING"))(input)
}

fn lex_number(input: LexInput) -> InternalLexResult {
let (input, float_string): (LexInput, LexInput) = recognize(double)(input)?;
let integer_parse_result: IResult<LexInput, _> = all_consuming(digit1)(float_string);
Expand All @@ -318,24 +257,6 @@ fn lex_number(input: LexInput) -> InternalLexResult {
))
}

fn lex_modifier(input: LexInput) -> InternalLexResult {
alt(
"a modifier token",
(
value(Token::As, tag("AS")),
value(Token::Matrix, tag("MATRIX")),
value(Token::Modifier(Modifier::Controlled), tag("CONTROLLED")),
value(Token::Modifier(Modifier::Dagger), tag("DAGGER")),
value(Token::Modifier(Modifier::Forked), tag("FORKED")),
value(Token::Mutable, tag_no_case("MUT")),
value(Token::Offset, tag("OFFSET")),
value(Token::PauliSum, tag("PAULI-SUM")),
value(Token::Permutation, tag("PERMUTATION")),
value(Token::Sharing, tag("SHARING")),
),
)(input)
}

fn lex_operator(input: LexInput) -> InternalLexResult {
use Operator::*;
map(
Expand Down Expand Up @@ -399,7 +320,7 @@ mod tests {
use nom_locate::LocatedSpan;
use rstest::*;

use crate::parser::common::tests::KITCHEN_SINK_QUIL;
use crate::parser::{common::tests::KITCHEN_SINK_QUIL, DataType};

use super::{lex, Command, Operator, Token};

Expand Down Expand Up @@ -565,10 +486,29 @@ mod tests {
case("a", vec![Token::Identifier("a".to_string())]),
case("_a-2_b-2_", vec![Token::Identifier("_a-2_b-2_".to_string())]),
case("a-2-%var", vec![
Token::Identifier("a-2".to_string()),
Token::Operator(Operator::Minus),
Token::Variable("var".to_string())
])
Token::Identifier("a-2".to_string()),
Token::Operator(Operator::Minus),
Token::Variable("var".to_string())
]),
case("BIT", vec![Token::DataType(DataType::Bit)]),
case("BITS", vec![Token::Identifier("BITS".to_string())]),
case("NaN", vec![Token::Identifier("NaN".to_string())]),
case("nan", vec![Token::Identifier("nan".to_string())]),
case("NaNa", vec![Token::Identifier("NaNa".to_string())]),
case("nana", vec![Token::Identifier("nana".to_string())]),
antalsz marked this conversation as resolved.
Show resolved Hide resolved
case("INF", vec![Token::Identifier("INF".to_string())]),
case("Infinity", vec![Token::Identifier("Infinity".to_string())]),
case("Inferior", vec![Token::Identifier("Inferior".to_string())]),
case("-NaN", vec![Token::Operator(Operator::Minus), Token::Identifier("NaN".to_string())]),
case("-inf", vec![Token::Operator(Operator::Minus), Token::Identifier("inf".to_string())]),
case("-Infinity", vec![
Token::Operator(Operator::Minus),
Token::Identifier("Infinity".to_string())
]),
case("-inferior", vec![
Token::Operator(Operator::Minus),
Token::Identifier("inferior".to_string())
]),
)]
fn it_lexes_identifier(input: &str, expected: Vec<Token>) {
let input = LocatedSpan::new(input);
Expand Down
4 changes: 2 additions & 2 deletions quil-rs/src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ mod token;

pub(crate) use error::{ErrorInput, InternalParseError};
pub use error::{ParseError, ParserErrorKind};
pub use lexer::LexError;
pub use token::{Token, TokenWithLocation};
pub use lexer::{Command, DataType, LexError, Modifier};
pub use token::{KeywordToken, Token, TokenWithLocation};

pub(crate) type ParserInput<'a> = &'a [TokenWithLocation<'a>];
type InternalParserResult<'a, R, E = InternalParseError<'a>> = IResult<ParserInput<'a>, R, E>;
Expand Down
89 changes: 81 additions & 8 deletions quil-rs/src/parser/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,79 @@ where
}
}

/// The subset of [`Token`]s which (a) do not have arguments and (b) are keywords. Used to ensure
/// that keyword-checking remains in sync with the definition of [`Token`].
///
/// `strum::Display` and `strum::EnumString` give each variant a canonical spelling:
/// SCREAMING-KEBAB-CASE by default, unless overridden by an explicit `serialize` attribute.
#[derive(Debug, Copy, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
#[strum(serialize_all = "SCREAMING-KEBAB-CASE")]
pub enum KeywordToken {
    As,          // spelled "AS"
    Matrix,      // spelled "MATRIX"
    /// Spelled `mut`, overriding the default casing.
    #[strum(serialize = "mut")]
    Mutable,
    /// Spelled `NONBLOCKING` (no hyphen), overriding the default `NON-BLOCKING`.
    #[strum(serialize = "NONBLOCKING")]
    NonBlocking,
    Offset,      // spelled "OFFSET"
    PauliSum,    // spelled "PAULI-SUM"
    Permutation, // spelled "PERMUTATION"
    Sharing,     // spelled "SHARING"
}

impl From<KeywordToken> for Token {
fn from(token: KeywordToken) -> Self {
match token {
KeywordToken::As => Token::As,
KeywordToken::Matrix => Token::Matrix,
KeywordToken::Mutable => Token::Mutable,
KeywordToken::NonBlocking => Token::NonBlocking,
KeywordToken::Offset => Token::Offset,
KeywordToken::PauliSum => Token::PauliSum,
KeywordToken::Permutation => Token::Permutation,
KeywordToken::Sharing => Token::Sharing,
}
}
}

impl TryFrom<Token> for KeywordToken {
    type Error = ();

    /// Recover the [`KeywordToken`] corresponding to `token`, or `Err(())` when `token` is not
    /// an argument-less keyword.
    fn try_from(token: Token) -> Result<Self, Self::Error> {
        // This match is explicit so that if you add a new [`Token`] constructor you have to decide
        // if it's a keyword.
        #[deny(clippy::wildcard_enum_match_arm, clippy::wildcard_in_or_patterns)]
        match token {
            Token::As => Ok(KeywordToken::As),
            Token::Matrix => Ok(KeywordToken::Matrix),
            Token::Mutable => Ok(KeywordToken::Mutable),
            // Fix: `NonBlocking` is a keyword (see `From<KeywordToken> for Token`); it previously
            // fell through to the error arm, breaking the round-trip with `From`.
            Token::NonBlocking => Ok(KeywordToken::NonBlocking),
            Token::Offset => Ok(KeywordToken::Offset),
            Token::PauliSum => Ok(KeywordToken::PauliSum),
            Token::Permutation => Ok(KeywordToken::Permutation),
            Token::Sharing => Ok(KeywordToken::Sharing),

            Token::Colon
            | Token::Comma
            | Token::Command(_)
            | Token::Comment(_)
            | Token::DataType(_)
            | Token::Float(_)
            | Token::Identifier(_)
            | Token::Indentation
            | Token::Integer(_)
            | Token::Target(_)
            | Token::LBracket
            | Token::LParenthesis
            | Token::Modifier(_)
            | Token::NewLine
            | Token::Operator(_)
            | Token::RBracket
            | Token::RParenthesis
            | Token::Semicolon
            | Token::String(_)
            | Token::Variable(_) => Err(()),
        }
    }
}

#[derive(Clone, PartialEq)]
pub enum Token {
As,
Expand Down Expand Up @@ -102,7 +175,7 @@ pub enum Token {
impl fmt::Display for Token {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Token::As => write!(f, "AS"),
Token::As => write!(f, "{}", KeywordToken::As),
Token::Colon => write!(f, ":"),
Token::Comma => write!(f, ","),
Token::Command(cmd) => write!(f, "{cmd}"),
Expand All @@ -115,19 +188,19 @@ impl fmt::Display for Token {
Token::Target(label) => write!(f, "{label}"),
Token::LBracket => write!(f, "["),
Token::LParenthesis => write!(f, "("),
Token::NonBlocking => write!(f, "NONBLOCKING"),
Token::Matrix => write!(f, "MATRIX"),
Token::NonBlocking => write!(f, "{}", KeywordToken::NonBlocking),
Token::Matrix => write!(f, "{}", KeywordToken::Matrix),
Token::Modifier(m) => write!(f, "{m}"),
Token::Mutable => write!(f, "MUT"),
Token::Mutable => write!(f, "{}", KeywordToken::Mutable),
Token::NewLine => write!(f, "NEWLINE"),
Token::Operator(op) => write!(f, "{op}"),
Token::Offset => write!(f, "OFFSET"),
Token::PauliSum => write!(f, "PAULI-SUM"),
Token::Permutation => write!(f, "PERMUTATION"),
Token::Offset => write!(f, "{}", KeywordToken::Offset),
Token::PauliSum => write!(f, "{}", KeywordToken::PauliSum),
Token::Permutation => write!(f, "{}", KeywordToken::Permutation),
Token::RBracket => write!(f, "]"),
Token::RParenthesis => write!(f, ")"),
Token::Semicolon => write!(f, ";"),
Token::Sharing => write!(f, "SHARING"),
Token::Sharing => write!(f, "{}", KeywordToken::Sharing),
Token::String(s) => write!(f, "{}", QuotedString(s)),
Token::Variable(v) => write!(f, "{v}"),
}
Expand Down
Loading
Loading