From 49de1c46614ebb98d6810ec1169d5ab98aa856ff Mon Sep 17 00:00:00 2001 From: David Anderson Date: Wed, 27 Sep 2023 19:53:52 -0700 Subject: [PATCH] Atomize token data. This makes tokens easier to pass between the AST and lexer. --- compiler/lexer.cpp | 35 ++++++++++++++++++++++------------- compiler/lexer.h | 6 ++++-- compiler/parser.cpp | 12 ++++++------ 3 files changed, 32 insertions(+), 21 deletions(-) diff --git a/compiler/lexer.cpp b/compiler/lexer.cpp index 6d42542e2..4c6709509 100644 --- a/compiler/lexer.cpp +++ b/compiler/lexer.cpp @@ -251,7 +251,7 @@ Lexer::SynthesizeIncludePathToken() if (!open_c) open_c = '"'; - tok->data = ke::StringPrintf("%c%s", open_c, name); + tok->atom = cc_.atom(ke::StringPrintf("%c%s", open_c, name)); } /* ftoi @@ -469,7 +469,7 @@ void Lexer::HandleDirectives() { } auto tok = PushSynthesizedToken(tSYN_PRAGMA_UNUSED, col); - tok->data = ke::Join(parts, ","); + tok->atom = cc_.atom(ke::Join(parts, ",")); } else { error(207); /* unknown #pragma */ } @@ -1002,6 +1002,7 @@ void Lexer::HandleMultiLineComment() { } void Lexer::packedstring(full_token_t* tok, char term) { + std::string data; while (true) { char c = peek(); if (c == term || c == 0) @@ -1012,19 +1013,20 @@ void Lexer::packedstring(full_token_t* tok, char term) { } if (IsNewline(c)) break; - packedstring_char(tok); + packedstring_char(&data); } + tok->atom = cc_.atom(data); } -void Lexer::packedstring_char(full_token_t* tok) { +void Lexer::packedstring_char(std::string* data) { bool is_codepoint; cell ch = litchar(kLitcharUtf8, &is_codepoint); if (ch < 0) return; if (is_codepoint) - UnicodeCodepointToUtf8(ch, &tok->data); + UnicodeCodepointToUtf8(ch, data); else - tok->data.push_back(static_cast(ch)); + data->push_back(static_cast(ch)); } /* lex(lexvalue,lexsym) Lexical Analysis @@ -1325,7 +1327,6 @@ Lexer::PushSynthesizedToken(TokenKind kind, int col) auto tok = current_token(); tok->id = kind; tok->value = 0; - tok->data.clear(); tok->atom = nullptr; tok->start.line = state_.tokline; tok->start.col = col; @@ -1681,7 +1682,6 @@ bool Lexer::lex_number(full_token_t* tok) { void Lexer::LexStringLiteral(full_token_t* tok, int flags) { tok->id = tSTRING; - tok->data.clear(); tok->atom = nullptr; tok->value = -1; // Catch consumers expecting automatic litadd(). @@ -1693,7 +1693,11 @@ void Lexer::LexStringLiteral(full_token_t* tok, int flags) { error(37); } else { advance(); - packedstring_char(tok); + + std::string data; + packedstring_char(&data); + tok->atom = cc_.atom(data); + /* invalid char declaration */ if (!match_char('\'')) error(27); /* invalid character constant (must be one character) */ @@ -2369,24 +2373,29 @@ cell Lexer::get_utf8_char() { } void Lexer::LexStringContinuation() { + ke::SaveAndSet stop_recursion(&in_string_continuation_, true); + + if (!peek(tELLIPS)) + return; + auto initial = std::move(*current_token()); assert(initial.id == tSTRING); - ke::SaveAndSet stop_recursion(&in_string_continuation_, true); - + std::string data = initial.data(); while (match(tELLIPS)) { if (match(tCHAR_LITERAL)) { - initial.data.push_back(current_token()->value); + data.push_back(current_token()->value); continue; } if (!need(tSTRING)) { lexpush(); break; } - initial.data += current_token()->data; + data += current_token()->data(); } *current_token() = std::move(initial); + current_token()->atom = cc_.atom(data); } bool Lexer::HasMacro(sp::Atom* atom) { diff --git a/compiler/lexer.h b/compiler/lexer.h index e8ed736f0..81d9cdf11 100644 --- a/compiler/lexer.h +++ b/compiler/lexer.h @@ -41,10 +41,12 @@ struct token_pos_t { struct full_token_t { int id = 0; int value = 0; - std::string data; sp::Atom* atom = nullptr; token_pos_t start; token_pos_t end; + const std::string& data() const { + return atom->str(); + } }; #define MAX_TOKEN_DEPTH 4 @@ -366,7 +368,7 @@ class Lexer void lex_float(full_token_t* tok, cell whole); cell litchar(int flags, bool* is_codepoint = nullptr); void packedstring(full_token_t* tok, char term); - void packedstring_char(full_token_t* tok); + void packedstring_char(std::string* data); bool IsSkipping() const { return skiplevel_ > 0 && (ifstack_[skiplevel_ - 1] & SKIPMODE) == SKIPMODE; diff --git a/compiler/parser.cpp b/compiler/parser.cpp index 092b74cc2..c9575a513 100644 --- a/compiler/parser.cpp +++ b/compiler/parser.cpp @@ -149,7 +149,7 @@ Parser::Parse() case tpTRYINCLUDE: { if (!lexer_->need(tSYN_INCLUDE_PATH)) break; - auto name = lexer_->current_token()->data; + auto name = lexer_->current_token()->data(); auto result = lexer_->PlungeFile(name.c_str() + 1, (name[0] != '<'), TRUE); if (!result && tok != tpTRYINCLUDE) { report(417) << name.substr(1); @@ -601,7 +601,7 @@ Parser::parse_pragma_unused() { auto pos = lexer_->pos(); - auto data = std::move(lexer_->current_token()->data); + const auto& data = lexer_->current_token()->data(); std::vector raw_names = ke::Split(data, ","); std::vector names; for (const auto& raw_name : raw_names) @@ -1061,7 +1061,7 @@ Parser::constant() case tRATIONAL: return new FloatExpr(cc_, pos, lexer_->current_token()->value); case tSTRING: { - const auto& str = lexer_->current_token()->data; + const auto& str = lexer_->current_token()->data(); return new StringExpr(pos, str.c_str(), str.size()); } case tTRUE: @@ -1177,7 +1177,7 @@ Parser::struct_init() Expr* expr = nullptr; switch (lexer_->lex()) { case tSTRING: { - const auto& str = lexer_->current_token()->data; + const auto& str = lexer_->current_token()->data(); expr = new StringExpr(pos, str.c_str(), str.size()); break; } @@ -1218,7 +1218,7 @@ Parser::parse_static_assert() PoolString * text = nullptr; if (lexer_->match(',') && lexer_->need(tSTRING)) { auto tok = lexer_->current_token(); - text = new PoolString(tok->data.c_str(), tok->data.size()); + text = new PoolString(tok->data().c_str(), tok->data().size()); } lexer_->need(')'); @@ -1261,7 +1261,7 @@ Parser::var_init(int vclass) if (lexer_->match(tSTRING)) { auto tok = lexer_->current_token(); - return new StringExpr(tok->start, tok->data.c_str(), tok->data.size()); + return new StringExpr(tok->start, tok->data().c_str(), tok->data().size()); } // We'll check const or symbol-ness for non-sLOCALs in the semantic pass.