Extend Lexer interface to expose diagnostics and data as a lexing report (#1668)

- Add support for reporting diagnostics during the lexing process
- Properly map diagnostic severities
- Mark the method and the report as optional for backwards compatibility

For indentation:
- Add dedent tokens to the report and keep them until consumed, for state management
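
For adopters, a minimal sketch of feeding the new report from a custom token builder. It assumes `DefaultTokenBuilder` and `LexingReport` are re-exported from the `langium` package; the tab check itself is purely illustrative:

```ts
import { DefaultTokenBuilder } from 'langium';
import type { LexingReport } from 'langium';

// Illustrative only: flag every tab character as a 'warning' lexing diagnostic.
class TabReportingTokenBuilder extends DefaultTokenBuilder {
    override popLexingReport(text: string): LexingReport {
        for (let offset = text.indexOf('\t'); offset >= 0; offset = text.indexOf('\t', offset + 1)) {
            this.diagnostics.push({
                severity: 'warning',
                message: 'Tab character encountered; prefer spaces.',
                offset,
                length: 1,
                line: text.substring(0, offset).split(/\r\n|\r|\n/).length,
                column: 0
            });
        }
        // The default implementation drains `this.diagnostics` into the report.
        return super.popLexingReport(text);
    }
}
```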
martin-fleck-at authored Sep 6, 2024
1 parent 707d2f7 commit 51d99a6
Showing 7 changed files with 155 additions and 39 deletions.
68 changes: 50 additions & 18 deletions packages/langium/src/parser/indentation-aware.ts
@@ -4,9 +4,9 @@
  * terms of the MIT License, which is available in the project root.
  ******************************************************************************/
 
-import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition } from 'chevrotain';
+import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition, TokenVocabulary } from 'chevrotain';
 import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
-import type { TokenBuilderOptions } from './token-builder.js';
+import type { LexingReport, TokenBuilderOptions } from './token-builder.js';
 import type { LexerResult } from './lexer.js';
 import type { LangiumCoreServices } from '../services.js';
 import { createToken, createTokenInstance, Lexer } from 'chevrotain';
@@ -69,22 +69,28 @@ export enum LexingMode {
     IGNORE_INDENTATION = 'ignore-indentation',
 }
 
+export interface IndentationLexingReport extends LexingReport {
+    /** Dedent tokens that are necessary to close the remaining indents. */
+    remainingDedents: IToken[];
+}
+
 /**
  * A token builder that is sensitive to indentation in the input text.
  * It will generate tokens for indentation and dedentation based on the indentation level.
  *
  * The first generic parameter corresponds to the names of terminal tokens,
- * while the second one corresonds to the names of keyword tokens.
+ * while the second one corresponds to the names of keyword tokens.
  * Both parameters are optional and can be imported from `./generated/ast.js`.
  *
  * Inspired by https://github.com/chevrotain/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js
  */
 export class IndentationAwareTokenBuilder<Terminals extends string = string, KeywordName extends string = string> extends DefaultTokenBuilder {
     /**
-     * The stack in which all the previous matched indentation levels are stored
-     * to understand how deep a the next tokens are nested.
+     * The stack stores all the previously matched indentation levels to understand how deeply the next tokens are nested.
+     * The stack is valid for lexing
      */
     protected indentationStack: number[] = [0];
 
     readonly options: IndentationTokenBuilderOptions<Terminals, KeywordName>;
 
     /**
@@ -123,7 +129,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
         });
     }
 
-    override buildTokens(grammar: Grammar, options?: TokenBuilderOptions | undefined) {
+    override buildTokens(grammar: Grammar, options?: TokenBuilderOptions | undefined): TokenVocabulary {
         const tokenTypes = super.buildTokens(grammar, options);
         if (!isTokenTypeArray(tokenTypes)) {
             throw new Error('Invalid tokens built by default builder');
@@ -173,6 +179,14 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
         }
     }
 
+    override popLexingReport(text: string): IndentationLexingReport {
+        const result = super.popLexingReport(text);
+        return {
+            ...result,
+            remainingDedents: this.popRemainingDedents(text),
+        };
+    }
+
     /**
      * Helper function to check if the current position is the start of a new line.
      *
@@ -191,7 +205,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
      * @param offset The current position at which to attempt a match
     * @returns The current and previous indentation levels and the matched whitespace
      */
-    protected matchWhitespace(text: string, offset: number) {
+    protected matchWhitespace(text: string, offset: number, _tokens: IToken[], _groups: Record<string, IToken[]>): { currIndentLevel: number, prevIndentLevel: number, match: RegExpExecArray | null } {
         this.whitespaceRegExp.lastIndex = offset;
         const match = this.whitespaceRegExp.exec(text);
         return {
@@ -210,8 +224,8 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
      * @param offset Current position in the input string
      * @returns The indentation token instance
      */
-    protected createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number) {
-        const lineNumber = text.substring(0, offset).split(/\r\n|\r|\n/).length;
+    protected createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number): IToken {
+        const lineNumber = this.getLineNumber(text, offset);
         return createTokenInstance(
             tokenType,
             image,
@@ -221,6 +235,17 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
         );
     }
 
+    /**
+     * Helper function to get the line number at a given offset.
+     *
+     * @param text Full input string, used to calculate the line number
+     * @param offset Current position in the input string
+     * @returns The line number at the given offset
+     */
+    protected getLineNumber(text: string, offset: number): number {
+        return text.substring(0, offset).split(/\r\n|\r|\n/).length;
+    }
+
     /**
      * A custom pattern for matching indents
     *
@@ -229,14 +254,14 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
      * @param tokens Previously scanned Tokens
      * @param groups Token Groups
      */
-    protected indentMatcher(text: string, offset: number, tokens: IToken[], _groups: Record<string, IToken[]>): ReturnType<CustomPatternMatcherFunc> {
+    protected indentMatcher(text: string, offset: number, tokens: IToken[], groups: Record<string, IToken[]>): ReturnType<CustomPatternMatcherFunc> {
        const { indentTokenName } = this.options;
 
        if (!this.isStartOfLine(text, offset)) {
            return null;
        }
 
-        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset);
+        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset, tokens, groups);
 
        if (currIndentLevel <= prevIndentLevel) {
            // shallower indentation (should be matched by dedent)
@@ -266,14 +291,14 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
      * @param tokens Previously scanned Tokens
      * @param groups Token Groups
      */
-    protected dedentMatcher(text: string, offset: number, tokens: IToken[], _groups: Record<string, IToken[]>): ReturnType<CustomPatternMatcherFunc> {
+    protected dedentMatcher(text: string, offset: number, tokens: IToken[], groups: Record<string, IToken[]>): ReturnType<CustomPatternMatcherFunc> {
        const { dedentTokenName } = this.options;
 
        if (!this.isStartOfLine(text, offset)) {
            return null;
        }
 
-        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset);
+        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset, tokens, groups);
 
        if (currIndentLevel >= prevIndentLevel) {
            // bigger indentation (should be matched by indent)
@@ -285,9 +310,14 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
 
         // Any dedent must match some previous indentation level.
         if (matchIndentIndex === -1) {
-            console.error(`Invalid dedent level ${currIndentLevel} at offset: ${offset}. Current indetation stack: ${this.indentationStack}`);
-            // throwing an error would crash the language server
-            // TODO: find a way to report error diagnostics message
+            this.diagnostics.push({
+                severity: 'error',
+                message: `Invalid dedent level ${currIndentLevel} at offset: ${offset}. Current indentation stack: ${this.indentationStack}`,
+                offset,
+                length: match?.[0]?.length ?? 0,
+                line: this.getLineNumber(text, offset),
+                column: 0
+            });
             return null;
         }
 
@@ -375,9 +405,11 @@ export class IndentationAwareLexer extends DefaultLexer {
     override tokenize(text: string): LexerResult {
         const result = super.tokenize(text);
 
-        // reset the indent stack between processing of different text inputs
-        const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text);
+        // consuming all remaining dedents and remove them as they might not be serializable
+        const report = result.report as IndentationLexingReport;
+        const remainingDedents = report.remainingDedents;
         result.tokens.push(...remainingDedents);
+        report.remainingDedents = [];
 
         // remove any "indent-dedent" pair with an empty body as these are typically
         // added by comments or lines with just whitespace but have no real value
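
For orientation, these classes are typically wired up through the language's service module. A sketch following the usual Langium customization pattern (the module name is a placeholder):

```ts
import { IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium';
import type { LangiumCoreServices } from 'langium';

// Sketch: override the parser services so that indentation tokens, and now
// also the IndentationLexingReport, flow through the lexer.
export const MyLanguageModule = {
    parser: {
        TokenBuilder: () => new IndentationAwareTokenBuilder(),
        Lexer: (services: LangiumCoreServices) => new IndentationAwareLexer(services),
    },
};
```

With this change, an invalid dedent surfaces as an error diagnostic in the report instead of a `console.error` that the language server could not forward to clients.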
5 changes: 4 additions & 1 deletion packages/langium/src/parser/langium-parser.ts
@@ -19,11 +19,13 @@ import { isAssignment, isCrossReference, isKeyword } from '../languages/generate
 import { getExplicitRuleType, isDataTypeRule } from '../utils/grammar-utils.js';
 import { assignMandatoryProperties, getContainerOfType, linkContentToContainer } from '../utils/ast-utils.js';
 import { CstNodeBuilder } from './cst-node-builder.js';
+import type { LexingReport } from './token-builder.js';
 
 export type ParseResult<T = AstNode> = {
     value: T,
     parserErrors: IRecognitionException[],
-    lexerErrors: ILexingError[]
+    lexerErrors: ILexingError[],
+    lexerReport?: LexingReport
 }
 
 export const DatatypeSymbol = Symbol('Datatype');
@@ -240,6 +242,7 @@ export class LangiumParser extends AbstractLangiumParser {
         return {
             value: result,
             lexerErrors: lexerResult.errors,
+            lexerReport: lexerResult.report,
             parserErrors: this.wrapper.errors
         };
     }
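
The report now travels with every parse result, so callers can inspect lexing diagnostics without talking to the lexer directly. A sketch, where `services` stands for an already-created set of core services for your language:

```ts
import type { LangiumCoreServices } from 'langium';

// Sketch: surface lexing diagnostics from a plain parse.
function logLexingDiagnostics(services: LangiumCoreServices, text: string): void {
    const result = services.parser.LangiumParser.parse(text);
    for (const d of result.lexerReport?.diagnostics ?? []) {
        console.warn(`[${d.severity ?? 'error'}] ${d.message} (offset ${d.offset})`);
    }
}
```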
11 changes: 8 additions & 3 deletions packages/langium/src/parser/lexer.ts
@@ -7,6 +7,7 @@
 import type { ILexingError, IMultiModeLexerDefinition, IToken, TokenType, TokenTypeDictionary, TokenVocabulary } from 'chevrotain';
 import type { LangiumCoreServices } from '../services.js';
 import { Lexer as ChevrotainLexer } from 'chevrotain';
+import type { LexingReport, TokenBuilder } from './token-builder.js';
 
 export interface LexerResult {
     /**
@@ -21,6 +22,7 @@ export interface LexerResult {
      */
     hidden: IToken[];
     errors: ILexingError[];
+    report?: LexingReport;
 }
 
 export interface Lexer {
@@ -31,10 +33,12 @@
 export class DefaultLexer implements Lexer {
 
     protected chevrotainLexer: ChevrotainLexer;
+    protected tokenBuilder: TokenBuilder;
     protected tokenTypes: TokenTypeDictionary;
 
-    constructor(services: LangiumCoreServices) {
-        const tokens = services.parser.TokenBuilder.buildTokens(services.Grammar, {
+    constructor( services: LangiumCoreServices) {
+        this.tokenBuilder = services.parser.TokenBuilder;
+        const tokens = this.tokenBuilder.buildTokens(services.Grammar, {
             caseInsensitive: services.LanguageMetaData.caseInsensitive
         });
         this.tokenTypes = this.toTokenTypeDictionary(tokens);
@@ -53,7 +57,8 @@ export class DefaultLexer implements Lexer {
         return {
             tokens: chevrotainResult.tokens,
             errors: chevrotainResult.errors,
-            hidden: chevrotainResult.groups.hidden ?? []
+            hidden: chevrotainResult.groups.hidden ?? [],
+            report: this.tokenBuilder.popLexingReport?.(text)
         };
     }
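
Because `popLexingReport` is optional on the `TokenBuilder` interface, the lexer degrades gracefully: a token builder written before this change simply yields an `undefined` report and nothing else breaks. A sketch of tokenizing through the lexer service:

```ts
import type { LangiumCoreServices } from 'langium';

// Sketch: the report is only present when the configured token builder
// implements the optional popLexingReport method.
function countLexingDiagnostics(services: LangiumCoreServices, text: string): number {
    const { report } = services.parser.Lexer.tokenize(text);
    return report?.diagnostics.length ?? 0;
}
```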
34 changes: 33 additions & 1 deletion packages/langium/src/parser/token-builder.ts
@@ -4,7 +4,7 @@
  * terms of the MIT License, which is available in the project root.
  ******************************************************************************/
 
-import type { CustomPatternMatcherFunc, TokenPattern, TokenType, TokenVocabulary } from 'chevrotain';
+import type { CustomPatternMatcherFunc, ILexingError, TokenPattern, TokenType, TokenVocabulary } from 'chevrotain';
 import type { AbstractRule, Grammar, Keyword, TerminalRule } from '../languages/generated/ast.js';
 import type { Stream } from '../utils/stream.js';
 import { Lexer } from 'chevrotain';
@@ -20,9 +20,31 @@ export interface TokenBuilderOptions {
 
 export interface TokenBuilder {
     buildTokens(grammar: Grammar, options?: TokenBuilderOptions): TokenVocabulary;
+    /**
+     * Produces a lexing report for the given text that was just tokenized using the tokens provided by this builder.
+     *
+     * @param text The text that was tokenized.
+     */
+    popLexingReport?(text: string): LexingReport;
 }
 
+/**
+ * A custom lexing report that can be produced by the token builder during the lexing process.
+ * Adopters need to ensure that the any custom fields are serializable so they can be sent across worker threads.
+ */
+export interface LexingReport {
+    diagnostics: LexingDiagnostic[];
+}
+
+export interface LexingDiagnostic extends ILexingError {
+    severity?: 'error' | 'warning' | 'info' | 'hint';
+}
+
 export class DefaultTokenBuilder implements TokenBuilder {
+    /**
+     * The list of diagnostics stored during the lexing process of a single text.
+     */
+    protected diagnostics: LexingDiagnostic[] = [];
 
     buildTokens(grammar: Grammar, options?: TokenBuilderOptions): TokenVocabulary {
         const reachableRules = stream(getAllReachableRules(grammar, false));
@@ -42,6 +64,16 @@ export class DefaultTokenBuilder implements TokenBuilder {
         return tokens;
     }
 
+    popLexingReport(_text: string): LexingReport {
+        return { diagnostics: this.popDiagnostics() };
+    }
+
+    protected popDiagnostics(): LexingDiagnostic[] {
+        const diagnostics = [...this.diagnostics];
+        this.diagnostics = [];
+        return diagnostics;
+    }
+
     protected buildTerminalTokens(rules: Stream<AbstractRule>): TokenType[] {
         return rules.filter(isTerminalRule).filter(e => !e.fragment)
             .map(terminal => this.buildTerminalToken(terminal)).toArray();
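
The default report simply drains whatever diagnostics were collected since the last pop, so no state leaks between two lexing runs. A small sketch of that behavior (the `recordNote` helper is hypothetical):

```ts
import { DefaultTokenBuilder } from 'langium';

// Sketch: a probe subclass to demonstrate the drain semantics of popLexingReport.
class ProbingTokenBuilder extends DefaultTokenBuilder {
    recordNote(message: string): void {
        this.diagnostics.push({ severity: 'info', message, offset: 0, length: 0, line: 1, column: 1 });
    }
}

const builder = new ProbingTokenBuilder();
builder.recordNote('first document');
console.log(builder.popLexingReport('').diagnostics.length); // 1
console.log(builder.popLexingReport('').diagnostics.length); // 0 (already drained)
```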
10 changes: 9 additions & 1 deletion packages/langium/src/serializer/hydrator.ts
@@ -18,6 +18,7 @@ import { isRootCstNode, isCompositeCstNode, isLeafCstNode, isAstNode, isReferenc
 import { streamAst } from '../utils/ast-utils.js';
 import { BiMap } from '../utils/collections.js';
 import { streamCst } from '../utils/cst-utils.js';
+import type { LexingReport } from '../parser/token-builder.js';
 
 /**
  * The hydrator service is responsible for allowing AST parse results to be sent across worker threads.
@@ -61,14 +62,20 @@ export class DefaultHydrator implements Hydrator {
 
     dehydrate(result: ParseResult<AstNode>): ParseResult<object> {
         return {
-            lexerErrors: result.lexerErrors,
+            lexerReport: result.lexerReport ? this.dehydrateLexerReport(result.lexerReport) : undefined,
             // We need to create shallow copies of the errors
             // The original errors inherit from the `Error` class, which is not transferable across worker threads
+            lexerErrors: result.lexerErrors.map(e => ({ ...e, message: e.message })),
             parserErrors: result.parserErrors.map(e => ({ ...e, message: e.message })),
             value: this.dehydrateAstNode(result.value, this.createDehyrationContext(result.value))
         };
     }
 
+    protected dehydrateLexerReport(lexerReport: LexingReport): LexingReport {
+        // By default, lexer reports are serializable
+        return lexerReport;
+    }
+
     protected createDehyrationContext(node: AstNode): DehydrateContext {
         const astNodes = new Map<AstNode, any>();
         const cstNodes = new Map<CstNode, any>();
@@ -162,6 +169,7 @@ export class DefaultHydrator implements Hydrator {
         }
         return {
             lexerErrors: result.lexerErrors,
+            lexerReport: result.lexerReport,
             parserErrors: result.parserErrors,
             value: this.hydrateAstNode(node, context) as T
         };
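
Adopters whose reports carry extra data can strip the non-serializable parts before the result crosses a worker boundary. A sketch, where the custom `hints` field is hypothetical:

```ts
import { DefaultHydrator } from 'langium';
import type { LexingReport } from 'langium';

// Hypothetical report type with extra, possibly non-serializable data.
interface MyLexingReport extends LexingReport {
    hints?: unknown[];
}

class MyHydrator extends DefaultHydrator {
    protected override dehydrateLexerReport(report: LexingReport): LexingReport {
        // Drop everything the worker boundary cannot carry.
        const { hints, ...serializable } = report as MyLexingReport;
        return serializable;
    }
}
```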
37 changes: 29 additions & 8 deletions packages/langium/src/validation/document-validator.ts
@@ -18,6 +18,7 @@ import { streamAst } from '../utils/ast-utils.js';
 import { tokenToRange } from '../utils/cst-utils.js';
 import { interruptAndCheck, isOperationCancelled } from '../utils/promise-utils.js';
 import { diagnosticData } from './validation-registry.js';
+import type { LexingDiagnostic } from '../parser/token-builder.js';
 
 export interface ValidationOptions {
     /**
@@ -97,21 +98,23 @@ export class DefaultDocumentValidator implements DocumentValidator {
     }
 
     protected processLexingErrors(parseResult: ParseResult, diagnostics: Diagnostic[], _options: ValidationOptions): void {
-        for (const lexerError of parseResult.lexerErrors) {
+        const lexerDiagnostics = [...parseResult.lexerErrors, ...parseResult.lexerReport?.diagnostics ?? []] as LexingDiagnostic[];
+        for (const lexerDiagnostic of lexerDiagnostics) {
+            const severity = lexerDiagnostic?.severity ?? 'error';
             const diagnostic: Diagnostic = {
-                severity: toDiagnosticSeverity('error'),
+                severity: toDiagnosticSeverity(severity),
                 range: {
                     start: {
-                        line: lexerError.line! - 1,
-                        character: lexerError.column! - 1
+                        line: lexerDiagnostic.line! - 1,
+                        character: lexerDiagnostic.column! - 1
                     },
                     end: {
-                        line: lexerError.line! - 1,
-                        character: lexerError.column! + lexerError.length - 1
+                        line: lexerDiagnostic.line! - 1,
+                        character: lexerDiagnostic.column! + lexerDiagnostic.length - 1
                     }
                 },
-                message: lexerError.message,
-                data: diagnosticData(DocumentValidator.LexingError),
+                message: lexerDiagnostic.message,
+                data: toDiagnosticData(severity),
                 source: this.getSource()
             };
             diagnostics.push(diagnostic);
@@ -245,8 +248,26 @@ export function toDiagnosticSeverity(severity: 'error' | 'warning' | 'info' | 'h
     }
 }
 
+export function toDiagnosticData(severity: 'error' | 'warning' | 'info' | 'hint'): DiagnosticData {
+    switch (severity) {
+        case 'error':
+            return diagnosticData(DocumentValidator.LexingError);
+        case 'warning':
+            return diagnosticData(DocumentValidator.LexingWarning);
+        case 'info':
+            return diagnosticData(DocumentValidator.LexingInfo);
+        case 'hint':
+            return diagnosticData(DocumentValidator.LexingHint);
+        default:
+            throw new Error('Invalid diagnostic severity: ' + severity);
+    }
+}
+
 export namespace DocumentValidator {
     export const LexingError = 'lexing-error';
+    export const LexingWarning = 'lexing-warning';
+    export const LexingInfo = 'lexing-info';
+    export const LexingHint = 'lexing-hint';
     export const ParsingError = 'parsing-error';
     export const LinkingError = 'linking-error';
 }
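
Downstream tooling can now tell the new lexing diagnostic kinds apart by their data code. A sketch, assuming the `diagnosticData` helper stores the code in `data.code`:

```ts
import { DocumentValidator } from 'langium';
import type { Diagnostic } from 'vscode-languageserver-types';

// Sketch: distinguish lexing-related diagnostics, e.g. to offer a quick fix
// only for plain lexing errors but not for warnings, infos, or hints.
function isLexingRelated(diagnostic: Diagnostic): boolean {
    const code = (diagnostic.data as { code?: string } | undefined)?.code;
    return code === DocumentValidator.LexingError
        || code === DocumentValidator.LexingWarning
        || code === DocumentValidator.LexingInfo
        || code === DocumentValidator.LexingHint;
}
```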