Extend Lexer interface to expose diagnostics and data as a lexing report (#1668)

- Add support for reporting diagnostics during the lexing process
- Properly map diagnostic severities
- Mark the method and the report as optional for backwards compatibility

For indentation:
- Add dedent tokens to the report and keep them until consumed, for state management
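
For adopters, a minimal sketch of feeding the new report from a custom token builder. It assumes `DefaultTokenBuilder` and `LexingReport` are re-exported from the `langium` package; the tab check itself is purely illustrative:

```ts
import { DefaultTokenBuilder } from 'langium';
import type { LexingReport } from 'langium';

// Illustrative only: flag every tab character as a 'warning' lexing diagnostic.
class TabReportingTokenBuilder extends DefaultTokenBuilder {
    override popLexingReport(text: string): LexingReport {
        for (let offset = text.indexOf('\t'); offset >= 0; offset = text.indexOf('\t', offset + 1)) {
            this.diagnostics.push({
                severity: 'warning',
                message: 'Tab character encountered; prefer spaces.',
                offset,
                length: 1,
                line: text.substring(0, offset).split(/\r\n|\r|\n/).length,
                column: 0
            });
        }
        // The default implementation drains `this.diagnostics` into the report.
        return super.popLexingReport(text);
    }
}
```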
martin-fleck-at authored Sep 6, 2024
1 parent 707d2f7 commit 51d99a6
Showing 7 changed files with 155 additions and 39 deletions.
68 changes: 50 additions & 18 deletions packages/langium/src/parser/indentation-aware.ts
@@ -4,9 +4,9 @@
  * terms of the MIT License, which is available in the project root.
  ******************************************************************************/
 
-import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition } from 'chevrotain';
+import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition, TokenVocabulary } from 'chevrotain';
 import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
-import type { TokenBuilderOptions } from './token-builder.js';
+import type { LexingReport, TokenBuilderOptions } from './token-builder.js';
 import type { LexerResult } from './lexer.js';
 import type { LangiumCoreServices } from '../services.js';
 import { createToken, createTokenInstance, Lexer } from 'chevrotain';
@@ -69,22 +69,28 @@ export enum LexingMode {
     IGNORE_INDENTATION = 'ignore-indentation',
 }
 
+export interface IndentationLexingReport extends LexingReport {
+    /** Dedent tokens that are necessary to close the remaining indents. */
+    remainingDedents: IToken[];
+}
+
 /**
  * A token builder that is sensitive to indentation in the input text.
  * It will generate tokens for indentation and dedentation based on the indentation level.
  *
  * The first generic parameter corresponds to the names of terminal tokens,
- * while the second one corresonds to the names of keyword tokens.
+ * while the second one corresponds to the names of keyword tokens.
  * Both parameters are optional and can be imported from `./generated/ast.js`.
  *
  * Inspired by https://github.com/chevrotain/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js
  */
 export class IndentationAwareTokenBuilder<Terminals extends string = string, KeywordName extends string = string> extends DefaultTokenBuilder {
     /**
-     * The stack in which all the previous matched indentation levels are stored
-     * to understand how deep a the next tokens are nested.
+     * The stack stores all the previously matched indentation levels to understand how deeply the next tokens are nested.
+     * The stack is valid for lexing
      */
     protected indentationStack: number[] = [0];
 
     readonly options: IndentationTokenBuilderOptions<Terminals, KeywordName>;
 
     /**
@@ -123,7 +129,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
         });
     }
 
-    override buildTokens(grammar: Grammar, options?: TokenBuilderOptions | undefined) {
+    override buildTokens(grammar: Grammar, options?: TokenBuilderOptions | undefined): TokenVocabulary {
         const tokenTypes = super.buildTokens(grammar, options);
         if (!isTokenTypeArray(tokenTypes)) {
             throw new Error('Invalid tokens built by default builder');
@@ -173,6 +179,14 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
         }
     }
 
+    override popLexingReport(text: string): IndentationLexingReport {
+        const result = super.popLexingReport(text);
+        return {
+            ...result,
+            remainingDedents: this.popRemainingDedents(text),
+        };
+    }
+
     /**
      * Helper function to check if the current position is the start of a new line.
      *
@@ -191,7 +205,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
      * @param offset The current position at which to attempt a match
     * @returns The current and previous indentation levels and the matched whitespace
      */
-    protected matchWhitespace(text: string, offset: number) {
+    protected matchWhitespace(text: string, offset: number, _tokens: IToken[], _groups: Record<string, IToken[]>): { currIndentLevel: number, prevIndentLevel: number, match: RegExpExecArray | null } {
         this.whitespaceRegExp.lastIndex = offset;
         const match = this.whitespaceRegExp.exec(text);
         return {
@@ -210,8 +224,8 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
      * @param offset Current position in the input string
      * @returns The indentation token instance
      */
-    protected createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number) {
-        const lineNumber = text.substring(0, offset).split(/\r\n|\r|\n/).length;
+    protected createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number): IToken {
+        const lineNumber = this.getLineNumber(text, offset);
         return createTokenInstance(
             tokenType,
             image,
@@ -221,6 +235,17 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
         );
     }
 
+    /**
+     * Helper function to get the line number at a given offset.
+     *
+     * @param text Full input string, used to calculate the line number
+     * @param offset Current position in the input string
+     * @returns The line number at the given offset
+     */
+    protected getLineNumber(text: string, offset: number): number {
+        return text.substring(0, offset).split(/\r\n|\r|\n/).length;
+    }
+
     /**
      * A custom pattern for matching indents
     *
@@ -229,14 +254,14 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
      * @param tokens Previously scanned Tokens
      * @param groups Token Groups
      */
-    protected indentMatcher(text: string, offset: number, tokens: IToken[], _groups: Record<string, IToken[]>): ReturnType<CustomPatternMatcherFunc> {
+    protected indentMatcher(text: string, offset: number, tokens: IToken[], groups: Record<string, IToken[]>): ReturnType<CustomPatternMatcherFunc> {
        const { indentTokenName } = this.options;
 
        if (!this.isStartOfLine(text, offset)) {
            return null;
        }
 
-        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset);
+        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset, tokens, groups);
 
        if (currIndentLevel <= prevIndentLevel) {
            // shallower indentation (should be matched by dedent)
@@ -266,14 +291,14 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
      * @param tokens Previously scanned Tokens
      * @param groups Token Groups
      */
-    protected dedentMatcher(text: string, offset: number, tokens: IToken[], _groups: Record<string, IToken[]>): ReturnType<CustomPatternMatcherFunc> {
+    protected dedentMatcher(text: string, offset: number, tokens: IToken[], groups: Record<string, IToken[]>): ReturnType<CustomPatternMatcherFunc> {
        const { dedentTokenName } = this.options;
 
        if (!this.isStartOfLine(text, offset)) {
            return null;
        }
 
-        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset);
+        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset, tokens, groups);
 
        if (currIndentLevel >= prevIndentLevel) {
            // bigger indentation (should be matched by indent)
@@ -285,9 +310,14 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
 
         // Any dedent must match some previous indentation level.
         if (matchIndentIndex === -1) {
-            console.error(`Invalid dedent level ${currIndentLevel} at offset: ${offset}. Current indetation stack: ${this.indentationStack}`);
-            // throwing an error would crash the language server
-            // TODO: find a way to report error diagnostics message
+            this.diagnostics.push({
+                severity: 'error',
+                message: `Invalid dedent level ${currIndentLevel} at offset: ${offset}. Current indentation stack: ${this.indentationStack}`,
+                offset,
+                length: match?.[0]?.length ?? 0,
+                line: this.getLineNumber(text, offset),
+                column: 0
+            });
             return null;
         }
 
@@ -375,9 +405,11 @@ export class IndentationAwareLexer extends DefaultLexer {
     override tokenize(text: string): LexerResult {
         const result = super.tokenize(text);
 
-        // reset the indent stack between processing of different text inputs
-        const remainingDedents = this.indentationTokenBuilder.popRemainingDedents(text);
+        // consuming all remaining dedents and remove them as they might not be serializable
+        const report = result.report as IndentationLexingReport;
+        const remainingDedents = report.remainingDedents;
         result.tokens.push(...remainingDedents);
+        report.remainingDedents = [];
 
         // remove any "indent-dedent" pair with an empty body as these are typically
         // added by comments or lines with just whitespace but have no real value
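
For orientation, these classes are typically wired up through the language's service module. A sketch following the usual Langium customization pattern (the module name is a placeholder):

```ts
import { IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium';
import type { LangiumCoreServices } from 'langium';

// Sketch: override the parser services so that indentation tokens, and now
// also the IndentationLexingReport, flow through the lexer.
export const MyLanguageModule = {
    parser: {
        TokenBuilder: () => new IndentationAwareTokenBuilder(),
        Lexer: (services: LangiumCoreServices) => new IndentationAwareLexer(services),
    },
};
```

With this change, an invalid dedent surfaces as an error diagnostic in the report instead of a `console.error` that the language server could not forward to clients.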
5 changes: 4 additions & 1 deletion packages/langium/src/parser/langium-parser.ts
@@ -19,11 +19,13 @@ import { isAssignment, isCrossReference, isKeyword } from '../languages/generate
 import { getExplicitRuleType, isDataTypeRule } from '../utils/grammar-utils.js';
 import { assignMandatoryProperties, getContainerOfType, linkContentToContainer } from '../utils/ast-utils.js';
 import { CstNodeBuilder } from './cst-node-builder.js';
+import type { LexingReport } from './token-builder.js';
 
 export type ParseResult<T = AstNode> = {
     value: T,
     parserErrors: IRecognitionException[],
-    lexerErrors: ILexingError[]
+    lexerErrors: ILexingError[],
+    lexerReport?: LexingReport
 }
 
 export const DatatypeSymbol = Symbol('Datatype');
@@ -240,6 +242,7 @@ export class LangiumParser extends AbstractLangiumParser {
         return {
             value: result,
             lexerErrors: lexerResult.errors,
+            lexerReport: lexerResult.report,
             parserErrors: this.wrapper.errors
         };
     }
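
The report now travels with every parse result, so callers can inspect lexing diagnostics without talking to the lexer directly. A sketch, where `services` stands for an already-created set of core services for your language:

```ts
import type { LangiumCoreServices } from 'langium';

// Sketch: surface lexing diagnostics from a plain parse.
function logLexingDiagnostics(services: LangiumCoreServices, text: string): void {
    const result = services.parser.LangiumParser.parse(text);
    for (const d of result.lexerReport?.diagnostics ?? []) {
        console.warn(`[${d.severity ?? 'error'}] ${d.message} (offset ${d.offset})`);
    }
}
```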
11 changes: 8 additions & 3 deletions packages/langium/src/parser/lexer.ts
@@ -7,6 +7,7 @@
 import type { ILexingError, IMultiModeLexerDefinition, IToken, TokenType, TokenTypeDictionary, TokenVocabulary } from 'chevrotain';
 import type { LangiumCoreServices } from '../services.js';
 import { Lexer as ChevrotainLexer } from 'chevrotain';
+import type { LexingReport, TokenBuilder } from './token-builder.js';
 
 export interface LexerResult {
     /**
@@ -21,6 +22,7 @@ export interface LexerResult {
      */
     hidden: IToken[];
     errors: ILexingError[];
+    report?: LexingReport;
 }
 
 export interface Lexer {
@@ -31,10 +33,12 @@
 export class DefaultLexer implements Lexer {
 
     protected chevrotainLexer: ChevrotainLexer;
+    protected tokenBuilder: TokenBuilder;
     protected tokenTypes: TokenTypeDictionary;
 
-    constructor(services: LangiumCoreServices) {
-        const tokens = services.parser.TokenBuilder.buildTokens(services.Grammar, {
+    constructor( services: LangiumCoreServices) {
+        this.tokenBuilder = services.parser.TokenBuilder;
+        const tokens = this.tokenBuilder.buildTokens(services.Grammar, {
             caseInsensitive: services.LanguageMetaData.caseInsensitive
         });
         this.tokenTypes = this.toTokenTypeDictionary(tokens);
@@ -53,7 +57,8 @@ export class DefaultLexer implements Lexer {
         return {
             tokens: chevrotainResult.tokens,
             errors: chevrotainResult.errors,
-            hidden: chevrotainResult.groups.hidden ?? []
+            hidden: chevrotainResult.groups.hidden ?? [],
+            report: this.tokenBuilder.popLexingReport?.(text)
         };
     }
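
Because `popLexingReport` is optional on the `TokenBuilder` interface, the lexer degrades gracefully: a token builder written before this change simply yields an `undefined` report and nothing else breaks. A sketch of tokenizing through the lexer service:

```ts
import type { LangiumCoreServices } from 'langium';

// Sketch: the report is only present when the configured token builder
// implements the optional popLexingReport method.
function countLexingDiagnostics(services: LangiumCoreServices, text: string): number {
    const { report } = services.parser.Lexer.tokenize(text);
    return report?.diagnostics.length ?? 0;
}
```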
34 changes: 33 additions & 1 deletion packages/langium/src/parser/token-builder.ts
@@ -4,7 +4,7 @@
  * terms of the MIT License, which is available in the project root.
  ******************************************************************************/
 
-import type { CustomPatternMatcherFunc, TokenPattern, TokenType, TokenVocabulary } from 'chevrotain';
+import type { CustomPatternMatcherFunc, ILexingError, TokenPattern, TokenType, TokenVocabulary } from 'chevrotain';
 import type { AbstractRule, Grammar, Keyword, TerminalRule } from '../languages/generated/ast.js';
 import type { Stream } from '../utils/stream.js';
 import { Lexer } from 'chevrotain';
@@ -20,9 +20,31 @@ export interface TokenBuilderOptions {
 
 export interface TokenBuilder {
     buildTokens(grammar: Grammar, options?: TokenBuilderOptions): TokenVocabulary;
+    /**
+     * Produces a lexing report for the given text that was just tokenized using the tokens provided by this builder.
+     *
+     * @param text The text that was tokenized.
+     */
+    popLexingReport?(text: string): LexingReport;
 }
 
+/**
+ * A custom lexing report that can be produced by the token builder during the lexing process.
+ * Adopters need to ensure that the any custom fields are serializable so they can be sent across worker threads.
+ */
+export interface LexingReport {
+    diagnostics: LexingDiagnostic[];
+}
+
+export interface LexingDiagnostic extends ILexingError {
+    severity?: 'error' | 'warning' | 'info' | 'hint';
+}
+
 export class DefaultTokenBuilder implements TokenBuilder {
+    /**
+     * The list of diagnostics stored during the lexing process of a single text.
+     */
+    protected diagnostics: LexingDiagnostic[] = [];
 
     buildTokens(grammar: Grammar, options?: TokenBuilderOptions): TokenVocabulary {
         const reachableRules = stream(getAllReachableRules(grammar, false));
@@ -42,6 +64,16 @@ export class DefaultTokenBuilder implements TokenBuilder {
         return tokens;
     }
 
+    popLexingReport(_text: string): LexingReport {
+        return { diagnostics: this.popDiagnostics() };
+    }
+
+    protected popDiagnostics(): LexingDiagnostic[] {
+        const diagnostics = [...this.diagnostics];
+        this.diagnostics = [];
+        return diagnostics;
+    }
+
     protected buildTerminalTokens(rules: Stream<AbstractRule>): TokenType[] {
         return rules.filter(isTerminalRule).filter(e => !e.fragment)
             .map(terminal => this.buildTerminalToken(terminal)).toArray();
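
The default report simply drains whatever diagnostics were collected since the last pop, so no state leaks between two lexing runs. A small sketch of that behavior (the `recordNote` helper is hypothetical):

```ts
import { DefaultTokenBuilder } from 'langium';

// Sketch: a probe subclass to demonstrate the drain semantics of popLexingReport.
class ProbingTokenBuilder extends DefaultTokenBuilder {
    recordNote(message: string): void {
        this.diagnostics.push({ severity: 'info', message, offset: 0, length: 0, line: 1, column: 1 });
    }
}

const builder = new ProbingTokenBuilder();
builder.recordNote('first document');
console.log(builder.popLexingReport('').diagnostics.length); // 1
console.log(builder.popLexingReport('').diagnostics.length); // 0 (already drained)
```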
10 changes: 9 additions & 1 deletion packages/langium/src/serializer/hydrator.ts
@@ -18,6 +18,7 @@ import { isRootCstNode, isCompositeCstNode, isLeafCstNode, isAstNode, isReferenc
 import { streamAst } from '../utils/ast-utils.js';
 import { BiMap } from '../utils/collections.js';
 import { streamCst } from '../utils/cst-utils.js';
+import type { LexingReport } from '../parser/token-builder.js';
 
 /**
  * The hydrator service is responsible for allowing AST parse results to be sent across worker threads.
@@ -61,14 +62,20 @@ export class DefaultHydrator implements Hydrator {
 
     dehydrate(result: ParseResult<AstNode>): ParseResult<object> {
         return {
-            lexerErrors: result.lexerErrors,
+            lexerReport: result.lexerReport ? this.dehydrateLexerReport(result.lexerReport) : undefined,
             // We need to create shallow copies of the errors
             // The original errors inherit from the `Error` class, which is not transferable across worker threads
+            lexerErrors: result.lexerErrors.map(e => ({ ...e, message: e.message })),
             parserErrors: result.parserErrors.map(e => ({ ...e, message: e.message })),
             value: this.dehydrateAstNode(result.value, this.createDehyrationContext(result.value))
         };
     }
 
+    protected dehydrateLexerReport(lexerReport: LexingReport): LexingReport {
+        // By default, lexer reports are serializable
+        return lexerReport;
+    }
+
     protected createDehyrationContext(node: AstNode): DehydrateContext {
         const astNodes = new Map<AstNode, any>();
         const cstNodes = new Map<CstNode, any>();
@@ -162,6 +169,7 @@ export class DefaultHydrator implements Hydrator {
         }
         return {
             lexerErrors: result.lexerErrors,
+            lexerReport: result.lexerReport,
             parserErrors: result.parserErrors,
             value: this.hydrateAstNode(node, context) as T
         };
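
Adopters whose reports carry extra data can strip the non-serializable parts before the result crosses a worker boundary. A sketch, where the custom `hints` field is hypothetical:

```ts
import { DefaultHydrator } from 'langium';
import type { LexingReport } from 'langium';

// Hypothetical report type with extra, possibly non-serializable data.
interface MyLexingReport extends LexingReport {
    hints?: unknown[];
}

class MyHydrator extends DefaultHydrator {
    protected override dehydrateLexerReport(report: LexingReport): LexingReport {
        // Drop everything the worker boundary cannot carry.
        const { hints, ...serializable } = report as MyLexingReport;
        return serializable;
    }
}
```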
37 changes: 29 additions & 8 deletions packages/langium/src/validation/document-validator.ts
@@ -18,6 +18,7 @@ import { streamAst } from '../utils/ast-utils.js';
 import { tokenToRange } from '../utils/cst-utils.js';
 import { interruptAndCheck, isOperationCancelled } from '../utils/promise-utils.js';
 import { diagnosticData } from './validation-registry.js';
+import type { LexingDiagnostic } from '../parser/token-builder.js';
 
 export interface ValidationOptions {
     /**
@@ -97,21 +98,23 @@ export class DefaultDocumentValidator implements DocumentValidator {
     }
 
     protected processLexingErrors(parseResult: ParseResult, diagnostics: Diagnostic[], _options: ValidationOptions): void {
-        for (const lexerError of parseResult.lexerErrors) {
+        const lexerDiagnostics = [...parseResult.lexerErrors, ...parseResult.lexerReport?.diagnostics ?? []] as LexingDiagnostic[];
+        for (const lexerDiagnostic of lexerDiagnostics) {
+            const severity = lexerDiagnostic?.severity ?? 'error';
             const diagnostic: Diagnostic = {
-                severity: toDiagnosticSeverity('error'),
+                severity: toDiagnosticSeverity(severity),
                 range: {
                     start: {
-                        line: lexerError.line! - 1,
-                        character: lexerError.column! - 1
+                        line: lexerDiagnostic.line! - 1,
+                        character: lexerDiagnostic.column! - 1
                     },
                     end: {
-                        line: lexerError.line! - 1,
-                        character: lexerError.column! + lexerError.length - 1
+                        line: lexerDiagnostic.line! - 1,
+                        character: lexerDiagnostic.column! + lexerDiagnostic.length - 1
                     }
                 },
-                message: lexerError.message,
-                data: diagnosticData(DocumentValidator.LexingError),
+                message: lexerDiagnostic.message,
+                data: toDiagnosticData(severity),
                 source: this.getSource()
             };
             diagnostics.push(diagnostic);
@@ -245,8 +248,26 @@ export function toDiagnosticSeverity(severity: 'error' | 'warning' | 'info' | 'h
     }
 }
 
+export function toDiagnosticData(severity: 'error' | 'warning' | 'info' | 'hint'): DiagnosticData {
+    switch (severity) {
+        case 'error':
+            return diagnosticData(DocumentValidator.LexingError);
+        case 'warning':
+            return diagnosticData(DocumentValidator.LexingWarning);
+        case 'info':
+            return diagnosticData(DocumentValidator.LexingInfo);
+        case 'hint':
+            return diagnosticData(DocumentValidator.LexingHint);
+        default:
+            throw new Error('Invalid diagnostic severity: ' + severity);
+    }
+}
+
 export namespace DocumentValidator {
     export const LexingError = 'lexing-error';
+    export const LexingWarning = 'lexing-warning';
+    export const LexingInfo = 'lexing-info';
+    export const LexingHint = 'lexing-hint';
     export const ParsingError = 'parsing-error';
     export const LinkingError = 'linking-error';
 }
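
Downstream tooling can now tell the new lexing diagnostic kinds apart by their data code. A sketch, assuming the `diagnosticData` helper stores the code in `data.code`:

```ts
import { DocumentValidator } from 'langium';
import type { Diagnostic } from 'vscode-languageserver-types';

// Sketch: distinguish lexing-related diagnostics, e.g. to offer a quick fix
// only for plain lexing errors but not for warnings, infos, or hints.
function isLexingRelated(diagnostic: Diagnostic): boolean {
    const code = (diagnostic.data as { code?: string } | undefined)?.code;
    return code === DocumentValidator.LexingError
        || code === DocumentValidator.LexingWarning
        || code === DocumentValidator.LexingInfo
        || code === DocumentValidator.LexingHint;
}
```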