From 27587eca8ca778cdb650e2abb13ec162b9a0ae4a Mon Sep 17 00:00:00 2001 From: Abdelrahman Abounegm Date: Sat, 27 Jul 2024 21:17:48 +0300 Subject: [PATCH] Support ignoring indentation within delimiters --- .../langium/src/parser/indentation-aware.ts | 39 +++++- .../test/parser/indentation-aware.test.ts | 112 ++++++++++++++++-- 2 files changed, 138 insertions(+), 13 deletions(-) diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts index f996d3674..9f9ffda15 100644 --- a/packages/langium/src/parser/indentation-aware.ts +++ b/packages/langium/src/parser/indentation-aware.ts @@ -4,7 +4,7 @@ * terms of the MIT License, which is available in the project root. ******************************************************************************/ -import type { CustomPatternMatcherFunc, TokenType, IToken } from 'chevrotain'; +import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition } from 'chevrotain'; import type { Grammar, TerminalRule } from '../languages/generated/ast.js'; import type { TokenBuilderOptions } from './token-builder.js'; import type { LexerResult } from './lexer.js'; @@ -44,14 +44,30 @@ export interface IndentationTokenBuilderOptions } export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = { indentTokenName: 'INDENT', dedentTokenName: 'DEDENT', whitespaceTokenName: 'WS', + ignoreIndentationDelimeters: [], }; +export enum LexingMode { + REGULAR = 'indentation-sensitive', + IGNORE_INDENTATION = 'ignore-indentation', +} + /** * A token builder that is sensitive to indentation in the input text. * It will generate tokens for indentation and dedentation based on the indentation level. @@ -130,7 +146,16 @@ export class IndentationAwareTokenBuilder ext if (!dedent || !indent || !ws) { throw new Error('Some indentation/whitespace tokens not found!'); } - return [dedent, indent, ws, ...otherTokens]; + + const multiModeLexerDef: IMultiModeLexerDefinition = { + modes: { + [LexingMode.REGULAR]: [dedent, indent, ...otherTokens, ws], + [LexingMode.IGNORE_INDENTATION]: [...otherTokens, ws], + }, + defaultMode: LexingMode.REGULAR, + }; + + return multiModeLexerDef; } /** @@ -270,7 +295,7 @@ export class IndentationAwareTokenBuilder ext protected override buildTerminalToken(terminal: TerminalRule): TokenType { const tokenType = super.buildTerminalToken(terminal); - const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options; + const { indentTokenName, dedentTokenName, whitespaceTokenName, ignoreIndentationDelimeters } = this.options; if (tokenType.name === indentTokenName) { return this.indentTokenType; @@ -284,6 +309,14 @@ export class IndentationAwareTokenBuilder ext }); } + for (const [begin, end] of ignoreIndentationDelimeters) { + if (tokenType.name === begin) { + tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION; + } else if (tokenType.name === end) { + tokenType.POP_MODE = true; + } + } + return tokenType; } diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts index 9afc89573..0839c5a71 100644 --- a/packages/langium/test/parser/indentation-aware.test.ts +++ b/packages/langium/test/parser/indentation-aware.test.ts @@ -5,7 +5,7 @@ ******************************************************************************/ import type { TokenType } from '@chevrotain/types'; -import type { AstNode, Grammar, LangiumParser, Lexer, Module } from 'langium'; +import type { AstNode, Grammar, IndentationTokenBuilderOptions, LangiumParser, Lexer, Module } from 'langium'; import { beforeEach, describe, expect, test } from 'vitest'; import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium'; import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar'; @@ -20,25 +20,26 @@ const tokenBuilder = new IndentationAwareTokenBuilder(); async function getTokens(grammarString: string): Promise { const grammar = (await helper(grammarString)).parseResult.value; - return tokenBuilder.buildTokens(grammar) as TokenType[]; + const { modes, defaultMode } = tokenBuilder.buildTokens(grammar); + return modes[defaultMode] as TokenType[]; } -async function getLexer(grammar: string): Promise { - const services = await createIndentationAwareServices(grammar); +async function getLexer(grammar: string, options?: Partial): Promise { + const services = await createIndentationAwareServices(grammar, options); return services.parser.Lexer; } -async function getParser(grammar: string): Promise { - const services = await createIndentationAwareServices(grammar); +async function getParser(grammar: string, options?: Partial): Promise { + const services = await createIndentationAwareServices(grammar, options); return services.parser.LangiumParser; } -async function createIndentationAwareServices(grammar: string): Promise { +async function createIndentationAwareServices(grammar: string, options?: Partial): Promise { const services = await createServicesForGrammar({ grammar, module: { parser: { - TokenBuilder: () => new IndentationAwareTokenBuilder(), + TokenBuilder: () => new IndentationAwareTokenBuilder(options), Lexer: services => new IndentationAwareLexer(services) } } satisfies Module @@ -68,10 +69,9 @@ describe('IndentationAwareTokenBuilder', () => { expect(tokenTypes).toHaveLength(5); - const [dedent, indent, ws] = tokenTypes; + const [dedent, indent] = tokenTypes; expect(dedent.name).toBe('DEDENT'); expect(indent.name).toBe('INDENT'); - expect(ws.name).toBe('WS'); }); test('Modifies indent/dedent patterns to be functions', async () => { @@ -200,6 +200,98 @@ describe('IndentationAwareLexer', () => { }); +describe('IndentationAwareTokenBuilder#ignoreIndentationDelimeters', async () => { + + const grammar = ` + grammar PythonIfWithLists + + entry Statement: (If | Return)*; + + If: + 'if' condition=BOOLEAN ':' + INDENT thenBlock+=Statement+ DEDENT + ('else' ':' INDENT elseBlock+=Statement+ DEDENT)?; + + Return: 'return' value=Expression; + + Expression: List | Tuple | BOOLEAN; + + Tuple: L_PAREN (elements+=Expression (',' elements+=Expression)*)? R_PAREN; + List: L_BRACKET (elements+=Expression (',' elements+=Expression)*)? R_BRACKET; + + terminal BOOLEAN returns boolean: /true|false/; + terminal INDENT: 'synthetic:indent'; + terminal DEDENT: 'synthetic:dedent'; + terminal L_PAREN: '('; + terminal R_PAREN: ')'; + terminal L_BRACKET: '['; + terminal R_BRACKET: ']'; + hidden terminal NL: /[\\r\\n]+/; + hidden terminal WS: /[\\t ]+/; + hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/; + `; + + const lexer = await getLexer(grammar, { + ignoreIndentationDelimeters: [ + ['L_PAREN', 'R_PAREN'], + ['L_BRACKET', 'R_BRACKET'], + ], + }); + + test('should behave as usual without the given tokens in the input', async () => { + const { errors } = lexer.tokenize(expandToString` + if true: + return false + else: + return true + `); + expect(errors).toHaveLength(0); + }); + + test('should ignore indentation inside the given delimeters', async () => { + const { errors, tokens } = lexer.tokenize(expandToString` + return [ + false, + true, // including inconsitent indentation + true + ] + return (true, + false + ) + `); + + expect(errors).toHaveLength(0); + + const tokenNames = tokens.map(token => token.tokenType.name); + expect(tokenNames).not.toContain('INDENT'); + expect(tokenNames).not.toContain('DEDENT'); + }); + + test('should handle nested delimeters', async () => { + const { errors, tokens } = lexer.tokenize(expandToString` + return [ + [ + false, + true + ], + ([true, + true], + false) + [ + true + ] + ] + `); + + expect(errors).toHaveLength(0); + + const tokenNames = tokens.map(token => token.tokenType.name); + expect(tokenNames).not.toContain('INDENT'); + expect(tokenNames).not.toContain('DEDENT'); + }); + +}); + describe('IndentationAware parsing', () => { const sampleGrammar = `