From 27587eca8ca778cdb650e2abb13ec162b9a0ae4a Mon Sep 17 00:00:00 2001 From: Abdelrahman Abounegm Date: Sat, 27 Jul 2024 21:17:48 +0300 Subject: [PATCH 1/6] Support ignoring indentation within delimiters --- .../langium/src/parser/indentation-aware.ts | 39 +++++- .../test/parser/indentation-aware.test.ts | 112 ++++++++++++++++-- 2 files changed, 138 insertions(+), 13 deletions(-) diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts index f996d3674..9f9ffda15 100644 --- a/packages/langium/src/parser/indentation-aware.ts +++ b/packages/langium/src/parser/indentation-aware.ts @@ -4,7 +4,7 @@ * terms of the MIT License, which is available in the project root. ******************************************************************************/ -import type { CustomPatternMatcherFunc, TokenType, IToken } from 'chevrotain'; +import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition } from 'chevrotain'; import type { Grammar, TerminalRule } from '../languages/generated/ast.js'; import type { TokenBuilderOptions } from './token-builder.js'; import type { LexerResult } from './lexer.js'; @@ -44,14 +44,30 @@ export interface IndentationTokenBuilderOptions } export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = { indentTokenName: 'INDENT', dedentTokenName: 'DEDENT', whitespaceTokenName: 'WS', + ignoreIndentationDelimeters: [], }; +export enum LexingMode { + REGULAR = 'indentation-sensitive', + IGNORE_INDENTATION = 'ignore-indentation', +} + /** * A token builder that is sensitive to indentation in the input text. * It will generate tokens for indentation and dedentation based on the indentation level. @@ -130,7 +146,16 @@ export class IndentationAwareTokenBuilder ext if (!dedent || !indent || !ws) { throw new Error('Some indentation/whitespace tokens not found!'); } - return [dedent, indent, ws, ...otherTokens]; + + const multiModeLexerDef: IMultiModeLexerDefinition = { + modes: { + [LexingMode.REGULAR]: [dedent, indent, ...otherTokens, ws], + [LexingMode.IGNORE_INDENTATION]: [...otherTokens, ws], + }, + defaultMode: LexingMode.REGULAR, + }; + + return multiModeLexerDef; } /** @@ -270,7 +295,7 @@ export class IndentationAwareTokenBuilder ext protected override buildTerminalToken(terminal: TerminalRule): TokenType { const tokenType = super.buildTerminalToken(terminal); - const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options; + const { indentTokenName, dedentTokenName, whitespaceTokenName, ignoreIndentationDelimeters } = this.options; if (tokenType.name === indentTokenName) { return this.indentTokenType; @@ -284,6 +309,14 @@ export class IndentationAwareTokenBuilder ext }); } + for (const [begin, end] of ignoreIndentationDelimeters) { + if (tokenType.name === begin) { + tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION; + } else if (tokenType.name === end) { + tokenType.POP_MODE = true; + } + } + return tokenType; } diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts index 9afc89573..0839c5a71 100644 --- a/packages/langium/test/parser/indentation-aware.test.ts +++ b/packages/langium/test/parser/indentation-aware.test.ts @@ -5,7 +5,7 @@ ******************************************************************************/ import type { TokenType } from '@chevrotain/types'; -import type { AstNode, Grammar, LangiumParser, Lexer, Module } from 'langium'; +import type { AstNode, Grammar, IndentationTokenBuilderOptions, LangiumParser, Lexer, Module } from 'langium'; import { beforeEach, describe, expect, test } from 'vitest'; import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium'; import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar'; @@ -20,25 +20,26 @@ const tokenBuilder = new IndentationAwareTokenBuilder(); async function getTokens(grammarString: string): Promise { const grammar = (await helper(grammarString)).parseResult.value; - return tokenBuilder.buildTokens(grammar) as TokenType[]; + const { modes, defaultMode } = tokenBuilder.buildTokens(grammar); + return modes[defaultMode] as TokenType[]; } -async function getLexer(grammar: string): Promise { - const services = await createIndentationAwareServices(grammar); +async function getLexer(grammar: string, options?: Partial): Promise { + const services = await createIndentationAwareServices(grammar, options); return services.parser.Lexer; } -async function getParser(grammar: string): Promise { - const services = await createIndentationAwareServices(grammar); +async function getParser(grammar: string, options?: Partial): Promise { + const services = await createIndentationAwareServices(grammar, options); return services.parser.LangiumParser; } -async function createIndentationAwareServices(grammar: string): Promise { +async function createIndentationAwareServices(grammar: string, options?: Partial): Promise { const services = await createServicesForGrammar({ grammar, module: { parser: { - TokenBuilder: () => new IndentationAwareTokenBuilder(), + TokenBuilder: () => new IndentationAwareTokenBuilder(options), Lexer: services => new IndentationAwareLexer(services) } } satisfies Module @@ -68,10 +69,9 @@ describe('IndentationAwareTokenBuilder', () => { expect(tokenTypes).toHaveLength(5); - const [dedent, indent, ws] = tokenTypes; + const [dedent, indent] = tokenTypes; expect(dedent.name).toBe('DEDENT'); expect(indent.name).toBe('INDENT'); - expect(ws.name).toBe('WS'); }); test('Modifies indent/dedent patterns to be functions', async () => { @@ -200,6 +200,98 @@ describe('IndentationAwareLexer', () => { }); +describe('IndentationAwareTokenBuilder#ignoreIndentationDelimeters', async () => { + + const grammar = ` + grammar PythonIfWithLists + + entry Statement: (If | Return)*; + + If: + 'if' condition=BOOLEAN ':' + INDENT thenBlock+=Statement+ DEDENT + ('else' ':' INDENT elseBlock+=Statement+ DEDENT)?; + + Return: 'return' value=Expression; + + Expression: List | Tuple | BOOLEAN; + + Tuple: L_PAREN (elements+=Expression (',' elements+=Expression)*)? R_PAREN; + List: L_BRACKET (elements+=Expression (',' elements+=Expression)*)? R_BRACKET; + + terminal BOOLEAN returns boolean: /true|false/; + terminal INDENT: 'synthetic:indent'; + terminal DEDENT: 'synthetic:dedent'; + terminal L_PAREN: '('; + terminal R_PAREN: ')'; + terminal L_BRACKET: '['; + terminal R_BRACKET: ']'; + hidden terminal NL: /[\\r\\n]+/; + hidden terminal WS: /[\\t ]+/; + hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/; + `; + + const lexer = await getLexer(grammar, { + ignoreIndentationDelimeters: [ + ['L_PAREN', 'R_PAREN'], + ['L_BRACKET', 'R_BRACKET'], + ], + }); + + test('should behave as usual without the given tokens in the input', async () => { + const { errors } = lexer.tokenize(expandToString` + if true: + return false + else: + return true + `); + expect(errors).toHaveLength(0); + }); + + test('should ignore indentation inside the given delimeters', async () => { + const { errors, tokens } = lexer.tokenize(expandToString` + return [ + false, + true, // including inconsitent indentation + true + ] + return (true, + false + ) + `); + + expect(errors).toHaveLength(0); + + const tokenNames = tokens.map(token => token.tokenType.name); + expect(tokenNames).not.toContain('INDENT'); + expect(tokenNames).not.toContain('DEDENT'); + }); + + test('should handle nested delimeters', async () => { + const { errors, tokens } = lexer.tokenize(expandToString` + return [ + [ + false, + true + ], + ([true, + true], + false) + [ + true + ] + ] + `); + + expect(errors).toHaveLength(0); + + const tokenNames = tokens.map(token => token.tokenType.name); + expect(tokenNames).not.toContain('INDENT'); + expect(tokenNames).not.toContain('DEDENT'); + }); + +}); + describe('IndentationAware parsing', () => { const sampleGrammar = ` From fe3eb53ecdcdc60df9d5d7552d44ec3ff58e3a20 Mon Sep 17 00:00:00 2001 From: Abdelrahman Aly Abounegm Date: Fri, 9 Aug 2024 23:56:05 +0300 Subject: [PATCH 2/6] Fix typo Co-authored-by: Mark Sujew --- packages/langium/src/parser/indentation-aware.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts index 9f9ffda15..e95c56419 100644 --- a/packages/langium/src/parser/indentation-aware.ts +++ b/packages/langium/src/parser/indentation-aware.ts @@ -45,8 +45,8 @@ export interface IndentationTokenBuilderOptions Date: Fri, 9 Aug 2024 23:57:45 +0300 Subject: [PATCH 3/6] Extract tuple type into a type alias --- packages/langium/src/parser/indentation-aware.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts index e95c56419..f8f516ad3 100644 --- a/packages/langium/src/parser/indentation-aware.ts +++ b/packages/langium/src/parser/indentation-aware.ts @@ -13,6 +13,8 @@ import { createToken, createTokenInstance, Lexer } from 'chevrotain'; import { DefaultTokenBuilder } from './token-builder.js'; import { DefaultLexer, isTokenTypeArray } from './lexer.js'; +type IndentationAwareDelimiter = [begin: TokenName, end: TokenName]; + export interface IndentationTokenBuilderOptions { /** * The name of the token used to denote indentation in the grammar. @@ -53,7 +55,7 @@ export interface IndentationTokenBuilderOptions + ignoreIndentationDelimeters: Array> } export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = { From 219a32e06c0b355121572e7b588c3116abd3601d Mon Sep 17 00:00:00 2001 From: Abdelrahman Abounegm Date: Sat, 10 Aug 2024 00:00:24 +0300 Subject: [PATCH 4/6] Support pushing insensitive mode by keyword tokens --- .../langium/src/parser/indentation-aware.ts | 20 ++++++++++++++++--- .../test/parser/indentation-aware.test.ts | 12 ++++------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts index f8f516ad3..a791225f3 100644 --- a/packages/langium/src/parser/indentation-aware.ts +++ b/packages/langium/src/parser/indentation-aware.ts @@ -5,7 +5,7 @@ ******************************************************************************/ import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition } from 'chevrotain'; -import type { Grammar, TerminalRule } from '../languages/generated/ast.js'; +import type { Grammar, Keyword, TerminalRule } from '../languages/generated/ast.js'; import type { TokenBuilderOptions } from './token-builder.js'; import type { LexerResult } from './lexer.js'; import type { LangiumCoreServices } from '../services.js'; @@ -50,8 +50,7 @@ export interface IndentationTokenBuilderOptions ext return tokenType; } + protected override buildKeywordToken(keyword: Keyword, terminalTokens: TokenType[], caseInsensitive: boolean): TokenType { + const tokenType = super.buildKeywordToken(keyword, terminalTokens, caseInsensitive); + const { ignoreIndentationDelimeters } = this.options; + + for (const [begin, end] of ignoreIndentationDelimeters) { + if (tokenType.name === begin) { + tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION; + } else if (tokenType.name === end) { + tokenType.POP_MODE = true; + } + } + + return tokenType; + } + /** * Resets the indentation stack between different runs of the lexer * diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts index 0839c5a71..34f685f4c 100644 --- a/packages/langium/test/parser/indentation-aware.test.ts +++ b/packages/langium/test/parser/indentation-aware.test.ts @@ -216,16 +216,12 @@ describe('IndentationAwareTokenBuilder#ignoreIndentationDelimeters', async () => Expression: List | Tuple | BOOLEAN; - Tuple: L_PAREN (elements+=Expression (',' elements+=Expression)*)? R_PAREN; - List: L_BRACKET (elements+=Expression (',' elements+=Expression)*)? R_BRACKET; + Tuple: '(' (elements+=Expression (',' elements+=Expression)*)? ')'; + List: '[' (elements+=Expression (',' elements+=Expression)*)? ']'; terminal BOOLEAN returns boolean: /true|false/; terminal INDENT: 'synthetic:indent'; terminal DEDENT: 'synthetic:dedent'; - terminal L_PAREN: '('; - terminal R_PAREN: ')'; - terminal L_BRACKET: '['; - terminal R_BRACKET: ']'; hidden terminal NL: /[\\r\\n]+/; hidden terminal WS: /[\\t ]+/; hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/; @@ -233,8 +229,8 @@ describe('IndentationAwareTokenBuilder#ignoreIndentationDelimeters', async () => const lexer = await getLexer(grammar, { ignoreIndentationDelimeters: [ - ['L_PAREN', 'R_PAREN'], - ['L_BRACKET', 'R_BRACKET'], + ['(', ')'], + ['[', ']'], ], }); From dcc0224b8683eff0c55a4d7f27933888a141f0b8 Mon Sep 17 00:00:00 2001 From: Mark Sujew Date: Wed, 21 Aug 2024 14:40:42 +0000 Subject: [PATCH 5/6] Deduplicate code --- .../langium/src/parser/indentation-aware.ts | 58 +++++++------------ 1 file changed, 22 insertions(+), 36 deletions(-) diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts index a791225f3..9921eb454 100644 --- a/packages/langium/src/parser/indentation-aware.ts +++ b/packages/langium/src/parser/indentation-aware.ts @@ -5,7 +5,7 @@ ******************************************************************************/ import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition } from 'chevrotain'; -import type { Grammar, Keyword, TerminalRule } from '../languages/generated/ast.js'; +import type { Grammar, TerminalRule } from '../languages/generated/ast.js'; import type { TokenBuilderOptions } from './token-builder.js'; import type { LexerResult } from './lexer.js'; import type { LangiumCoreServices } from '../services.js'; @@ -125,7 +125,7 @@ export class IndentationAwareTokenBuilder ext throw new Error('Invalid tokens built by default builder'); } - const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options; + const { indentTokenName, dedentTokenName, whitespaceTokenName, ignoreIndentationDelimeters } = this.options; // Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well // Order should be: dedent, indent, spaces @@ -134,6 +134,13 @@ export class IndentationAwareTokenBuilder ext let ws: TokenType | undefined; const otherTokens: TokenType[] = []; for (const tokenType of tokenTypes) { + for (const [begin, end] of ignoreIndentationDelimeters) { + if (tokenType.name === begin) { + tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION; + } else if (tokenType.name === end) { + tokenType.POP_MODE = true; + } + } if (tokenType.name === dedentTokenName) { dedent = tokenType; } else if (tokenType.name === indentTokenName) { @@ -148,15 +155,18 @@ export class IndentationAwareTokenBuilder ext throw new Error('Some indentation/whitespace tokens not found!'); } - const multiModeLexerDef: IMultiModeLexerDefinition = { - modes: { - [LexingMode.REGULAR]: [dedent, indent, ...otherTokens, ws], - [LexingMode.IGNORE_INDENTATION]: [...otherTokens, ws], - }, - defaultMode: LexingMode.REGULAR, - }; - - return multiModeLexerDef; + if (ignoreIndentationDelimeters.length > 0) { + const multiModeLexerDef: IMultiModeLexerDefinition = { + modes: { + [LexingMode.REGULAR]: [dedent, indent, ...otherTokens, ws], + [LexingMode.IGNORE_INDENTATION]: [...otherTokens, ws], + }, + defaultMode: LexingMode.REGULAR, + }; + return multiModeLexerDef; + } else { + return [dedent, indent, ws, ...otherTokens]; + } } /** @@ -296,7 +306,7 @@ export class IndentationAwareTokenBuilder ext protected override buildTerminalToken(terminal: TerminalRule): TokenType { const tokenType = super.buildTerminalToken(terminal); - const { indentTokenName, dedentTokenName, whitespaceTokenName, ignoreIndentationDelimeters } = this.options; + const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options; if (tokenType.name === indentTokenName) { return this.indentTokenType; @@ -309,30 +319,6 @@ export class IndentationAwareTokenBuilder ext group: Lexer.SKIPPED, }); } - - for (const [begin, end] of ignoreIndentationDelimeters) { - if (tokenType.name === begin) { - tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION; - } else if (tokenType.name === end) { - tokenType.POP_MODE = true; - } - } - - return tokenType; - } - - protected override buildKeywordToken(keyword: Keyword, terminalTokens: TokenType[], caseInsensitive: boolean): TokenType { - const tokenType = super.buildKeywordToken(keyword, terminalTokens, caseInsensitive); - const { ignoreIndentationDelimeters } = this.options; - - for (const [begin, end] of ignoreIndentationDelimeters) { - if (tokenType.name === begin) { - tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION; - } else if (tokenType.name === end) { - tokenType.POP_MODE = true; - } - } - return tokenType; } From 536b63c5cdcb9a07b09e0ee31d09bb609a779456 Mon Sep 17 00:00:00 2001 From: Mark Sujew Date: Wed, 21 Aug 2024 14:46:35 +0000 Subject: [PATCH 6/6] Fix test --- packages/langium/test/parser/indentation-aware.test.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts index 34f685f4c..b79f25708 100644 --- a/packages/langium/test/parser/indentation-aware.test.ts +++ b/packages/langium/test/parser/indentation-aware.test.ts @@ -20,8 +20,12 @@ const tokenBuilder = new IndentationAwareTokenBuilder(); async function getTokens(grammarString: string): Promise { const grammar = (await helper(grammarString)).parseResult.value; - const { modes, defaultMode } = tokenBuilder.buildTokens(grammar); - return modes[defaultMode] as TokenType[]; + const tokens = tokenBuilder.buildTokens(grammar); + if (Array.isArray(tokens)) { + return tokens; + } else { + return tokens.modes[tokens.defaultMode]; + } } async function getLexer(grammar: string, options?: Partial): Promise {