From 27587eca8ca778cdb650e2abb13ec162b9a0ae4a Mon Sep 17 00:00:00 2001
From: Abdelrahman Abounegm <abounegm@yandex-team.ru>
Date: Sat, 27 Jul 2024 21:17:48 +0300
Subject: [PATCH 1/6] Support ignoring indentation within delimiters

---
 .../langium/src/parser/indentation-aware.ts   |  39 +++++-
 .../test/parser/indentation-aware.test.ts     | 112 ++++++++++++++++--
 2 files changed, 138 insertions(+), 13 deletions(-)
diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts
index f996d3674..9f9ffda15 100644
--- a/packages/langium/src/parser/indentation-aware.ts
+++ b/packages/langium/src/parser/indentation-aware.ts
@@ -4,7 +4,7 @@
  * terms of the MIT License, which is available in the project root.
  ******************************************************************************/
 
-import type { CustomPatternMatcherFunc, TokenType, IToken } from 'chevrotain';
+import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition } from 'chevrotain';
 import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
 import type { TokenBuilderOptions } from './token-builder.js';
 import type { LexerResult } from './lexer.js';
@@ -44,14 +44,30 @@ export interface IndentationTokenBuilderOptions<TokenName extends string = strin
      * @default 'WS'
      */
     whitespaceTokenName: TokenName;
+    /**
+     * The delimeter tokens inside of which indentation should be ignored and treated as normal whitespace.
+     * For example, Python doesn't treat any whitespace between `'('` and `')'` as significant.
+     *
+     * Note that this works only with terminal tokens, not keyword tokens,
+     * so for `'('` you will have to define `terminal L_PAREN: /\(/;` and pass `'L_PAREN'` here.
+     *
+     * @default []
+     */
+    ignoreIndentationDelimeters: Array<[begin: TokenName, end: TokenName]>
 }
 
 export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = {
     indentTokenName: 'INDENT',
     dedentTokenName: 'DEDENT',
     whitespaceTokenName: 'WS',
+    ignoreIndentationDelimeters: [],
 };
 
+export enum LexingMode {
+    REGULAR = 'indentation-sensitive',
+    IGNORE_INDENTATION = 'ignore-indentation',
+}
+
 /**
  * A token builder that is sensitive to indentation in the input text.
  * It will generate tokens for indentation and dedentation based on the indentation level.
@@ -130,7 +146,16 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
         if (!dedent || !indent || !ws) {
             throw new Error('Some indentation/whitespace tokens not found!');
         }
-        return [dedent, indent, ws, ...otherTokens];
+
+        const multiModeLexerDef: IMultiModeLexerDefinition = {
+            modes: {
+                [LexingMode.REGULAR]: [dedent, indent, ...otherTokens, ws],
+                [LexingMode.IGNORE_INDENTATION]: [...otherTokens, ws],
+            },
+            defaultMode: LexingMode.REGULAR,
+        };
+
+        return multiModeLexerDef;
     }
 
     /**
@@ -270,7 +295,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
 
     protected override buildTerminalToken(terminal: TerminalRule): TokenType {
         const tokenType = super.buildTerminalToken(terminal);
-        const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options;
+        const { indentTokenName, dedentTokenName, whitespaceTokenName, ignoreIndentationDelimeters } = this.options;
 
         if (tokenType.name === indentTokenName) {
             return this.indentTokenType;
@@ -284,6 +309,14 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
             });
         }
 
+        for (const [begin, end] of ignoreIndentationDelimeters) {
+            if (tokenType.name === begin) {
+                tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION;
+            } else if (tokenType.name === end) {
+                tokenType.POP_MODE = true;
+            }
+        }
+
         return tokenType;
     }
 
diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts
index 9afc89573..0839c5a71 100644
--- a/packages/langium/test/parser/indentation-aware.test.ts
+++ b/packages/langium/test/parser/indentation-aware.test.ts
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 import type { TokenType } from '@chevrotain/types';
-import type { AstNode, Grammar, LangiumParser, Lexer, Module } from 'langium';
+import type { AstNode, Grammar, IndentationTokenBuilderOptions, LangiumParser, Lexer, Module } from 'langium';
 import { beforeEach, describe, expect, test } from 'vitest';
 import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium';
 import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar';
@@ -20,25 +20,26 @@ const tokenBuilder = new IndentationAwareTokenBuilder();
 
 async function getTokens(grammarString: string): Promise<TokenType[]> {
     const grammar = (await helper(grammarString)).parseResult.value;
-    return tokenBuilder.buildTokens(grammar) as TokenType[];
+    const { modes, defaultMode } = tokenBuilder.buildTokens(grammar);
+    return modes[defaultMode] as TokenType[];
 }
 
-async function getLexer(grammar: string): Promise<Lexer> {
-    const services = await createIndentationAwareServices(grammar);
+async function getLexer(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<Lexer> {
+    const services = await createIndentationAwareServices(grammar, options);
     return services.parser.Lexer;
 }
 
-async function getParser(grammar: string): Promise<LangiumParser> {
-    const services = await createIndentationAwareServices(grammar);
+async function getParser(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<LangiumParser> {
+    const services = await createIndentationAwareServices(grammar, options);
     return services.parser.LangiumParser;
 }
 
-async function createIndentationAwareServices(grammar: string): Promise<LangiumServices> {
+async function createIndentationAwareServices(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<LangiumServices> {
     const services = await createServicesForGrammar({
         grammar,
         module: {
             parser: {
-                TokenBuilder: () => new IndentationAwareTokenBuilder(),
+                TokenBuilder: () => new IndentationAwareTokenBuilder(options),
                 Lexer: services => new IndentationAwareLexer(services)
             }
         } satisfies Module<LangiumServices, PartialLangiumServices>
@@ -68,10 +69,9 @@ describe('IndentationAwareTokenBuilder', () => {
 
         expect(tokenTypes).toHaveLength(5);
 
-        const [dedent, indent, ws] = tokenTypes;
+        const [dedent, indent] = tokenTypes;
         expect(dedent.name).toBe('DEDENT');
         expect(indent.name).toBe('INDENT');
-        expect(ws.name).toBe('WS');
     });
 
     test('Modifies indent/dedent patterns to be functions', async () => {
@@ -200,6 +200,98 @@ describe('IndentationAwareLexer', () => {
 
 });
 
+describe('IndentationAwareTokenBuilder#ignoreIndentationDelimeters', async () => {
+
+    const grammar = `
+        grammar PythonIfWithLists
+
+        entry Statement: (If | Return)*;
+
+        If:
+            'if' condition=BOOLEAN ':'
+            INDENT thenBlock+=Statement+ DEDENT
+            ('else' ':' INDENT elseBlock+=Statement+ DEDENT)?;
+
+        Return: 'return' value=Expression;
+
+        Expression: List | Tuple | BOOLEAN;
+
+        Tuple: L_PAREN  (elements+=Expression (',' elements+=Expression)*)? R_PAREN;
+        List: L_BRACKET (elements+=Expression (',' elements+=Expression)*)? R_BRACKET;
+
+        terminal BOOLEAN returns boolean: /true|false/;
+        terminal INDENT: 'synthetic:indent';
+        terminal DEDENT: 'synthetic:dedent';
+        terminal L_PAREN: '(';
+        terminal R_PAREN: ')';
+        terminal L_BRACKET: '[';
+        terminal R_BRACKET: ']';
+        hidden terminal NL: /[\\r\\n]+/;
+        hidden terminal WS: /[\\t ]+/;
+        hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/;
+    `;
+
+    const lexer = await getLexer(grammar, {
+        ignoreIndentationDelimeters: [
+            ['L_PAREN', 'R_PAREN'],
+            ['L_BRACKET', 'R_BRACKET'],
+        ],
+    });
+
+    test('should behave as usual without the given tokens in the input', async () => {
+        const { errors } = lexer.tokenize(expandToString`
+        if true:
+            return false
+        else:
+            return true
+        `);
+        expect(errors).toHaveLength(0);
+    });
+
+    test('should ignore indentation inside the given delimeters', async () => {
+        const { errors, tokens } = lexer.tokenize(expandToString`
+            return [
+                false,
+            true, // including inconsistent indentation
+                    true
+            ]
+            return (true,
+                    false
+                   )
+        `);
+
+        expect(errors).toHaveLength(0);
+
+        const tokenNames = tokens.map(token => token.tokenType.name);
+        expect(tokenNames).not.toContain('INDENT');
+        expect(tokenNames).not.toContain('DEDENT');
+    });
+
+    test('should handle nested delimeters', async () => {
+        const { errors, tokens } = lexer.tokenize(expandToString`
+            return [
+                [
+                    false,
+                    true
+                ],
+                    ([true,
+                    true],
+                    false)
+                [
+                    true
+                ]
+            ]
+        `);
+
+        expect(errors).toHaveLength(0);
+
+        const tokenNames = tokens.map(token => token.tokenType.name);
+        expect(tokenNames).not.toContain('INDENT');
+        expect(tokenNames).not.toContain('DEDENT');
+    });
+
+});
+
 describe('IndentationAware parsing', () => {
 
     const sampleGrammar = `

From fe3eb53ecdcdc60df9d5d7552d44ec3ff58e3a20 Mon Sep 17 00:00:00 2001
From: Abdelrahman Aly Abounegm <abounegm.abdelrahman@gmail.com>
Date: Fri, 9 Aug 2024 23:56:05 +0300
Subject: [PATCH 2/6] Fix typo

Co-authored-by: Mark Sujew <mark.sujew@typefox.io>
---
 packages/langium/src/parser/indentation-aware.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts
index 9f9ffda15..e95c56419 100644
--- a/packages/langium/src/parser/indentation-aware.ts
+++ b/packages/langium/src/parser/indentation-aware.ts
@@ -45,8 +45,8 @@ export interface IndentationTokenBuilderOptions<TokenName extends string = strin
      */
     whitespaceTokenName: TokenName;
     /**
-     * The delimeter tokens inside of which indentation should be ignored and treated as normal whitespace.
-     * For example, Python doesn't treat any whitespace between `'('` and `')'` as significant.
+     * The delimiter tokens inside of which indentation should be ignored and treated as normal whitespace.
+     * For example, Python doesn't treat any whitespace between `(` and `)` as significant.
      *
      * Note that this works only with terminal tokens, not keyword tokens,
      * so for `'('` you will have to define `terminal L_PAREN: /\(/;` and pass `'L_PAREN'` here.

From a6359730460566477b23e96bfc9388459fdb9146 Mon Sep 17 00:00:00 2001
From: Abdelrahman Abounegm <abounegm@yandex-team.ru>
Date: Fri, 9 Aug 2024 23:57:45 +0300
Subject: [PATCH 3/6] Extract tuple type into a type alias

---
 packages/langium/src/parser/indentation-aware.ts | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts
index e95c56419..f8f516ad3 100644
--- a/packages/langium/src/parser/indentation-aware.ts
+++ b/packages/langium/src/parser/indentation-aware.ts
@@ -13,6 +13,8 @@ import { createToken, createTokenInstance, Lexer } from 'chevrotain';
 import { DefaultTokenBuilder } from './token-builder.js';
 import { DefaultLexer, isTokenTypeArray } from './lexer.js';
 
+type IndentationAwareDelimiter<TokenName extends string> = [begin: TokenName, end: TokenName];
+
 export interface IndentationTokenBuilderOptions<TokenName extends string = string> {
     /**
      * The name of the token used to denote indentation in the grammar.
@@ -53,7 +55,7 @@ export interface IndentationTokenBuilderOptions<TokenName extends string = strin
      *
      * @default []
      */
-    ignoreIndentationDelimeters: Array<[begin: TokenName, end: TokenName]>
+    ignoreIndentationDelimeters: Array<IndentationAwareDelimiter<TokenName>>
 }
 
 export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = {

From 219a32e06c0b355121572e7b588c3116abd3601d Mon Sep 17 00:00:00 2001
From: Abdelrahman Abounegm <abounegm@yandex-team.ru>
Date: Sat, 10 Aug 2024 00:00:24 +0300
Subject: [PATCH 4/6] Support pushing insensitive mode by keyword tokens

---
 .../langium/src/parser/indentation-aware.ts   | 20 ++++++++++++++++---
 .../test/parser/indentation-aware.test.ts     | 12 ++++-------
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts
index f8f516ad3..a791225f3 100644
--- a/packages/langium/src/parser/indentation-aware.ts
+++ b/packages/langium/src/parser/indentation-aware.ts
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition } from 'chevrotain';
-import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
+import type { Grammar, Keyword, TerminalRule } from '../languages/generated/ast.js';
 import type { TokenBuilderOptions } from './token-builder.js';
 import type { LexerResult } from './lexer.js';
 import type { LangiumCoreServices } from '../services.js';
@@ -50,8 +50,7 @@ export interface IndentationTokenBuilderOptions<TokenName extends string = strin
      * The delimiter tokens inside of which indentation should be ignored and treated as normal whitespace.
      * For example, Python doesn't treat any whitespace between `(` and `)` as significant.
      *
-     * Note that this works only with terminal tokens, not keyword tokens,
-     * so for `'('` you will have to define `terminal L_PAREN: /\(/;` and pass `'L_PAREN'` here.
+     * Can be either terminal tokens or keyword tokens.
      *
      * @default []
      */
@@ -322,6 +321,21 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
         return tokenType;
     }
 
+    protected override buildKeywordToken(keyword: Keyword, terminalTokens: TokenType[], caseInsensitive: boolean): TokenType {
+        const tokenType = super.buildKeywordToken(keyword, terminalTokens, caseInsensitive);
+        const { ignoreIndentationDelimeters } = this.options;
+
+        for (const [begin, end] of ignoreIndentationDelimeters) {
+            if (tokenType.name === begin) {
+                tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION;
+            } else if (tokenType.name === end) {
+                tokenType.POP_MODE = true;
+            }
+        }
+
+        return tokenType;
+    }
+
     /**
      * Resets the indentation stack between different runs of the lexer
      *
diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts
index 0839c5a71..34f685f4c 100644
--- a/packages/langium/test/parser/indentation-aware.test.ts
+++ b/packages/langium/test/parser/indentation-aware.test.ts
@@ -216,16 +216,12 @@ describe('IndentationAwareTokenBuilder#ignoreIndentationDelimeters', async () =>
 
         Expression: List | Tuple | BOOLEAN;
 
-        Tuple: L_PAREN  (elements+=Expression (',' elements+=Expression)*)? R_PAREN;
-        List: L_BRACKET (elements+=Expression (',' elements+=Expression)*)? R_BRACKET;
+        Tuple: '('  (elements+=Expression (',' elements+=Expression)*)? ')';
+        List: '[' (elements+=Expression (',' elements+=Expression)*)? ']';
 
         terminal BOOLEAN returns boolean: /true|false/;
         terminal INDENT: 'synthetic:indent';
         terminal DEDENT: 'synthetic:dedent';
-        terminal L_PAREN: '(';
-        terminal R_PAREN: ')';
-        terminal L_BRACKET: '[';
-        terminal R_BRACKET: ']';
         hidden terminal NL: /[\\r\\n]+/;
         hidden terminal WS: /[\\t ]+/;
         hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/;
@@ -233,8 +229,8 @@ describe('IndentationAwareTokenBuilder#ignoreIndentationDelimeters', async () =>
 
     const lexer = await getLexer(grammar, {
         ignoreIndentationDelimeters: [
-            ['L_PAREN', 'R_PAREN'],
-            ['L_BRACKET', 'R_BRACKET'],
+            ['(', ')'],
+            ['[', ']'],
         ],
     });
 

From dcc0224b8683eff0c55a4d7f27933888a141f0b8 Mon Sep 17 00:00:00 2001
From: Mark Sujew <mark.sujew@typefox.io>
Date: Wed, 21 Aug 2024 14:40:42 +0000
Subject: [PATCH 5/6] Deduplicate code

---
 .../langium/src/parser/indentation-aware.ts   | 58 +++++++------------
 1 file changed, 22 insertions(+), 36 deletions(-)

diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts
index a791225f3..9921eb454 100644
--- a/packages/langium/src/parser/indentation-aware.ts
+++ b/packages/langium/src/parser/indentation-aware.ts
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition } from 'chevrotain';
-import type { Grammar, Keyword, TerminalRule } from '../languages/generated/ast.js';
+import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
 import type { TokenBuilderOptions } from './token-builder.js';
 import type { LexerResult } from './lexer.js';
 import type { LangiumCoreServices } from '../services.js';
@@ -125,7 +125,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
             throw new Error('Invalid tokens built by default builder');
         }
 
-        const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options;
+        const { indentTokenName, dedentTokenName, whitespaceTokenName, ignoreIndentationDelimeters } = this.options;
 
         // Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well
         // Order should be: dedent, indent, spaces
@@ -134,6 +134,13 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
         let ws: TokenType | undefined;
         const otherTokens: TokenType[] = [];
         for (const tokenType of tokenTypes) {
+            for (const [begin, end] of ignoreIndentationDelimeters) {
+                if (tokenType.name === begin) {
+                    tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION;
+                } else if (tokenType.name === end) {
+                    tokenType.POP_MODE = true;
+                }
+            }
             if (tokenType.name === dedentTokenName) {
                 dedent = tokenType;
             } else if (tokenType.name === indentTokenName) {
@@ -148,15 +155,18 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
             throw new Error('Some indentation/whitespace tokens not found!');
         }
 
-        const multiModeLexerDef: IMultiModeLexerDefinition = {
-            modes: {
-                [LexingMode.REGULAR]: [dedent, indent, ...otherTokens, ws],
-                [LexingMode.IGNORE_INDENTATION]: [...otherTokens, ws],
-            },
-            defaultMode: LexingMode.REGULAR,
-        };
-
-        return multiModeLexerDef;
+        if (ignoreIndentationDelimeters.length > 0) {
+            const multiModeLexerDef: IMultiModeLexerDefinition = {
+                modes: {
+                    [LexingMode.REGULAR]: [dedent, indent, ...otherTokens, ws],
+                    [LexingMode.IGNORE_INDENTATION]: [...otherTokens, ws],
+                },
+                defaultMode: LexingMode.REGULAR,
+            };
+            return multiModeLexerDef;
+        } else {
+            return [dedent, indent, ws, ...otherTokens];
+        }
     }
 
     /**
@@ -296,7 +306,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
 
     protected override buildTerminalToken(terminal: TerminalRule): TokenType {
         const tokenType = super.buildTerminalToken(terminal);
-        const { indentTokenName, dedentTokenName, whitespaceTokenName, ignoreIndentationDelimeters } = this.options;
+        const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options;
 
         if (tokenType.name === indentTokenName) {
             return this.indentTokenType;
@@ -309,30 +319,6 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
                 group: Lexer.SKIPPED,
             });
         }
-
-        for (const [begin, end] of ignoreIndentationDelimeters) {
-            if (tokenType.name === begin) {
-                tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION;
-            } else if (tokenType.name === end) {
-                tokenType.POP_MODE = true;
-            }
-        }
-
-        return tokenType;
-    }
-
-    protected override buildKeywordToken(keyword: Keyword, terminalTokens: TokenType[], caseInsensitive: boolean): TokenType {
-        const tokenType = super.buildKeywordToken(keyword, terminalTokens, caseInsensitive);
-        const { ignoreIndentationDelimeters } = this.options;
-
-        for (const [begin, end] of ignoreIndentationDelimeters) {
-            if (tokenType.name === begin) {
-                tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION;
-            } else if (tokenType.name === end) {
-                tokenType.POP_MODE = true;
-            }
-        }
-
         return tokenType;
     }
 

From 536b63c5cdcb9a07b09e0ee31d09bb609a779456 Mon Sep 17 00:00:00 2001
From: Mark Sujew <mark.sujew@typefox.io>
Date: Wed, 21 Aug 2024 14:46:35 +0000
Subject: [PATCH 6/6] Fix test

---
 packages/langium/test/parser/indentation-aware.test.ts | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts
index 34f685f4c..b79f25708 100644
--- a/packages/langium/test/parser/indentation-aware.test.ts
+++ b/packages/langium/test/parser/indentation-aware.test.ts
@@ -20,8 +20,12 @@ const tokenBuilder = new IndentationAwareTokenBuilder();
 
 async function getTokens(grammarString: string): Promise<TokenType[]> {
     const grammar = (await helper(grammarString)).parseResult.value;
-    const { modes, defaultMode } = tokenBuilder.buildTokens(grammar);
-    return modes[defaultMode] as TokenType[];
+    const tokens = tokenBuilder.buildTokens(grammar);
+    if (Array.isArray(tokens)) {
+        return tokens;
+    } else {
+        return tokens.modes[tokens.defaultMode];
+    }
 }
 
 async function getLexer(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<Lexer> {