eclipse-langium · msujew · Aug 21, 2024 · Jul 27, 2024 · Aug 9, 2024 · Aug 9, 2024
diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts
@@ -4,7 +4,7 @@
  * terms of the MIT License, which is available in the project root.
  ******************************************************************************/
 
-import type { CustomPatternMatcherFunc, TokenType, IToken } from 'chevrotain';
+import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition } from 'chevrotain';
 import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
 import type { TokenBuilderOptions } from './token-builder.js';
 import type { LexerResult } from './lexer.js';
@@ -13,6 +13,8 @@ import { createToken, createTokenInstance, Lexer } from 'chevrotain';
 import { DefaultTokenBuilder } from './token-builder.js';
 import { DefaultLexer, isTokenTypeArray } from './lexer.js';
 
+type IndentationAwareDelimiter<TokenName extends string> = [begin: TokenName, end: TokenName]; // names of the tokens that open and close a region
+
 export interface IndentationTokenBuilderOptions<TokenName extends string = string> {
     /**
      * The name of the token used to denote indentation in the grammar.
@@ -44,14 +46,29 @@ export interface IndentationTokenBuilderOptions<TokenName extends string = strin
      * @default 'WS'
      */
     whitespaceTokenName: TokenName;
+    /**
+     * Pairs of `[begin, end]` token names between which indentation is ignored and treated as normal whitespace.
+     * For example, Python doesn't treat any whitespace between `(` and `)` as significant.
+     *
+     * Each name can refer to either a terminal token or a keyword token.
+     *
+     * @default []
+     */
+    ignoreIndentationDelimeters: Array<IndentationAwareDelimiter<TokenName>>
 }
 
 export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = {
     indentTokenName: 'INDENT',
     dedentTokenName: 'DEDENT',
     whitespaceTokenName: 'WS',
+    ignoreIndentationDelimeters: [], // no delimiter pairs by default, so no token switches the lexing mode
 };
 
+export enum LexingMode { // names of the lexer modes used when delimiter pairs are configured
+    REGULAR = 'indentation-sensitive', // default mode; its token list includes the indent and dedent tokens
+    IGNORE_INDENTATION = 'ignore-indentation', // pushed by a begin delimiter; its token list omits indent and dedent
+}
+
 /**
  * A token builder that is sensitive to indentation in the input text.
  * It will generate tokens for indentation and dedentation based on the indentation level.
@@ -108,7 +125,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
             throw new Error('Invalid tokens built by default builder');
         }
 
-        const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options;
+        const { indentTokenName, dedentTokenName, whitespaceTokenName, ignoreIndentationDelimeters } = this.options;
 
         // Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well
         // Order should be: dedent, indent, spaces
@@ -117,6 +134,13 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
         let ws: TokenType | undefined;
         const otherTokens: TokenType[] = [];
         for (const tokenType of tokenTypes) {
+            for (const [begin, end] of ignoreIndentationDelimeters) {
+                if (tokenType.name === begin) {
+                    tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION;
+                } else if (tokenType.name === end) {
+                    tokenType.POP_MODE = true;
+                }
+            }
             if (tokenType.name === dedentTokenName) {
                 dedent = tokenType;
             } else if (tokenType.name === indentTokenName) {
@@ -130,7 +154,19 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
         if (!dedent || !indent || !ws) {
             throw new Error('Some indentation/whitespace tokens not found!');
         }
-        return [dedent, indent, ws, ...otherTokens];
+
+        if (ignoreIndentationDelimeters.length > 0) {
+            const multiModeLexerDef: IMultiModeLexerDefinition = {
+                modes: {
+                    [LexingMode.REGULAR]: [dedent, indent, ...otherTokens, ws],
+                    [LexingMode.IGNORE_INDENTATION]: [...otherTokens, ws],
+                },
+                defaultMode: LexingMode.REGULAR,
+            };
+            return multiModeLexerDef;
+        } else {
+            return [dedent, indent, ws, ...otherTokens];
+        }
     }
 
     /**
@@ -283,7 +319,6 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
                 group: Lexer.SKIPPED,
             });
         }
-
         return tokenType;
     }
 

diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 import type { TokenType } from '@chevrotain/types';
-import type { AstNode, Grammar, LangiumParser, Lexer, Module } from 'langium';
+import type { AstNode, Grammar, IndentationTokenBuilderOptions, LangiumParser, Lexer, Module } from 'langium';
 import { beforeEach, describe, expect, test } from 'vitest';
 import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium';
 import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar';
@@ -20,25 +20,26 @@
 
 async function getTokens(grammarString: string): Promise<TokenType[]> {
     const grammar = (await helper(grammarString)).parseResult.value;
-    return tokenBuilder.buildTokens(grammar) as TokenType[];
+    const vocabulary = tokenBuilder.buildTokens(grammar); // a flat token array, or a multi-mode definition when delimiter pairs are configured
+    return (Array.isArray(vocabulary) ? vocabulary : vocabulary.modes[vocabulary.defaultMode]) as TokenType[]; // multi-mode: use the default mode's tokens
 }
 
-async function getLexer(grammar: string): Promise<Lexer> {
-    const services = await createIndentationAwareServices(grammar);
+async function getLexer(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<Lexer> {
+    const services = await createIndentationAwareServices(grammar, options); // options are handed to the IndentationAwareTokenBuilder constructor
     return services.parser.Lexer;
 }
 
-async function getParser(grammar: string): Promise<LangiumParser> {
-    const services = await createIndentationAwareServices(grammar);
+async function getParser(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<LangiumParser> {
+    const services = await createIndentationAwareServices(grammar, options); // options are handed to the IndentationAwareTokenBuilder constructor
     return services.parser.LangiumParser;
 }
 
-async function createIndentationAwareServices(grammar: string): Promise<LangiumServices> {
+async function createIndentationAwareServices(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<LangiumServices> {
     const services = await createServicesForGrammar({
         grammar,
         module: {
             parser: {
-                TokenBuilder: () => new IndentationAwareTokenBuilder(),
+                TokenBuilder: () => new IndentationAwareTokenBuilder(options),
                 Lexer: services => new IndentationAwareLexer(services)
             }
         } satisfies Module<LangiumServices, PartialLangiumServices>
@@ -68,10 +69,9 @@
 
         expect(tokenTypes).toHaveLength(5);
 
-        const [dedent, indent, ws] = tokenTypes;
+        const [dedent, indent] = tokenTypes;
         expect(dedent.name).toBe('DEDENT');
         expect(indent.name).toBe('INDENT');
-        expect(ws.name).toBe('WS');
     });
 
     test('Modifies indent/dedent patterns to be functions', async () => {
@@ -200,6 +200,94 @@
 
 });
 
+describe('IndentationAwareTokenBuilder#ignoreIndentationDelimeters', async () => {
+
+    const grammar = `
+        grammar PythonIfWithLists
+
+        entry Statement: (If | Return)*;
+
+        If:
+            'if' condition=BOOLEAN ':'
+            INDENT thenBlock+=Statement+ DEDENT
+            ('else' ':' INDENT elseBlock+=Statement+ DEDENT)?;
+
+        Return: 'return' value=Expression;
+
+        Expression: List | Tuple | BOOLEAN;
+
+        Tuple: '('  (elements+=Expression (',' elements+=Expression)*)? ')';
+        List: '[' (elements+=Expression (',' elements+=Expression)*)? ']';
+
+        terminal BOOLEAN returns boolean: /true|false/;
+        terminal INDENT: 'synthetic:indent';
+        terminal DEDENT: 'synthetic:dedent';
+        hidden terminal NL: /[\\r\\n]+/;
+        hidden terminal WS: /[\\t ]+/;
+        hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/;
+    `;
+
+    const lexer = await getLexer(grammar, {
+        ignoreIndentationDelimeters: [
+            ['(', ')'],
+            ['[', ']'],
+        ],
+    });
+
+    test('should behave as usual without the given tokens in the input', async () => {
+        const { errors } = lexer.tokenize(expandToString`
+        if true:
+            return false
+        else:
+            return true
+        `);
+        expect(errors).toHaveLength(0);
+    });
+
+    test('should ignore indentation inside the given delimeters', async () => {
+        const { errors, tokens } = lexer.tokenize(expandToString`
+            return [
+                false,
+            true, // including inconsitent indentation
+                    true
+            ]
+            return (true,
+                    false
+                   )
+        `);
+
+        expect(errors).toHaveLength(0);
+
+        const tokenNames = tokens.map(token => token.tokenType.name);
+        expect(tokenNames).not.toContain('INDENT');
+        expect(tokenNames).not.toContain('DEDENT');
+    });
+
+    test('should handle nested delimeters', async () => {
+        const { errors, tokens } = lexer.tokenize(expandToString`
+            return [
+                [
+                    false,
+                    true
+                ],
+                    ([true,
+                    true],
+                    false)
+                [
+                    true
+                ]
+            ]
+        `);
+
+        expect(errors).toHaveLength(0);
+
+        const tokenNames = tokens.map(token => token.tokenType.name);
+        expect(tokenNames).not.toContain('INDENT');
+        expect(tokenNames).not.toContain('DEDENT');
+    });
+
+});
+
 describe('IndentationAware parsing', () => {
 
     const sampleGrammar = `