Skip to content

Commit

Permalink
Support ignoring indentation within delimiters
Browse files Browse the repository at this point in the history
  • Loading branch information
aabounegm committed Jul 27, 2024
1 parent fad57e9 commit 27587ec
Show file tree
Hide file tree
Showing 2 changed files with 138 additions and 13 deletions.
39 changes: 36 additions & 3 deletions packages/langium/src/parser/indentation-aware.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* terms of the MIT License, which is available in the project root.
******************************************************************************/

import type { CustomPatternMatcherFunc, TokenType, IToken } from 'chevrotain';
import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition } from 'chevrotain';
import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
import type { TokenBuilderOptions } from './token-builder.js';
import type { LexerResult } from './lexer.js';
Expand Down Expand Up @@ -44,14 +44,30 @@ export interface IndentationTokenBuilderOptions<TokenName extends string = strin
* @default 'WS'
*/
whitespaceTokenName: TokenName;
/**
* The delimiter tokens inside of which indentation should be ignored and treated as normal whitespace.
* For example, Python doesn't treat any whitespace between `'('` and `')'` as significant.
*
* Note that this works only with terminal tokens, not keyword tokens,
* so for `'('` you will have to define `terminal L_PAREN: /\(/;` and pass `'L_PAREN'` here.
*
* @default []
*/
ignoreIndentationDelimeters: Array<[begin: TokenName, end: TokenName]>
}

/**
 * Default configuration for the {@link IndentationAwareTokenBuilder}.
 *
 * The three token names must correspond to terminal rules declared in the
 * grammar; `buildTerminalToken` matches terminals against these names to
 * substitute the synthetic indent/dedent token types.
 */
export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = {
    indentTokenName: 'INDENT',
    dedentTokenName: 'DEDENT',
    whitespaceTokenName: 'WS',
    // By default, no delimiter pairs suppress indentation handling.
    ignoreIndentationDelimeters: [],
};

/**
 * Names of the lexer modes used by the indentation-aware token builder.
 *
 * In `REGULAR` (the default) mode, the token set includes the synthetic
 * dedent/indent tokens; in `IGNORE_INDENTATION` mode those tokens are
 * excluded, so indentation is lexed as plain whitespace. The mode is entered
 * and left via `PUSH_MODE`/`POP_MODE` configured on the begin/end delimiter
 * tokens in `buildTerminalToken`.
 */
export enum LexingMode {
    REGULAR = 'indentation-sensitive',
    IGNORE_INDENTATION = 'ignore-indentation',
}

/**
* A token builder that is sensitive to indentation in the input text.
* It will generate tokens for indentation and dedentation based on the indentation level.
Expand Down Expand Up @@ -130,7 +146,16 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
if (!dedent || !indent || !ws) {
throw new Error('Some indentation/whitespace tokens not found!');
}
return [dedent, indent, ws, ...otherTokens];

const multiModeLexerDef: IMultiModeLexerDefinition = {
modes: {
[LexingMode.REGULAR]: [dedent, indent, ...otherTokens, ws],
[LexingMode.IGNORE_INDENTATION]: [...otherTokens, ws],
},
defaultMode: LexingMode.REGULAR,
};

return multiModeLexerDef;
}

/**
Expand Down Expand Up @@ -270,7 +295,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext

protected override buildTerminalToken(terminal: TerminalRule): TokenType {
const tokenType = super.buildTerminalToken(terminal);
const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options;
const { indentTokenName, dedentTokenName, whitespaceTokenName, ignoreIndentationDelimeters } = this.options;

if (tokenType.name === indentTokenName) {
return this.indentTokenType;
Expand All @@ -284,6 +309,14 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
});
}

for (const [begin, end] of ignoreIndentationDelimeters) {
if (tokenType.name === begin) {
tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION;
} else if (tokenType.name === end) {
tokenType.POP_MODE = true;
}
}

return tokenType;
}

Expand Down
112 changes: 102 additions & 10 deletions packages/langium/test/parser/indentation-aware.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
******************************************************************************/

import type { TokenType } from '@chevrotain/types';
import type { AstNode, Grammar, LangiumParser, Lexer, Module } from 'langium';
import type { AstNode, Grammar, IndentationTokenBuilderOptions, LangiumParser, Lexer, Module } from 'langium';
import { beforeEach, describe, expect, test } from 'vitest';
import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium';
import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar';
Expand All @@ -20,25 +20,26 @@ const tokenBuilder = new IndentationAwareTokenBuilder();

async function getTokens(grammarString: string): Promise<TokenType[]> {
const grammar = (await helper(grammarString)).parseResult.value;
return tokenBuilder.buildTokens(grammar) as TokenType[];
const { modes, defaultMode } = tokenBuilder.buildTokens(grammar);
return modes[defaultMode] as TokenType[];
}

async function getLexer(grammar: string): Promise<Lexer> {
const services = await createIndentationAwareServices(grammar);
async function getLexer(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<Lexer> {
const services = await createIndentationAwareServices(grammar, options);
return services.parser.Lexer;
}

async function getParser(grammar: string): Promise<LangiumParser> {
const services = await createIndentationAwareServices(grammar);
async function getParser(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<LangiumParser> {
const services = await createIndentationAwareServices(grammar, options);
return services.parser.LangiumParser;
}

async function createIndentationAwareServices(grammar: string): Promise<LangiumServices> {
async function createIndentationAwareServices(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<LangiumServices> {
const services = await createServicesForGrammar({
grammar,
module: {
parser: {
TokenBuilder: () => new IndentationAwareTokenBuilder(),
TokenBuilder: () => new IndentationAwareTokenBuilder(options),
Lexer: services => new IndentationAwareLexer(services)
}
} satisfies Module<LangiumServices, PartialLangiumServices>
Expand Down Expand Up @@ -68,10 +69,9 @@ describe('IndentationAwareTokenBuilder', () => {

expect(tokenTypes).toHaveLength(5);

const [dedent, indent, ws] = tokenTypes;
const [dedent, indent] = tokenTypes;
expect(dedent.name).toBe('DEDENT');
expect(indent.name).toBe('INDENT');
expect(ws.name).toBe('WS');
});

test('Modifies indent/dedent patterns to be functions', async () => {
Expand Down Expand Up @@ -200,6 +200,98 @@ describe('IndentationAwareLexer', () => {

});

describe('IndentationAwareTokenBuilder#ignoreIndentationDelimeters', async () => {

    // A Python-like grammar whose list/tuple delimiters are dedicated terminal
    // rules (L_PAREN etc.) rather than keywords — the `ignoreIndentationDelimeters`
    // option only works with terminal tokens, as documented on the option.
    const grammar = `
        grammar PythonIfWithLists

        entry Statement: (If | Return)*;

        If:
            'if' condition=BOOLEAN ':'
            INDENT thenBlock+=Statement+ DEDENT
            ('else' ':' INDENT elseBlock+=Statement+ DEDENT)?;

        Return: 'return' value=Expression;

        Expression: List | Tuple | BOOLEAN;

        Tuple: L_PAREN (elements+=Expression (',' elements+=Expression)*)? R_PAREN;
        List: L_BRACKET (elements+=Expression (',' elements+=Expression)*)? R_BRACKET;

        terminal BOOLEAN returns boolean: /true|false/;
        terminal INDENT: 'synthetic:indent';
        terminal DEDENT: 'synthetic:dedent';
        terminal L_PAREN: '(';
        terminal R_PAREN: ')';
        terminal L_BRACKET: '[';
        terminal R_BRACKET: ']';

        hidden terminal NL: /[\\r\\n]+/;
        hidden terminal WS: /[\\t ]+/;
        hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/;
    `;

    // The lexer is created once for the whole suite.
    // NOTE(review): the suite factory is async so it can await getLexer —
    // confirm the test runner (vitest) supports async describe callbacks.
    const lexer = await getLexer(grammar, {
        // Indentation becomes insignificant between each begin/end token pair.
        ignoreIndentationDelimeters: [
            ['L_PAREN', 'R_PAREN'],
            ['L_BRACKET', 'R_BRACKET'],
        ],
    });

    test('should behave as usual without the given tokens in the input', async () => {
        // No delimiters in the input, so indentation must still be significant
        // and the indented if/else blocks must lex without errors.
        const { errors } = lexer.tokenize(expandToString`
            if true:
                return false
            else:
                return true
        `);
        expect(errors).toHaveLength(0);
    });

    test('should ignore indentation inside the given delimeters', async () => {
        const { errors, tokens } = lexer.tokenize(expandToString`
            return [
                false,
                true, // including inconsitent indentation
                  true
            ]
            return (true,
                false
            )
        `);

        expect(errors).toHaveLength(0);

        // While inside (...) or [...], no synthetic indentation tokens
        // may be emitted at all.
        const tokenNames = tokens.map(token => token.tokenType.name);
        expect(tokenNames).not.toContain('INDENT');
        expect(tokenNames).not.toContain('DEDENT');
    });

    test('should handle nested delimeters', async () => {
        // Mixes nested and interleaved (...)/[...] pairs; the pushed lexer
        // modes must be popped in the right order so that indentation stays
        // ignored throughout the whole expression.
        const { errors, tokens } = lexer.tokenize(expandToString`
            return [
                [
                    false,
                    true
                ],
                ([true,
                    true],
                    false)
                [
                    true
                ]
            ]
        `);

        expect(errors).toHaveLength(0);

        const tokenNames = tokens.map(token => token.tokenType.name);
        expect(tokenNames).not.toContain('INDENT');
        expect(tokenNames).not.toContain('DEDENT');
    });

});

describe('IndentationAware parsing', () => {

const sampleGrammar = `
Expand Down

0 comments on commit 27587ec

Please sign in to comment.