Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support ignoring indentation within delimiters #1608

Merged
merged 6 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 39 additions & 4 deletions packages/langium/src/parser/indentation-aware.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* terms of the MIT License, which is available in the project root.
******************************************************************************/

import type { CustomPatternMatcherFunc, TokenType, IToken } from 'chevrotain';
import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition } from 'chevrotain';
import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
import type { TokenBuilderOptions } from './token-builder.js';
import type { LexerResult } from './lexer.js';
Expand All @@ -13,6 +13,8 @@ import { createToken, createTokenInstance, Lexer } from 'chevrotain';
import { DefaultTokenBuilder } from './token-builder.js';
import { DefaultLexer, isTokenTypeArray } from './lexer.js';

/**
 * A pair of token names: the token that opens a region (`begin`) and the token that closes it (`end`).
 * Used by the `ignoreIndentationDelimeters` option to describe regions in which indentation is ignored.
 */
type IndentationAwareDelimiter<TokenName extends string> = [begin: TokenName, end: TokenName];

export interface IndentationTokenBuilderOptions<TokenName extends string = string> {
/**
* The name of the token used to denote indentation in the grammar.
Expand Down Expand Up @@ -44,14 +46,29 @@ export interface IndentationTokenBuilderOptions<TokenName extends string = strin
* @default 'WS'
*/
whitespaceTokenName: TokenName;
/**
* The delimiter tokens inside of which indentation should be ignored and treated as normal whitespace.
* For example, Python doesn't treat any whitespace between `(` and `)` as significant.
*
* Can be either terminal tokens or keyword tokens.
*
* @default []
*/
ignoreIndentationDelimeters: Array<IndentationAwareDelimiter<TokenName>>
}

/**
 * Default values for {@link IndentationTokenBuilderOptions}:
 * `INDENT` / `DEDENT` as the indentation token names, `WS` as the whitespace token name,
 * and an empty delimiter list, i.e. no region in which indentation is ignored.
 */
export const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions = {
indentTokenName: 'INDENT',
dedentTokenName: 'DEDENT',
whitespaceTokenName: 'WS',
ignoreIndentationDelimeters: [],
};

/**
 * Names of the lexer modes used when `ignoreIndentationDelimeters` is non-empty
 * and the token builder returns a multi-mode lexer definition.
 */
export enum LexingMode {
// Default mode; its token list contains the dedent and indent tokens.
REGULAR = 'indentation-sensitive',
// Mode pushed by a `begin` delimiter token and popped by an `end` delimiter token;
// its token list omits the dedent and indent tokens.
IGNORE_INDENTATION = 'ignore-indentation',
}

/**
* A token builder that is sensitive to indentation in the input text.
* It will generate tokens for indentation and dedentation based on the indentation level.
Expand Down Expand Up @@ -108,7 +125,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
throw new Error('Invalid tokens built by default builder');
}

const { indentTokenName, dedentTokenName, whitespaceTokenName } = this.options;
const { indentTokenName, dedentTokenName, whitespaceTokenName, ignoreIndentationDelimeters } = this.options;

// Rearrange tokens because whitespace (which is ignored) goes to the beginning by default, consuming indentation as well
// Order should be: dedent, indent, spaces
Expand All @@ -117,6 +134,13 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
let ws: TokenType | undefined;
const otherTokens: TokenType[] = [];
for (const tokenType of tokenTypes) {
for (const [begin, end] of ignoreIndentationDelimeters) {
if (tokenType.name === begin) {
tokenType.PUSH_MODE = LexingMode.IGNORE_INDENTATION;
} else if (tokenType.name === end) {
tokenType.POP_MODE = true;
}
}
if (tokenType.name === dedentTokenName) {
dedent = tokenType;
} else if (tokenType.name === indentTokenName) {
Expand All @@ -130,7 +154,19 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
if (!dedent || !indent || !ws) {
throw new Error('Some indentation/whitespace tokens not found!');
}
return [dedent, indent, ws, ...otherTokens];

if (ignoreIndentationDelimeters.length > 0) {
const multiModeLexerDef: IMultiModeLexerDefinition = {
modes: {
[LexingMode.REGULAR]: [dedent, indent, ...otherTokens, ws],
[LexingMode.IGNORE_INDENTATION]: [...otherTokens, ws],
},
defaultMode: LexingMode.REGULAR,
};
return multiModeLexerDef;
} else {
return [dedent, indent, ws, ...otherTokens];
}
}

/**
Expand Down Expand Up @@ -283,7 +319,6 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string> ext
group: Lexer.SKIPPED,
});
}

return tokenType;
}

Expand Down
108 changes: 98 additions & 10 deletions packages/langium/test/parser/indentation-aware.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
******************************************************************************/

import type { TokenType } from '@chevrotain/types';
import type { AstNode, Grammar, LangiumParser, Lexer, Module } from 'langium';
import type { AstNode, Grammar, IndentationTokenBuilderOptions, LangiumParser, Lexer, Module } from 'langium';
import { beforeEach, describe, expect, test } from 'vitest';
import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium';
import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar';
Expand All @@ -20,25 +20,26 @@

/**
 * Parses the given grammar text and returns the token types the token builder
 * produces for it.
 *
 * `buildTokens` may return either a flat `TokenType[]` or a multi-mode lexer
 * definition. The flat array is returned as is; for a multi-mode definition the
 * token list of its default mode is returned.
 *
 * @param grammarString Source text of the grammar to build tokens for.
 * @returns The token types of the flat vocabulary or of the default lexer mode.
 * @throws Error if a multi-mode definition has no token list for its default mode.
 */
async function getTokens(grammarString: string): Promise<TokenType[]> {
    const grammar = (await helper(grammarString)).parseResult.value;
    const vocabulary = tokenBuilder.buildTokens(grammar);

    // Narrow the union before touching `modes`/`defaultMode`: destructuring them
    // from a plain array yields `undefined` and fails at runtime.
    if (Array.isArray(vocabulary)) {
        return vocabulary;
    }

    const { modes, defaultMode } = vocabulary;
    const defaultModeTokens = modes[defaultMode];
    if (!defaultModeTokens) {
        throw new Error(`No token types found for default lexer mode '${defaultMode}'`);
    }
    return defaultModeTokens;
}

/**
 * Creates indentation-aware services for the given grammar and returns their lexer.
 *
 * @param grammar Source text of the grammar.
 * @param options Optional token builder options, forwarded unchanged to `createIndentationAwareServices`.
 * @returns The `Lexer` of the created services.
 */
async function getLexer(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<Lexer> {
    const services = await createIndentationAwareServices(grammar, options);
    return services.parser.Lexer;
}

/**
 * Creates indentation-aware services for the given grammar and returns their parser.
 *
 * @param grammar Source text of the grammar.
 * @param options Optional token builder options, forwarded unchanged to `createIndentationAwareServices`.
 * @returns The `LangiumParser` of the created services.
 */
async function getParser(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<LangiumParser> {
    const services = await createIndentationAwareServices(grammar, options);
    return services.parser.LangiumParser;
}

async function createIndentationAwareServices(grammar: string): Promise<LangiumServices> {
async function createIndentationAwareServices(grammar: string, options?: Partial<IndentationTokenBuilderOptions>): Promise<LangiumServices> {
const services = await createServicesForGrammar({
grammar,
module: {
parser: {
TokenBuilder: () => new IndentationAwareTokenBuilder(),
TokenBuilder: () => new IndentationAwareTokenBuilder(options),
Lexer: services => new IndentationAwareLexer(services)
}
} satisfies Module<LangiumServices, PartialLangiumServices>
Expand Down Expand Up @@ -68,10 +69,9 @@

expect(tokenTypes).toHaveLength(5);

const [dedent, indent, ws] = tokenTypes;
const [dedent, indent] = tokenTypes;
expect(dedent.name).toBe('DEDENT');
expect(indent.name).toBe('INDENT');
expect(ws.name).toBe('WS');
});

test('Modifies indent/dedent patterns to be functions', async () => {
Expand Down Expand Up @@ -200,6 +200,94 @@

});

// Lexer tests for the `ignoreIndentationDelimeters` option: a lexer is built with
// `(`/`)` and `[`/`]` configured as delimiters, and each test tokenizes a sample input
// and inspects the reported errors and token names.
// NOTE(review): the template literals below carry no leading indentation in this copy;
// presumably the original inputs are indented — confirm against the repository version.
describe('IndentationAwareTokenBuilder#ignoreIndentationDelimeters', async () => {

// Grammar with `if`/`else` blocks wrapped in INDENT/DEDENT and `return` statements
// whose value may be a boolean, a parenthesized tuple, or a bracketed list.
const grammar = `
grammar PythonIfWithLists

entry Statement: (If | Return)*;

If:
'if' condition=BOOLEAN ':'
INDENT thenBlock+=Statement+ DEDENT
('else' ':' INDENT elseBlock+=Statement+ DEDENT)?;

Return: 'return' value=Expression;

Expression: List | Tuple | BOOLEAN;

Tuple: '(' (elements+=Expression (',' elements+=Expression)*)? ')';
List: '[' (elements+=Expression (',' elements+=Expression)*)? ']';

terminal BOOLEAN returns boolean: /true|false/;
terminal INDENT: 'synthetic:indent';
terminal DEDENT: 'synthetic:dedent';
hidden terminal NL: /[\\r\\n]+/;
hidden terminal WS: /[\\t ]+/;
hidden terminal SL_COMMENT: /\\/\\/[^\\n\\r]*/;
`;

// One lexer shared by all tests in this suite, with both delimiter pairs configured.
const lexer = await getLexer(grammar, {
ignoreIndentationDelimeters: [
['(', ')'],
['[', ']'],
],
});

// Input contains none of the configured delimiters; only the absence of lexer errors is checked.
test('should behave as usual without the given tokens in the input', async () => {
const { errors } = lexer.tokenize(expandToString`
if true:
return false
else:
return true
`);
expect(errors).toHaveLength(0);
});

// Multi-line list and tuple literals: expects no lexer errors and no INDENT/DEDENT tokens.
test('should ignore indentation inside the given delimeters', async () => {
const { errors, tokens } = lexer.tokenize(expandToString`
return [
false,
true, // including inconsitent indentation
true
]
return (true,
false
)
`);

expect(errors).toHaveLength(0);

const tokenNames = tokens.map(token => token.tokenType.name);
expect(tokenNames).not.toContain('INDENT');
expect(tokenNames).not.toContain('DEDENT');
});

// Delimiters nested inside one another: expects no lexer errors and no INDENT/DEDENT tokens.
test('should handle nested delimeters', async () => {
const { errors, tokens } = lexer.tokenize(expandToString`
return [
[
false,
true
],
([true,
true],
false)
[
true
]
]
`);

expect(errors).toHaveLength(0);

const tokenNames = tokens.map(token => token.tokenType.name);
expect(tokenNames).not.toContain('INDENT');
expect(tokenNames).not.toContain('DEDENT');
});

});

describe('IndentationAware parsing', () => {

const sampleGrammar = `
Expand Down
Loading