From a2fc6e8e43e0829ef951c93b6dd4f30a9852fd9e Mon Sep 17 00:00:00 2001 From: Nikolay Rozhkov Date: Mon, 25 Sep 2023 01:38:28 +0300 Subject: [PATCH] Railroad: Treat empty string as epsilon, grammar reduction --- cSpell.json | 2 + .../src/diagrams/railroad/railroad.spec.ts | 55 +++++++-- .../src/diagrams/railroad/railroadDB.ts | 68 ++++++---- .../diagrams/railroad/railroadGrammar.jison | 116 +++++++++++------- 4 files changed, 165 insertions(+), 76 deletions(-) diff --git a/cSpell.json b/cSpell.json index 77ff903f78..e0d0728ae5 100644 --- a/cSpell.json +++ b/cSpell.json @@ -18,6 +18,7 @@ "bilkent", "bisheng", "blrs", + "bnf", "braintree", "brkt", "brolin", @@ -42,6 +43,7 @@ "dompurify", "dont", "doublecircle", + "ebnf", "edgechromium", "elems", "elkjs", diff --git a/packages/mermaid/src/diagrams/railroad/railroad.spec.ts b/packages/mermaid/src/diagrams/railroad/railroad.spec.ts index d79407f8bb..994fdba392 100644 --- a/packages/mermaid/src/diagrams/railroad/railroad.spec.ts +++ b/packages/mermaid/src/diagrams/railroad/railroad.spec.ts @@ -3,7 +3,7 @@ import railroad from './railroadGrammar.jison'; // import { prepareTextForParsing } from '../railroadUtils.js'; import { cleanupComments } from '../../diagram-api/comments.js'; import { db, Rule } from './railroadDB.js'; -// @ts-ignore: yaml +// @ts-ignore: yaml does not export types import defaultConfigJson from '../../schemas/config.schema.yaml?only-defaults=true'; describe('Railroad diagram', function () { @@ -19,28 +19,65 @@ describe('Railroad diagram', function () { describe('fails to parse', () => { test.each([ - ['', 'keyword missing'], + ['', 'keyword is missing'], + ['rule', 'assign operator is missing'], ['rule==id', 'assign operator is wrong'], - ['rule=id', '; missing'], + ['rule=id', 'semicolon is missing'], ['rule=(id;', 'parentheses are unbalanced'], ['rule=(id));', 'parentheses are unbalanced'], ["' ::= x;", 'rule is with quote is not wrapped in <>'], ["rule ::= ';", 'quote in rule definition is not wrapped in <>'], - ])('%s when %s', (grammar: string) => { + ])('`%s` where %s', (grammar: string) => { grammar = cleanupComments('' + grammar); expect(() => railroad.parser.parse(grammar)).toThrow(); }); }); describe('parses', () => { - describe('Simple samples', () => { + describe('assignment operators', () => { + // const grammarDefinition = prepareTextForParsing(cleanupComments('railroad-beta\n\n ' + data)); + test.each([ + ['rule ::= id;'], + ['rule := id;'], + ['rule : id;'], + ['rule => id;'], + ['rule = id;'], + ['rule -> id;'], + ])('`%s`', (grammar: string) => { + grammar = cleanupComments('railroad-beta' + grammar); + const grammarWithoutSpaces = grammar.replaceAll(' ', ''); + expect(() => railroad.parser.parse(grammar)).not.toThrow(); + expect(() => railroad.parser.parse(grammarWithoutSpaces)).not.toThrow(); + }); + }); + + describe('rules names', () => { + // const grammarDefinition = prepareTextForParsing(cleanupComments('railroad-beta\n\n ' + data)); + test.each([ + ['rule::=id;'], + ['::=id ;'], + ['::=id;'], + [`::=id;`], + [`::=id;`], + [`>::=id;`], + [` \\\\ \\x>::=id;`], + ])('`%s` produces', (grammar: string) => { + grammar = cleanupComments('railroad-beta' + grammar); + railroad.parser.parse(grammar); + const x = railroad.yy.getRules() as Rule[]; + console.log(x.map((r) => r.toEBNF())); + // expect(() => { railroad.parser.parse(grammar); }).not.toThrow(); + // railroad.parser.parse(grammar); + }); + }); + + describe('simple samples', () => { // const grammarDefinition = prepareTextForParsing(cleanupComments('railroad-beta\n\n ' + data)); test.each([ [''], - ['rule=;'], - ['rule::=;'], ['rule::=id;'], ['rule::=(id);'], + ['rule::=id-id;'], ['rule::=[id];'], ['rule::={id};'], ['rule::=id|id;'], @@ -58,7 +95,7 @@ describe('Railroad diagram', function () { ['<"> ::= <"">;'], [" ::= 'while' '(' ')' ;"], [" ::= 'while' '(' ')' ;"], - ])('%s', (grammar: string) => { + ])('`%s` produces', (grammar: string) => { grammar = cleanupComments('railroad-beta' + grammar); railroad.parser.parse(grammar); const x = railroad.yy.getRules() as Rule[]; @@ -88,7 +125,9 @@ describe('Railroad diagram', function () { railroad.parser.parse(grammar); }); }); + }); + describe('recognizes', function () { it('Arithmetic Expressions', () => { const grammar = ` railroad-beta diff --git a/packages/mermaid/src/diagrams/railroad/railroadDB.ts b/packages/mermaid/src/diagrams/railroad/railroadDB.ts index e27f155bf1..1c721ed2de 100644 --- a/packages/mermaid/src/diagrams/railroad/railroadDB.ts +++ b/packages/mermaid/src/diagrams/railroad/railroadDB.ts @@ -1,5 +1,4 @@ // import type { RailroadDB } from './railroadTypes.js'; -import { config } from 'process'; import * as configApi from '../../config.js'; import type { DiagramDB } from '../../diagram-api/types.js'; @@ -46,13 +45,13 @@ const getConsole = () => console; type Callback = (item: Chunk, index: number, parent: Chunk | undefined, result: T[]) => T; // type Traverse = (callback: Callback, index: number, parent?: Chunk) => T; -// interface Traversible { +// interface Traversable { // traverse(callback: Callback, index?: number, parent?: Chunk): T; // } // TODO: rewrite toEBNF using traverse // -// interface Chunk extends Traversible { +// interface Chunk extends Traversable { // toEBNF(): string; // } @@ -63,7 +62,7 @@ abstract class Chunk { abstract toEBNF(): string; } -class Leaf implements Chunk { +abstract class Leaf implements Chunk { constructor(public label: string) {} traverse(callback: Callback, index?: number, parent?: Chunk): T { @@ -71,9 +70,7 @@ class Leaf implements Chunk { return callback(this, index, parent, []); } - toEBNF(): string { - return this.label; - } + abstract toEBNF(): string; } abstract class Node implements Chunk { @@ -120,22 +117,43 @@ class Epsilon extends Leaf { constructor() { super('ɛ'); } + + toEBNF(): string { + return this.label; + } } -// remote quote??? class Term extends Leaf { - constructor(public label: string, public quote: string) { - super(label); + toEBNF(): string { + const escaped = this.label.replaceAll(/\\([\\'"])/g, "\\$1"); + + return '"' + escaped + '"'; } +} +class NonTerm extends Leaf { toEBNF(): string { - return this.quote + super.toEBNF() + this.quote; + const escaped = this.label.replaceAll(/\\([\\'"<>])/g, "\\$1"); + + return '<' + escaped + '>'; } } -class NonTerm extends Leaf { +class Exception implements Chunk { + constructor(public base: Chunk, public except: Chunk) {} + + traverse(callback: Callback, index?: number, parent?: Chunk): T { + index ??= 0; + const nested = [ + this.base.traverse(callback, 0, this), + this.except.traverse(callback, 1, this), + ] + + return callback(this, index, parent, nested); + } + toEBNF(): string { - return '<' + super.toEBNF() + '>'; + return `(${this.base.toEBNF()}) - ${this.except.toEBNF()}` } } @@ -172,13 +190,14 @@ class ZeroOrMany extends Node { } } -const addTerm = (label: string, quote: string): Chunk => { - return new Term(label, quote); +const addTerm = (label: string): Chunk => { + label.replaceAll(/\\(.)/g, "$1"); + + return new Term(label); }; const addNonTerm = (label: string): Chunk => { return new NonTerm(label); }; - const addZeroOrOne = (chunk: Chunk): Chunk => { return new ZeroOrOne(chunk); }; @@ -188,6 +207,9 @@ const addOneOrMany = (chunk: Chunk): Chunk => { const addZeroOrMany = (chunk: Chunk): Chunk => { return new ZeroOrMany(chunk); }; +const addException = (base: Chunk, except: Chunk): Chunk => { + return new Exception(base, except); +} const addRuleOrChoice = (ID: string, chunk: Chunk): void => { if (rules[ID]) { const value = rules[ID]; @@ -205,13 +227,12 @@ const addSequence = (chunks: Chunk[]): Chunk => { if (railroadConfig?.compress) { chunks = chunks - .map((chunk) => { + .flatMap((chunk) => { if (chunk instanceof Sequence) { return chunk.children; } return chunk; - }) - .flat(); + }); } if (chunks.length === 1) { @@ -228,13 +249,12 @@ const addChoice = (chunks: Chunk[]): Chunk => { if (configApi.getConfig().railroad?.compress) { chunks = chunks - .map((chunk) => { + .flatMap((chunk) => { if (chunk instanceof Choice) { return chunk.children; } return chunk; - }) - .flat(); + }); } if (chunks.length === 1) { @@ -259,9 +279,10 @@ export interface RailroadDB extends DiagramDB { addOneOrMany: (chunk: Chunk) => Chunk; addRuleOrChoice: (ID: string, chunk: Chunk) => void; addSequence: (chunks: Chunk[]) => Chunk; - addTerm: (label: string, quote: string) => Chunk; + addTerm: (label: string) => Chunk; addZeroOrMany: (chunk: Chunk) => Chunk; addZeroOrOne: (chunk: Chunk) => Chunk; + addException: (base: Chunk, except: Chunk) => Chunk; clear: () => void; getConsole: () => Console; getRules: () => Rule[]; @@ -277,6 +298,7 @@ export const db: RailroadDB = { addTerm, addZeroOrMany, addZeroOrOne, + addException, clear, getConfig: () => configApi.getConfig().railroad, getConsole, diff --git a/packages/mermaid/src/diagrams/railroad/railroadGrammar.jison b/packages/mermaid/src/diagrams/railroad/railroadGrammar.jison index e76b6a129b..268e223827 100644 --- a/packages/mermaid/src/diagrams/railroad/railroadGrammar.jison +++ b/packages/mermaid/src/diagrams/railroad/railroadGrammar.jison @@ -15,6 +15,8 @@ // Lexical analysis //------------------ +// this is told to be longest rules match, but I am not sure this works +// that is why the order in the assignment regexp matters %options flex %lex @@ -27,6 +29,7 @@ C_COLON \u003A // : C_SEMICOLON \u003B // ; C_VERTICAL_LINE \u007C // | C_SLASH \u002f // / +C_BACKSLASH \u005C // \ C_APOSTROPHE \u0027 // ' C_QUOTATION_MARK \u0022 // " C_LEFT_PARENTHESIS \u0028 // ( @@ -37,17 +40,17 @@ C_LEFT_CURLY_BRACKET \u007B // { C_RIGHT_CURLY_BRACKET \u007D // } C_LESS_THAN \u003C // < C_GREATER_THAN \u003E // > -C_QUANTIFIER [\u003F\u002B\u002A] // ?+* regexp-like -C_TEXTDATA [\u0020-\u0021\u0023-\u0026\u0028-\u003B\u003D\u003F-\u007E] // everything except ' " < > +C_ASTERISK \u002A +C_QUESTION_MARK \u003F +C_PLUS_SIGN \u002B +// TODO add classes for non symbols string symbols and quote symbols +C_TEXTDATA [\u0020-\u0021\u0023-\u0026\u0028-\u003B\u003D\u003F-\u005B\u005D-\u007E] // everything except ' " < > \ +C_EQUALS_SIGN \u003D // = // C_CR \u000D // C_LF \u000A // C_TAB \u0009 // C_VTAB \u000B -// EQUALS_SIGN \u003D // = -// QUESTION_MARK \u003F // ? -// PLUS_SIGN \u002B // + -// ASTERISK \u002A // * // EXCLAMATION_MARK \u0021 // ! // DEFINE \u003A\u003A\u003D // ::= // ASSIGN \u003A\u003D // := @@ -71,35 +74,41 @@ C_TEXTDATA [\u0020-\u0021\u0023-\u0026\u0028-\u003B\u003D\u003F-\u007E] // every // https://stackoverflow.com/questions/31862815/jison-lex-without-white-spaces // https://github.com/zaach/jison/wiki/Deviations-From-Flex-Bison -(({C_TEXTDATA}|{C_APOSTROPHE}|{C_QUOTATION_MARK})+) { return 'NONTERM' } -({C_GREATER_THAN}) { this.popState(); return '>' } +({C_BACKSLASH}?({C_TEXTDATA}|{C_APOSTROPHE}|{C_QUOTATION_MARK})|{C_BACKSLASH}({C_LESS_THAN}|{C_GREATER_THAN}|{C_BACKSLASH}))+ { return 'NONTERM' } +{C_GREATER_THAN} { this.popState(); return '>' } -(({C_TEXTDATA}|{C_LESS_THAN}|{C_GREATER_THAN}|{C_APOSTROPHE})+) { return 'QSTRING' } -({C_QUOTATION_MARK}) { this.popState(); return '"' } +// TODO add optional backslash +({C_TEXTDATA}|{C_LESS_THAN}|{C_GREATER_THAN}|{C_APOSTROPHE}|{C_BACKSLASH}{C_QUOTATION_MARK})+ { return 'QSTRING' } +{C_QUOTATION_MARK} { this.popState(); return '"' } -(({C_TEXTDATA}|{C_LESS_THAN}|{C_GREATER_THAN}|{C_QUOTATION_MARK})+) { return 'STRING' } -({C_APOSTROPHE}) { this.popState(); return 'APOSTROPHE' } +// TODO add optional backslash +({C_TEXTDATA}|{C_LESS_THAN}|{C_GREATER_THAN}|{C_QUOTATION_MARK}|{C_BACKSLASH}{C_APOSTROPHE})+ { return 'STRING' } +{C_APOSTROPHE} { this.popState(); return 'APOSTROPHE' } ("railroad-beta") { this.pushState('diag'); return 'railroad-beta' } -<*>([A-Za-z_][A-Za-z0-9_]*) { return 'IDENTIFIER' } -<*>({C_VERTICAL_LINE}|{C_SLASH}) { return '|' } -<*>({C_COMMA}) { return ',' } -<*>("::="|":="|":"|"="|"->") { return '=' } -<*>({C_SEMICOLON}|{C_DOT}) { return ';' } -<*>({C_LEFT_PARENTHESIS}) { return '(' } -<*>({C_RIGHT_PARENTHESIS}) { return ')' } -<*>({C_LEFT_SQUARE_BRACKET}) { return '[' } -<*>({C_RIGHT_SQUARE_BRACKET}) { return ']' } -<*>({C_LEFT_CURLY_BRACKET}) { return '{' } -<*>({C_RIGHT_CURLY_BRACKET}) { return '}' } -<*>({C_LESS_THAN}) { this.pushState('nonterm'); return '<' } -<*>({C_GREATER_THAN}) { return '>' } -<*>({C_QUOTATION_MARK}) { this.pushState('qstring'); return '"' } -<*>({C_APOSTROPHE}) { this.pushState('string'); return 'APOSTROPHE' } -<*>({C_QUANTIFIER}) { return 'QUANTIFIER' } +<*>[A-Za-z_][A-Za-z0-9_]* { return 'IDENTIFIER' } +<*>[0-9]|[1-9][0-9]+ { return 'NUMBER' } +<*>{C_VERTICAL_LINE}|{C_SLASH} { return '|' } +<*>{C_COMMA} { return ',' } +<*>"::="|":="|":"|"=>"|"="|"->" { return '=' } // assignment +<*>{C_SEMICOLON}|{C_DOT} { return ';' } +<*>{C_LEFT_PARENTHESIS} { return '(' } +<*>{C_RIGHT_PARENTHESIS} { return ')' } +<*>{C_LEFT_SQUARE_BRACKET} { return '[' } +<*>{C_RIGHT_SQUARE_BRACKET} { return ']' } +<*>{C_LEFT_CURLY_BRACKET} { return '{' } +<*>{C_RIGHT_CURLY_BRACKET} { return '}' } +<*>{C_HYPHEN} { return '-' } +<*>{C_ASTERISK} { return '*' } +<*>{C_PLUS_SIGN} { return '+' } +<*>{C_QUESTION_MARK} { return '?' } +<*>{C_LESS_THAN} { this.pushState('nonterm'); return '<' } +<*>{C_GREATER_THAN} { return '>' } +<*>{C_QUOTATION_MARK} { this.pushState('qstring'); return '"' } +<*>{C_APOSTROPHE} { this.pushState('string'); return 'APOSTROPHE' } <*><> { return 'EOF' } // match end of file -<*>(\s+) {} +<*>\s+ {} /lex @@ -231,10 +240,10 @@ choice alternatives : sequence "|" alternatives\[tail_] { - $$=[$sequence, ...$tail_]; + $$ = [$sequence, ...$tail_]; } | sequence { - $$=[$sequence]; + $$ = [$sequence]; } | { $$ = [yy.addEpsilon()]; @@ -242,22 +251,31 @@ alternatives ; sequence - : (fact ","?)+\[facts_] { - $$ = yy.addSequence(Object.values($facts_)); + : (item ","?)+\[items_] { + $$ = yy.addSequence(Object.values($items_)); } ; +item + : fact { $$ = $fact; } + | fact\[base_] '-' fact\[except_] { $$ = yy.addException($base_, $except_) } + ; + fact - : prim QUANTIFIER?\[quantifier_] { - switch($quantifier_) { - case '?': $$ = yy.addZeroOrOne($prim); break; - case '+': $$ = yy.addOneOrMany($prim); break; - case '*': $$ = yy.addZeroOrMany($prim); break; - default: $$ = $prim; - }; - } - | integer '*' prim { - + : prim '?' { + $$ = yy.addZeroOrOne($prim); + } + | prim '+' { + $$ = yy.addOneOrMany($prim); + } + | prim '*' { + $$ = yy.addZeroOrMany($prim); + } + | prim { + $$ = $prim; + } + | NUMBER '*' prim { + $$ = yy.addRepetitions($prim, $number_); } ; @@ -266,10 +284,18 @@ prim | '[' choice ']' { $$=yy.addZeroOrOne($choice); } | '{' choice '}' { $$=yy.addZeroOrMany($choice); } | '"' (QSTRING)?\[qstring_] '"' { - $$=yy.addTerm($qstring_, '"'); // TODO: add bunch of terminals instead of one? + if($qstring_) { + $$=yy.addTerm($qstring_); + } else { + $$=yy.addEpsilon(); + } } | APOSTROPHE (STRING)?\[string_] APOSTROPHE { - $$=yy.addTerm($string_, "'"); + if($string_) { + $$=yy.addTerm($string_); + } else { + $$=yy.addEpsilon(); + } } | non_term { $$=yy.addNonTerm($non_term); } ;