Skip to content

Commit d79699f

Browse files
committed
Add conversion from regex syntax to BNF syntax
1 parent 3d2bdaa commit d79699f

File tree

5 files changed

+809
-0
lines changed

5 files changed

+809
-0
lines changed

Package.swift

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,13 @@ let package = Package(
135135
"_RegexParser",
136136
"_StringProcessing"
137137
]),
138+
.executableTarget(
139+
name: "Regex2BNF",
140+
dependencies: [
141+
.product(name: "ArgumentParser", package: "swift-argument-parser"),
142+
"_RegexParser"
143+
],
144+
swiftSettings: [availabilityDefinition]),
138145
.executableTarget(
139146
name: "RegexTester",
140147
dependencies: [

Sources/Regex2BNF/Regex2BNF.swift

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
import ArgumentParser
13+
import _RegexParser
14+
15+
/// Command-line entry point that prints the BNF produced for a regex pattern.
@main
@available(SwiftStdlib 5.8, *)
struct Regex2BNF: ParsableCommand {
  @Argument(help: "The regex pattern to convert to BNF.")
  var pattern: String

  @Flag(
    name: [.customShort("e"), .customLong("examples")],
    help: "Run several examples")
  var runExamples = false

  /// Prints `pattern` wrapped in slashes, then the result of converting it.
  ///
  /// - Parameter pattern: The regex source text to convert.
  /// - Throws: Any error thrown by `convertRegexToBNF`.
  func convert(_ pattern: String) throws {
    print("/\(pattern)/\n")
    let grammar = try convertRegexToBNF(
      regex: pattern, namespace: "RE2BNF", version: 0)
    print(grammar)
  }

  /// Converts the built-in examples when `runExamples` is set, then always
  /// converts the `pattern` argument.
  mutating func run() throws {
    if runExamples {
      // TODO: Turn into test cases
      let examples: [String] = [
        // Single-scalar character literals
        // NOTE(review): the two empty patterns may have lost their original
        // characters somewhere along the way -- confirm.
        "a",
        "Z",
        "",
        "",
        "\u{301}",

        // Multi-scalar character literals
        "🧟‍♀️",
        "e\u{301}",

        // Simple alternations
        "a|b",
        "a|b|c|d",
        "a|🧟‍♀️\u{301}日|z",

        // Simple quantifications
        "a*",
        "a+",
        "a?",
        "a{2,10}",
        "a{,10}",
        "a{2,}",

        // Grouping
        "a(b|c)d",
        "a(?:b|c)d",
        "a(bcd|def(g|h)+)z",

        // Dot
        ".*",
        "(a|b)*.{3}(a|b)",

        // Builtin character classes
        #"\(\d{3}\)\d{3}-\d{4}"#,
        #"\s+keyword\s+"#,

        // Look at optimizer output, the quant child is very long
        "a(123456789)+b",

        "Hi the time right now is (AM|PM)",

        "a(b|c)*d{2,4}e?",
      ]

      // Stops at the first example whose conversion throws.
      for example in examples {
        try convert(example)
      }
    }

    try convert(pattern)
  }
}
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
/// A piece of a BNF grammar that can produce its own textual form.
protocol BNFNode: CustomStringConvertible {
  /// Returns the text for this node.
  func render() -> String
}

extension BNFNode {
  /// Default `CustomStringConvertible` conformance: the rendered text.
  var description: String {
    return render()
  }
}
7+
8+
/// A grammar value holding a `root` rule and a list of `rules`.
struct BNF: BNFNode {
  var root: Rule
  var rules: [Rule]

  /// Renders each element of `rules` on its own newline-terminated line.
  ///
  /// Returns the empty string when `rules` is empty. `root` is not rendered.
  func render() -> String {
    guard !rules.isEmpty else {
      return ""
    }
    let lines = rules.map { $0.render() }
    return lines.joined(separator: "\n") + "\n"
  }
}
24+
25+
/// A single production, rendered as `<symbol> ::= <expression>`.
struct Rule: BNFNode {
  /// The left-hand side.
  var symbol: NonTerminalSymbol

  /// The right-hand side.
  var expression: Expression

  /// Stored alongside the rule; not consulted by `render()`.
  var predicates: [CharacterPredicate] = []

  /// Joins the rendered left- and right-hand sides with ` ::= `.
  func render() -> String {
    let lhs = symbol.render()
    let rhs = expression.render()
    return lhs + " ::= " + rhs
  }
}
37+
38+
/// Wraps a closure that answers yes/no for a single Unicode scalar.
struct CharacterPredicate {
  // TODO: convention c or trivial?
  let impl: (_ scalar: Unicode.Scalar) -> Bool
}
42+
43+
/// A named non-terminal, rendered with its name in angle brackets.
struct NonTerminalSymbol: Hashable, BNFNode {
  var name: String

  /// Returns `name` wrapped in `<` and `>`.
  func render() -> String {
    return "<" + name + ">"
  }
}
50+
51+
/// The right-hand side of a rule: a list of alternative choices.
struct Expression: BNFNode {
  var choices: [Choice]

  /// Renders every choice and joins them with ` | `.
  func render() -> String {
    let alternatives = choices.map { $0.render() }
    return alternatives.joined(separator: " | ")
  }
}
58+
59+
/// One alternative of an expression: an ordered sequence of symbols.
struct Choice: BNFNode {
  var sequence: [Symbol]

  /// Creates a choice from an array of symbols.
  init(_ symbols: [Symbol]) {
    sequence = symbols
  }

  /// Creates a choice from a variadic list of symbols.
  init(_ symbols: Symbol...) {
    self.init(symbols)
  }

  /// Renders every symbol and joins them with a single space.
  func render() -> String {
    let parts = sequence.map { $0.render() }
    return parts.joined(separator: " ")
  }
}
73+
74+
/// An element of a choice: a terminal, a run of terminals, a non-terminal, or
/// a builtin.
enum Symbol: BNFNode {
  case terminal(TerminalSymbol)
  case terminalSequence([TerminalSymbol])
  case nonTerminal(NonTerminalSymbol)
  case builtin(Builtin)

  /// Delegates to the payload's `render()`. A terminal sequence renders its
  /// elements separated by spaces, or `""` (two quote characters) when empty.
  func render() -> String {
    switch self {
    case let .terminal(terminal):
      return terminal.render()

    case let .terminalSequence(terminals):
      if terminals.isEmpty {
        return "\"\""
      }
      let parts = terminals.map { $0.render() }
      return parts.joined(separator: " ")

    case let .nonTerminal(nonTerminal):
      return nonTerminal.render()

    case let .builtin(builtin):
      return builtin.render()
    }
  }
}
99+
100+
/// Character classes that render as fixed, predefined non-terminal names.
enum Builtin: BNFNode {
  case any // NOTE: we map dot to this, not sure if we want non-newline dots
  case whitespace
  case notWhitespace
  case decimalDigit
  case notDecimalDigit
  case wordCharacter
  case notWordCharacter

  /// Returns the predefined non-terminal name for this builtin.
  ///
  /// The negated classes have no rendering and trap. The trap messages are
  /// static strings on purpose: interpolating `self` would go through
  /// `description`, which calls `render()` again.
  func render() -> String {
    switch self {
    case .any:
      return "<ALL_CHARACTERS_EXCEPT_QUOTE_AND_BACKSLASH>"
    case .whitespace:
      return "<WHITESPACES_AND_NEWLINES>"
    case .notWhitespace:
      fatalError("Builtin.notWhitespace has no BNF rendering")
    case .decimalDigit:
      return "<DECIMAL_DIGITS>"
    case .notDecimalDigit:
      fatalError("Builtin.notDecimalDigit has no BNF rendering")
    case .wordCharacter:
      return "<ALPHANUMERICS>"
    case .notWordCharacter:
      fatalError("Builtin.notWordCharacter has no BNF rendering")
    }
  }
}
128+
129+
// An enum with no cases, so no `CharacterSet` value can be constructed.
// NOTE(review): presumably a placeholder for a future character-set model -- confirm.
enum CharacterSet {}
130+
131+
/// A terminal of the grammar.
enum TerminalSymbol: BNFNode {
  case character(Unicode.Scalar)
  case characterSet(CharacterSet)
  case utf8CodeUnit(UInt8)

  case characterPredicate(CharacterPredicate)

  /// Returns the payload wrapped in double quotes.
  ///
  /// Character sets and character predicates have no rendering and trap.
  func render() -> String {
    switch self {
    case .character(let scalar):
      // NOTE(review): the scalar is emitted verbatim, so a `"` or `\` scalar
      // is not escaped inside the quotes -- confirm this is acceptable.
      return "\"\(scalar)\""
    case .characterSet(_):
      fatalError("TerminalSymbol.characterSet has no BNF rendering")
    case .utf8CodeUnit(let codeUnit):
      // NOTE(review): this emits the code unit's decimal value (e.g. "97"),
      // not the character it encodes -- confirm intended.
      return "\"\(codeUnit)\""
    case .characterPredicate(_):
      fatalError("TerminalSymbol.characterPredicate has no BNF rendering")
    }
  }
}
151+
152+
extension Expression {
  /// Unlabeled convenience over the memberwise initializer.
  init(_ choices: [Choice]) {
    self = Expression(choices: choices)
  }

  /// Variadic convenience: `Expression(a, b, c)`.
  init(_ choices: Choice...) {
    self.init(choices: choices)
  }
}
160+
161+
extension Choice {
  /// Creates a choice whose symbols are the given non-terminals, in order.
  init(_ elements: [NonTerminalSymbol]) {
    let wrapped: [Symbol] = elements.map(Symbol.nonTerminal)
    self.init(wrapped)
  }

  /// Variadic convenience over the array-of-non-terminals initializer.
  init(_ elements: NonTerminalSymbol...) {
    self.init(elements)
  }
}
169+
170+
/*
171+
172+
173+
node -> choice
174+
175+
*/

0 commit comments

Comments
 (0)