Skip to content

Commit 6ba5cd6

Browse files
committed
Support character classes
1 parent 07a2213 commit 6ba5cd6

File tree

3 files changed

+113
-15
lines changed

3 files changed

+113
-15
lines changed

Sources/Regex2BNF/Regex2BNF.swift

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,14 +60,26 @@ struct Regex2BNF: ParsableCommand {
6060

6161
// print("Grouping")
6262
try convert("a(b|c)d")
63+
try convert("a(?:b|c)d")
6364
try convert("a(bcd|def(g|h)+)z")
6465

6566
// print("Dot")
66-
// try convert(".*")
67-
// try convert("(a|b)*.{3}(a|b)")
67+
try convert(".*")
68+
try convert("(a|b)*.{3}(a|b)")
69+
70+
// print("Builtin character classes")
71+
try convert(#"\(\d{3}\)\d{3}-\d{4}"#)
72+
try convert(#"\s+keyword\s+"#)
6873

6974

7075
// print("[Done]")
76+
77+
// Look at optimizer output, the quant child is very long
78+
try convert("a(123456789)+b")
79+
80+
try convert("Hi the time right now is (AM|PM)")
81+
82+
try convert("a(b|c)*d{2,4}e?")
7183
}
7284
try convert(pattern)
7385

Sources/_RegexParser/Regex/BNF/BNF.swift

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
1-
struct BNF {
1+
/// A piece of the BNF model that can produce a textual rendering of itself.
///
/// Refines `CustomStringConvertible`, so every conforming type also exposes
/// a `description`.
protocol BNFNode: CustomStringConvertible {
  /// Returns the textual form of this node.
  func render() -> String
}
4+
// Default `description`: a node describes itself with its rendered text.
extension BNFNode {
  var description: String {
    return render()
  }
}
7+
8+
struct BNF: BNFNode {
29
var root: Rule
310
var rules: [Rule]
411

@@ -15,7 +22,7 @@ struct BNF {
1522
}
1623
}
1724

18-
struct Rule {
25+
struct Rule: BNFNode {
1926
// The left-hand side
2027
var symbol: NonTerminalSymbol
2128

@@ -33,23 +40,23 @@ struct CharacterPredicate {
3340
let impl: (Unicode.Scalar) -> Bool
3441
}
3542

36-
struct NonTerminalSymbol: Hashable {
43+
/// A non-terminal, identified by its name.
struct NonTerminalSymbol: Hashable, BNFNode {
  var name: String

  /// Returns the name wrapped in angle brackets, e.g. `<name>`.
  func render() -> String {
    return "<" + name + ">"
  }
}
4350

44-
struct Expression {
51+
/// An ordered list of alternative choices.
struct Expression: BNFNode {
  var choices: [Choice]

  /// Returns every choice's rendering, joined with `" | "`.
  func render() -> String {
    let renderedChoices = choices.map { $0.render() }
    return renderedChoices.joined(separator: " | ")
  }
}
5158

52-
struct Choice {
59+
struct Choice: BNFNode {
5360
var sequence: [Symbol]
5461

5562
init(_ symbols: Array<Symbol>) {
@@ -64,29 +71,64 @@ struct Choice {
6471
}
6572
}
6673

67-
enum Symbol {
74+
/// One element of a choice's sequence.
enum Symbol: BNFNode {
  case terminal(TerminalSymbol)
  case terminalSequence([TerminalSymbol])
  case nonTerminal(NonTerminalSymbol)
  case builtin(Builtin)

  /// Returns the rendering of the wrapped payload.
  ///
  /// An empty terminal sequence renders as a pair of double quotes; a
  /// non-empty one renders each terminal, separated by single spaces.
  func render() -> String {
    switch self {
    case let .terminal(terminal):
      return terminal.render()

    case let .terminalSequence(terminals):
      if terminals.isEmpty {
        return "\"\""
      }
      let pieces = terminals.map { $0.render() }
      return pieces.joined(separator: " ")

    case let .nonTerminal(nonTerminal):
      return nonTerminal.render()

    case let .builtin(builtin):
      return builtin.render()
    }
  }
}
99+
100+
/// Character classes that render as a fixed, angle-bracketed name rather
/// than as explicit terminals.
enum Builtin: BNFNode {
  case any // NOTE: we map dot to this, not sure if we want non-newline dots
  case whitespace
  case notWhitespace
  case decimalDigit
  case notDecimalDigit
  case wordCharacter
  case notWordCharacter

  /// Returns the angle-bracketed name for this class.
  ///
  /// The negated classes have no rendering yet; asking for one traps with a
  /// message naming the offending case so the failure can be diagnosed.
  func render() -> String {
    switch self {
    case .any:
      // NOTE(review): the emitted name excludes quote and backslash, which is
      // narrower than "any character" — confirm this is intended.
      return "<ALL_CHARACTERS_EXCEPT_QUOTE_AND_BACKSLASH>"
    case .whitespace:
      return "<WHITESPACES_AND_NEWLINES>"
    case .decimalDigit:
      return "<DECIMAL_DIGITS>"
    case .wordCharacter:
      // NOTE(review): regex word characters usually include `_`; verify that
      // the rule behind this name matches the intended set.
      return "<ALPHANUMERICS>"
    case .notWhitespace, .notDecimalDigit, .notWordCharacter:
      fatalError("No BNF rendering for negated character class '\(self)'")
    }
  }
}
86128

87129
// An enum with no cases: no value of this type can be constructed.
// NOTE(review): looks like a placeholder for a future character-set model — confirm.
enum CharacterSet {}
88130

89-
enum TerminalSymbol {
131+
enum TerminalSymbol: BNFNode {
90132
case character(Unicode.Scalar)
91133
case characterSet(CharacterSet)
92134
case utf8CodeUnit(UInt8)

Sources/_RegexParser/Regex/BNF/BNFConvert.swift

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -157,13 +157,20 @@ extension BNFConvert {
157157

158158
return [s]
159159

160+
case .dot:
161+
return [.builtin(.any)]
162+
163+
case .escaped(let b):
164+
let builtin = try mapEscapedBuiltin(b)
165+
return [.builtin(builtin)]
166+
160167
case .scalar(_): fatalError()
161168
case .scalarSequence(_): fatalError()
162169
case .keyboardControl(_): fatalError()
163170
case .keyboardMeta(_): fatalError()
164171
case .keyboardMetaControl(_): fatalError()
165172

166-
case .property, .escaped, .dot, .caretAnchor, .dollarAnchor,
173+
case .property, .escaped, .caretAnchor, .dollarAnchor,
167174
.backreference, .subpattern, .namedCharacter, .callout,
168175
.backtrackingDirective, .changeMatchingOptions, .invalid:
169176
fatalError()
@@ -192,6 +199,40 @@ extension BNFConvert {
192199
}
193200

194201
extension BNFConvert {
202+
/// Maps an escaped builtin atom to the `Builtin` class used for rendering.
///
/// Only the whitespace, decimal-digit, and word-character classes (and their
/// negations) are mapped. Every other escape traps with a message naming the
/// escape and its category. The function is declared `throws`, but no path
/// in this body throws.
///
/// - Parameter b: The escaped builtin to convert.
/// - Returns: The `Builtin` case with the same name as `b`.
func mapEscapedBuiltin(_ b: AST.Atom.EscapedBuiltin) throws -> Builtin {
  switch b {

  // Scalar escapes
  case .alarm, .escape, .formfeed, .newline, .carriageReturn, .tab, .backspace:
    fatalError("Unsupported scalar escape in BNF conversion: \(b)")

  // Built-in character classes
  case .whitespace: return .whitespace
  case .notWhitespace: return .notWhitespace
  case .decimalDigit: return .decimalDigit
  case .notDecimalDigit: return .notDecimalDigit
  case .wordCharacter: return .wordCharacter
  case .notWordCharacter: return .notWordCharacter

  // Other character classes
  case .horizontalWhitespace, .notHorizontalWhitespace, .notNewline, .newlineSequence, .verticalTab, .notVerticalTab:
    fatalError("Unsupported character class in BNF conversion: \(b)")

  // Assertions
  case .wordBoundary, .notWordBoundary:
    fatalError("Unsupported assertion in BNF conversion: \(b)")

  // Anchors
  case .startOfSubject, .endOfSubjectBeforeNewline, .endOfSubject, .firstMatchingPositionInSubject:
    fatalError("Unsupported anchor in BNF conversion: \(b)")

  // Other
  case .singleDataUnit, .graphemeCluster, .resetStartOfMatch, .trueAnychar, .textSegment, .notTextSegment:
    fatalError("Unsupported escaped builtin in BNF conversion: \(b)")

  }
}
195236

196237
mutating func createQuantify(
197238
_ child: NonTerminalSymbol,
@@ -232,7 +273,7 @@ extension BNFConvert {
232273

233274
case .zeroOrOne:
234275
// QUANT ::= QUANT_CHILD | <empty>
235-
let name = symbols.genSym("QUANT_+")
276+
let name = symbols.genSym("QUANT_?")
236277
let choices = [
237278
Choice(child),
238279
emptyChoice
@@ -413,9 +454,12 @@ extension BNFConvert {
413454

414455
// TODO: This isn't a win when RHS already has uses
415456
if val.count == 1 {
416-
if case .nonTerminal(let rhs) = val.first!.sequence.first! {
417-
productions[rootSymbol] = productions[rhs]
418-
changed = true
457+
let seq = val.first!.sequence
458+
if seq.count == 1 {
459+
if case .nonTerminal(let rhs) = seq.first! {
460+
productions[rootSymbol] = productions[rhs]
461+
changed = true
462+
}
419463
}
420464
}
421465
}

0 commit comments

Comments
 (0)