Skip to content

Conversion of Regex syntax to BNF syntax #803

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,13 @@ let package = Package(
"_RegexParser",
"_StringProcessing"
]),
.executableTarget(
name: "Regex2BNF",
dependencies: [
.product(name: "ArgumentParser", package: "swift-argument-parser"),
"_RegexParser"
],
swiftSettings: [availabilityDefinition]),
.executableTarget(
name: "RegexTester",
dependencies: [
Expand Down
91 changes: 91 additions & 0 deletions Sources/Regex2BNF/Regex2BNF.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021-2025 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

import ArgumentParser
import _RegexParser

/// Command-line entry point that prints the BNF produced for a regex pattern.
///
/// When `-e`/`--examples` is passed, a fixed list of sample patterns is
/// converted first. The `pattern` argument is converted afterwards in
/// either case.
// NOTE(review): `pattern` has no default value, so it appears to be required
// even when `--examples` is given — confirm that this is intended.
@main
@available(SwiftStdlib 5.8, *)
struct Regex2BNF: ParsableCommand {
  @Argument(help: "The regex pattern to convert to BNF.")
  var pattern: String

  @Flag(
    name: [.customShort("e"), .customLong("examples")],
    help: "Run several examples")
  var runExamples = false

  /// Prints `pattern` wrapped in slashes followed by a blank line, then
  /// prints the result of `convertRegexToBNF` for that pattern.
  ///
  /// - Parameter pattern: The regex source text to convert.
  /// - Throws: Whatever `convertRegexToBNF` throws.
  func convert(_ pattern: String) throws {
    print("/" + pattern + "/\n")
    let converted = try convertRegexToBNF(
      regex: pattern,
      namespace: "RE2BNF",
      version: 0)
    print(converted)
  }

  /// Converts the built-in examples (if requested) and then `pattern`.
  ///
  /// Conversion stops at the first pattern that throws.
  mutating func run() throws {
    if runExamples {
      // TODO: Turn into test cases
      let examples: [String] = [
        // Single-scalar character literals
        "a",
        "Z",
        "あ",
        "日",
        "\u{301}",

        // Multi-scalar character literals
        "🧟‍♀️",
        "e\u{301}",

        // Simple alternations
        "a|b",
        "a|b|c|d",
        "a|🧟‍♀️\u{301}日|z",

        // Simple quantifications
        "a*",
        "a+",
        "a?",
        "a{2,10}",
        "a{,10}",
        "a{2,}",

        // Grouping
        "a(b|c)d",
        "a(?:b|c)d",
        "a(bcd|def(g|h)+)z",

        // Dot
        ".*",
        "(a|b)*.{3}(a|b)",

        // Builtin character classes
        #"\(\d{3}\)\d{3}-\d{4}"#,
        #"\s+keyword\s+"#,

        // Look at optimizer output, the quant child is very long
        "a(123456789)+b",

        "Hi the time right now is (AM|PM)",

        "a(b|c)*d{2,4}e?",
      ]

      for example in examples {
        try convert(example)
      }
    }

    try convert(pattern)
  }
}
186 changes: 186 additions & 0 deletions Sources/_RegexParser/Regex/BNF/BNF.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021-2025 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

/// A piece of a BNF grammar that can produce its own textual form.
protocol BNFNode: CustomStringConvertible {
  /// Returns the text for this node.
  func render() -> String
}

extension BNFNode {
  /// `CustomStringConvertible` support: forwards to `render()`.
  var description: String {
    return render()
  }
}

/// A grammar value holding a `root` rule and the list of `rules` to print.
struct BNF: BNFNode {
  var root: Rule
  var rules: [Rule]

  /// Renders every rule in `rules`, one per line.
  ///
  /// `root` is not emitted separately here; only the contents of `rules`
  /// contribute to the output.
  ///
  /// - Returns: The empty string when `rules` is empty; otherwise the
  ///   rendered rules separated by newlines, with a trailing newline.
  func render() -> String {
    guard !rules.isEmpty else {
      return ""
    }
    return rules.map { $0.render() }.joined(separator: "\n") + "\n"
  }
}

/// A single production, rendered as `<symbol> ::= <expression>`.
struct Rule: BNFNode {
  /// The left-hand side
  var symbol: NonTerminalSymbol

  /// The right-hand side
  var expression: Expression

  /// Stored alongside the rule; not consulted by `render()`.
  var predicates: [CharacterPredicate] = []

  /// Returns the rendered symbol, the `::=` separator, and the rendered
  /// expression.
  func render() -> String {
    let lhs = symbol.render()
    let rhs = expression.render()
    return lhs + " ::= " + rhs
  }
}

/// Wraps a closure that answers yes/no for a single Unicode scalar.
struct CharacterPredicate {
  // TODO: convention c or trivial?
  /// The wrapped test.
  let impl: (_ scalar: Unicode.Scalar) -> Bool
}

/// A named non-terminal, rendered with angle brackets around its name.
struct NonTerminalSymbol: Hashable, BNFNode {
  var name: String

  /// Returns `name` wrapped in `<` and `>`.
  func render() -> String {
    return "<" + name + ">"
  }
}

/// The right-hand side of a rule: a list of alternative choices.
struct Expression: BNFNode {
  var choices: [Choice]

  /// Returns the rendered choices separated by ` | `.
  func render() -> String {
    let alternatives = choices.map { $0.render() }
    return alternatives.joined(separator: " | ")
  }
}

/// One alternative of an expression: a sequence of symbols.
struct Choice: BNFNode {
  var sequence: [Symbol]

  /// Creates a choice from an array of symbols.
  init(_ symbols: [Symbol]) {
    sequence = symbols
  }

  /// Creates a choice from a variadic list of symbols.
  init(_ symbols: Symbol...) {
    self.init(symbols)
  }

  /// Returns the rendered symbols separated by single spaces.
  func render() -> String {
    let parts = sequence.map { $0.render() }
    return parts.joined(separator: " ")
  }
}

/// An element of a choice: a terminal, a run of terminals, a non-terminal,
/// or a builtin class.
enum Symbol: BNFNode {
  case terminal(TerminalSymbol)
  case terminalSequence([TerminalSymbol])
  case nonTerminal(NonTerminalSymbol)
  case builtin(Builtin)

  /// Returns the payload's rendering.
  ///
  /// A terminal sequence renders its elements separated by single spaces;
  /// an empty sequence renders as a pair of double quotes.
  func render() -> String {
    switch self {
    case let .terminal(terminal):
      return terminal.render()

    case let .terminalSequence(terminals):
      if terminals.isEmpty {
        return "\"\""
      }
      return terminals.map { $0.render() }.joined(separator: " ")

    case let .nonTerminal(nonTerminal):
      return nonTerminal.render()

    case let .builtin(builtin):
      return builtin.render()
    }
  }
}

/// Builtin character classes that render as fixed, predefined non-terminal
/// names.
enum Builtin: BNFNode {
  case any // NOTE: we map dot to this, not sure if we want non-newline dots
  case whitespace
  case notWhitespace
  case decimalDigit
  case notDecimalDigit
  case wordCharacter
  case notWordCharacter

  /// Returns the fixed name for this builtin class.
  ///
  /// The negated classes have no rendering yet and trap with a descriptive
  /// message.
  func render() -> String {
    // The trap messages deliberately avoid interpolating `self`: this type's
    // `description` forwards to `render()`, which would recurse.
    switch self {
    case .any:
      return "<ALL_CHARACTERS_EXCEPT_QUOTE_AND_BACKSLASH>"
    case .whitespace:
      return "<WHITESPACES_AND_NEWLINES>"
    case .notWhitespace:
      fatalError("Builtin.notWhitespace has no BNF rendering yet")
    case .decimalDigit:
      return "<DECIMAL_DIGITS>"
    case .notDecimalDigit:
      fatalError("Builtin.notDecimalDigit has no BNF rendering yet")
    case .wordCharacter:
      return "<ALPHANUMERICS>"
    case .notWordCharacter:
      fatalError("Builtin.notWordCharacter has no BNF rendering yet")
    }
  }
}

// Placeholder type: an enum with no cases, so no value of it can be created.
enum CharacterSet {}

/// A terminal of the grammar.
enum TerminalSymbol: BNFNode {
  case character(Unicode.Scalar)
  case characterSet(CharacterSet)
  case utf8CodeUnit(UInt8)

  case characterPredicate(CharacterPredicate)

  /// Returns the payload wrapped in double quotes.
  ///
  /// Character sets and character predicates have no rendering yet and trap
  /// with a descriptive message.
  func render() -> String {
    switch self {
    case .character(let scalar):
      // NOTE(review): the scalar is emitted verbatim, so a `"` or `\` scalar
      // is not escaped — confirm what the consumer of this output expects.
      return "\"\(scalar)\""
    case .characterSet:
      fatalError("TerminalSymbol.characterSet has no BNF rendering yet")
    case .utf8CodeUnit(let codeUnit):
      // NOTE(review): this prints the code unit's decimal value inside
      // quotes — confirm that is the intended notation.
      return "\"\(codeUnit)\""
    case .characterPredicate:
      fatalError("TerminalSymbol.characterPredicate has no BNF rendering yet")
    }
  }
}

// Unlabeled convenience initializers for `Expression`.
extension Expression {
  /// Creates an expression from an array of choices.
  init(_ choices: [Choice]) {
    self.init(choices: choices)
  }

  /// Creates an expression from a variadic list of choices.
  init(_ choices: Choice...) {
    self.init(choices: choices)
  }
}

// Convenience initializers that build a `Choice` purely from non-terminals.
extension Choice {
  /// Creates a choice whose symbols are the given non-terminals, in order.
  init(_ elements: [NonTerminalSymbol]) {
    let symbols: [Symbol] = elements.map { .nonTerminal($0) }
    self.init(symbols)
  }

  /// Variadic form of the non-terminal initializer.
  init(_ elements: NonTerminalSymbol...) {
    self.init(elements)
  }
}

/*


node -> choice

*/
Loading