diff --git a/Documentation/1. TallyType protocol.md b/Documentation/1. TallyType protocol.md
index 18f9af6..1260c65 100644
--- a/Documentation/1. TallyType protocol.md
+++ b/Documentation/1. TallyType protocol.md
@@ -5,7 +5,7 @@ You can create your own tokenizers by implementing the `TallyType` protocol.
 ````Swift
 /// A tuple capturing information about a token match.
 ///
-/// - tokenType: The instance of `TokenType` that matched the token.
+/// - tokenizer: The instance of `TokenType` that matched the token.
 /// - text: The text that the token matched.
 /// - range: The range of the matched text in the original input.
 public typealias Token = (tokenType: TokenType, text: String, range: Range<String.Index>)
diff --git a/Documentation/3. Expressive matching using enums.md b/Documentation/3. Expressive matching.md
similarity index 68%
rename from Documentation/3. Expressive matching using enums.md
rename to Documentation/3. Expressive matching.md
index 9bf7bec..5ac1f03 100644
--- a/Documentation/3. Expressive matching using enums.md
+++ b/Documentation/3. Expressive matching.md
@@ -1,11 +1,10 @@
-# Example: expressive matching using enums
-
+# Example: expressive matching
 
 tokens(from:) -> [Token]
 
-The results returned by `tokens(from:)`returns an array of `Token` where `Token` is a typealias of the tuple `(tokenType: TokenType, text: String, range: Range<String.Index>)`
+The `tokens(from:)` method returns an array of `Token`, where `Token` is a typealias of the tuple `(tokenizer: TokenType, text: String, range: Range<String.Index>)`
 
-Which requires either type casting (using `as?`) type checking or type checking (using `is`) for the `tokenType` element to be useful:
+Which requires either type casting (using `as?`) or type checking (using `is`) for the `tokenizer` element to be useful:
 
 ````Swift
 import Mustard
@@ -14,31 +13,28 @@
 let messy = "123Hello world&^45.67"
 let tokens = messy.tokens(from: .decimalDigits, .letters)
 
 // using type checking
-if tokens[0].tokenType is EmojiToken {
+if tokens[0].tokenizer is EmojiToken {
     print("found emoji token")
 }
 
 // using type casting
-if let _ = tokens[0].tokenType as? NumberToken {
+if let _ = tokens[0].tokenizer as? NumberToken {
     print("found number token")
 }
 ````
 
-This can lead to bugs in your logic-- in the example above neither of the print statements will be executed since the TokenType used is actually the CharacterSet `extension`.
+This can lead to bugs in your logic: in the example above neither of the print statements will be executed, since the tokenizer used was actually the character sets `.decimalDigits` and `.letters`.
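+
+A safer pattern here (a sketch; it relies on Mustard's `CharacterSet` extension, so the
+`tokenizer` element can be cast back to the character set that was actually used):
+
+````Swift
+if let set = tokens[0].tokenizer as? CharacterSet, set == .decimalDigits {
+    print("found decimal digits token") // this is printed
+}
+````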
 
-Mustard can return a strongly typed set of token matches if a single `TokenType` is used.
+Mustard can return a strongly typed set of matches if a single `TokenType` is used.
 
 ````Swift
 import Mustard
 
 let messy = "123Hello world&^45.67"
 
-// create typealias using the single `TokenType` to use
-typealias NumberTokenMatch = (tokenType: NumberToken, text: String, range: Range<String.Index>)
-
 // call `tokens()` method on string to get matching tokens from string
-let numberTokens: [NumberTokenMatch] = messy.tokens()
+let numberTokens: [NumberToken.Token] = messy.tokens()
 ````
 
@@ -93,19 +89,29 @@ enum MixedToken: TokenType {
 }
 ````
 
-Then use Mustard with this single `MixedType`, and the the `tokenType` element in matches can be
-used without any casting, and the complier will give you an error if you attempt to use a type
-that doesn't make sense.
+Mustard defines a default typealias for `Token` that exposes the specific type in the
+results tuple.
+
+````Swift
+public extension TokenType {
+    typealias Token = (tokenizer: Self, text: String, range: Range<String.Index>)
+}
+````
+
+Setting your results array to this type gives you the option to use the shorter `tokens()` method,
+where Mustard uses the inferred type to perform tokenization.
+
+Since the matches array is strongly typed, you can be more expressive with the results, and the
+compiler can give you more hints to prevent you from making mistakes.
 
 ````Swift
-// define your own type alias for your enum-based TokenType
-typealias MixedMatch = (tokenType: MixedToken, text: String, range: Range<String.Index>)
 
 // use the `tokens()` method to grab tokens
-let matches: [MixedMatch] = "123👩‍👩‍👦‍👦Hello world👶 again👶🏿 45.67".tokens()
+let matches: [MixedToken.Token] = "123👩‍👩‍👦‍👦Hello world👶 again👶🏿 45.67".tokens()
+// matches.count -> 8
 
 matches.forEach({ match in
-    switch (match.token, match.text) {
+    switch (match.tokenizer, match.text) {
     case (.word, let word): print("word:", word)
     case (.number, let number): print("number:", number)
     case (.emoji, let emoji): print("emoji:", emoji)
diff --git a/Documentation/4. Tokens with internal state.md b/Documentation/4. Tokens with internal state.md
index 76ca91c..e8efd8d 100644
--- a/Documentation/4. Tokens with internal state.md
+++ b/Documentation/4. Tokens with internal state.md
@@ -108,27 +108,45 @@ func ~= (option: CharacterSet, input: UnicodeScalar) -> Bool {
     return option.contains(input)
 }
 
-class DateMatch: TokenType {
+class DateToken: TokenType {
 
-    let template = "00/00/00"
-    var position: String.UnicodeScalarIndex
+    // private properties
+    private let _template = "00/00/00"
+    private var _position: String.UnicodeScalarIndex
+    private var _dateText: String
+    private var _date: Date?
 
+    // public property
+    var date: Date {
+        return _date!
+    }
+
+    // formatters are expensive, so only instantiate once for all DateTokens
+    static let dateFormatter: DateFormatter = {
+        let dateFormatter = DateFormatter()
+        dateFormatter.dateFormat = "MM/dd/yy"
+        return dateFormatter
+    }()
+
+    // called when we access `DateToken.tokenizer`
     required init() {
-        position = template.unicodeScalars.startIndex
+        _position = _template.unicodeScalars.startIndex
+        _dateText = ""
     }
 
     func canTake(_ scalar: UnicodeScalar) -> Bool {
 
-        guard position < template.unicodeScalars.endIndex else {
+        guard _position < _template.unicodeScalars.endIndex else {
             // we've matched all of the template
             return false
         }
 
-        switch (template.unicodeScalars[position], scalar) {
+        switch (_template.unicodeScalars[_position], scalar) {
         case ("\u{0030}", CharacterSet.decimalDigits), // match with a decimal digit
              ("\u{002F}", "\u{002F}"):                 // match with the '/' character
 
-            position = template.unicodeScalars.index(after: position)
+            _position = _template.unicodeScalars.index(after: _position) // increment the template position
+            _dateText.unicodeScalars.append(scalar) // add scalar to text matched so far
             return true
 
         default:
@@ -137,25 +155,65 @@ class DateMatch: TokenType {
     }
 
     var isComplete: Bool {
-        return position == template.unicodeScalars.endIndex
+        if _position == _template.unicodeScalars.endIndex,
+            let date = DateToken.dateFormatter.date(from: _dateText) {
+            // we've reached the end of the template
+            // and the date text collected so far represents a valid
+            // date format (e.g. not 99/99/99)
+
+            _date = date
+            return true
+        }
+        else {
+            return false
+        }
     }
 
+    // reset the tokenizer for matching new date
     func prepareForReuse() {
-        position = template.unicodeScalars.startIndex
+        _dateText = ""
+        _date = nil
+        _position = _template.unicodeScalars.startIndex
+    }
+
+    // return an instance of the tokenizer to include in matching tokens
+    // we return a copy so that the instance keeps a reference to the
+    // dateText that has been matched, and the date that was parsed
+    var tokenizerForMatch: TokenType {
+        return DateToken(text: _dateText, date: _date)
+    }
+
+    // only used by `tokenizerForMatch`
+    private init(text: String, date: Date?) {
+        _dateText = text
+        _date = date
+        _position = text.unicodeScalars.startIndex
     }
 }
 ````
 
-This will match tokens for any text that has the format of three pairs of numbers joined with the '/' character:
+This will match tokens for any text that has the format of three pairs of numbers joined by the '/' character, while ignoring sequences that match the format but don't form a valid date.
+
+Combined with the technique used in the [expressive matching example](Documentation/3. Expressive matching.md) where tokenizing using a single TokenType returns results of the actual type used, we can even access the `Date` object associated with the token.
 
 ````Swift
 import Mustard
 
-let messyInput = "Serial: #YF 1942-b 12/01/27 (Scanned)"
+let messyInput = "Serial: #YF 1942-b 12/01/27 (Scanned) 12/03/27 (Arrived) ref: 99/99/99"
 
-let tokens = messyInput.tokens(from: DateMatch.tokenizer)
-// tokens.count -> 1
+let tokens: [DateToken.Token] = messyInput.tokens()
+// tokens.count -> 2
+// ('99/99/99' is not matched by `DateToken`)
+//
+// first date
 // tokens[0].text -> "12/01/27"
+// tokens[0].tokenizer -> DateToken()
+// tokens[0].tokenizer.date -> Date(2027-12-01 05:00:00 +0000)
+//
+// last date
+// tokens[1].text -> "12/03/27"
+// tokens[1].tokenizer -> DateToken()
+// tokens[1].tokenizer.date -> Date(2027-12-03 05:00:00 +0000)
 ````
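+
+Because each match carries its own copy of the tokenizer (returned by `tokenizerForMatch`
+above), the parsed dates can be used directly. A small sketch building on this example:
+
+````Swift
+let dates: [Date] = tokens.map({ $0.tokenizer.date })
+// dates.count -> 2
+````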
 
 See [FuzzyMatchTokenTests.swift](/Mustard/MustardTests/FuzzyMatchTokenTests.swift) for a unit test that includes fuzzy matching of a literal String, but ignoring certain characters.
diff --git a/Mustard/Mustard/Mustard.swift b/Mustard/Mustard/Mustard.swift
index 57a7ab0..04a141b 100644
--- a/Mustard/Mustard/Mustard.swift
+++ b/Mustard/Mustard/Mustard.swift
@@ -11,11 +11,11 @@ import Foundation
 public extension String {
 
     /// Returns tokens matching a single `TokenType` implied by the generic signature
-    func tokens<T: TokenType>() -> [(tokenType: T, text: String, range: Range<String.Index>)] {
+    func tokens<T: TokenType>() -> [(tokenizer: T, text: String, range: Range<String.Index>)] {
 
         return self.tokens(from: T()).flatMap({
-            if let tokenType = $0.tokenType as? T {
-                return (tokenType: tokenType, text: $0.text, range: $0.range)
+            if let tokenType = $0.tokenizer as? T {
+                return (tokenizer: tokenType, text: $0.text, range: $0.range)
             } else { return nil }
         })
     }
@@ -64,7 +64,7 @@ public extension String {
 
                 // the token could be completed, so will add to matches
                 matches.append(
-                    (tokenType: token,
+                    (tokenizer: token.tokenizerForMatch,
                      text: text[start..<currentIndex],
                      range: start..<currentIndex))
diff --git a/Mustard/Mustard/TokenType.swift b/Mustard/Mustard/TokenType.swift
--- a/Mustard/Mustard/TokenType.swift
+++ b/Mustard/Mustard/TokenType.swift
-public typealias Token = (tokenType: TokenType, text: String, range: Range<String.Index>)
+public typealias Token = (tokenizer: TokenType, text: String, range: Range<String.Index>)
 
 public protocol TokenType {
 
@@ -66,10 +66,19 @@
     /// Initialize an empty instance.
     init()
 
+    /// Returns a new instance of a token that's a copy of the receiver.
+    ///
+    /// The object returned is set as the `tokenizer` element from a call to `tokens()`.
+    /// If the type implements the NSCopying protocol, the default implementation returns the result of
+    /// `copy(with: nil)`; otherwise, returns self.
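+    ///
+    /// Stateless tokenizers can rely on the default implementation; tokenizers that
+    /// accumulate state while matching (like the `DateToken` example) should instead
+    /// return a copy that captures the matched state.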
+    var tokenizerForMatch: TokenType { get }
+
 }
 
 public extension TokenType {
 
+    typealias Token = (tokenizer: Self, text: String, range: Range<String.Index>)
+
     static var tokenizer: TokenType { return Self() }
 
     func canStart(with scalar: UnicodeScalar) -> Bool {
@@ -93,4 +102,13 @@ public extension TokenType {
     }
 
     func prepareForReuse() {}
+
+    var tokenizerForMatch: TokenType {
+        if let copying = self as? NSCopying, let aCopy = copying.copy(with: nil) as? TokenType {
+            return aCopy
+        }
+        else {
+            return self
+        }
+    }
 }
diff --git a/Mustard/MustardTests/CharacterSetTokenTests.swift b/Mustard/MustardTests/CharacterSetTokenTests.swift
index c63e51b..fce4396 100644
--- a/Mustard/MustardTests/CharacterSetTokenTests.swift
+++ b/Mustard/MustardTests/CharacterSetTokenTests.swift
@@ -26,19 +26,19 @@ class CharacterSetTokenTests: XCTestCase {
 
         XCTAssert(tokens.count == 5, "Unexpected number of characterset tokens [\(tokens.count)]")
 
-        XCTAssert(tokens[0].tokenType == CharacterSet.decimalDigits)
+        XCTAssert(tokens[0].tokenizer == CharacterSet.decimalDigits)
         XCTAssert(tokens[0].text == "123")
 
-        XCTAssert(tokens[1].tokenType == CharacterSet.letters)
+        XCTAssert(tokens[1].tokenizer == CharacterSet.letters)
         XCTAssert(tokens[1].text == "Hello")
 
-        XCTAssert(tokens[2].tokenType == CharacterSet.letters)
+        XCTAssert(tokens[2].tokenizer == CharacterSet.letters)
         XCTAssert(tokens[2].text == "world")
 
-        XCTAssert(tokens[3].tokenType == CharacterSet.decimalDigits)
+        XCTAssert(tokens[3].tokenizer == CharacterSet.decimalDigits)
         XCTAssert(tokens[3].text == "45")
 
-        XCTAssert(tokens[4].tokenType == CharacterSet.decimalDigits)
+        XCTAssert(tokens[4].tokenizer == CharacterSet.decimalDigits)
         XCTAssert(tokens[4].text == "67")
     }
diff --git a/Mustard/MustardTests/CustomTokenTests.swift b/Mustard/MustardTests/CustomTokenTests.swift
index 9380373..c52841f 100644
--- a/Mustard/MustardTests/CustomTokenTests.swift
+++ b/Mustard/MustardTests/CustomTokenTests.swift
@@ -40,16 +40,16 @@ class CustomTokenTests: XCTestCase {
 
         XCTAssert(tokens.count == 4, "Unexpected number of tokens [\(tokens.count)]")
 
-        XCTAssert(tokens[0].tokenType is NumberToken)
+        XCTAssert(tokens[0].tokenizer is NumberToken)
         XCTAssert(tokens[0].text == "123")
 
-        XCTAssert(tokens[1].tokenType is WordToken)
+        XCTAssert(tokens[1].tokenizer is WordToken)
         XCTAssert(tokens[1].text == "Hello")
 
-        XCTAssert(tokens[2].tokenType is WordToken)
+        XCTAssert(tokens[2].tokenizer is WordToken)
         XCTAssert(tokens[2].text == "world")
 
-        XCTAssert(tokens[3].tokenType is NumberToken)
+        XCTAssert(tokens[3].tokenizer is NumberToken)
         XCTAssert(tokens[3].text == "45.67")
     }
 }
diff --git a/Mustard/MustardTests/FuzzyMatchTokenTests.swift b/Mustard/MustardTests/FuzzyMatchTokenTests.swift
index 37e0cef..75d3449 100644
--- a/Mustard/MustardTests/FuzzyMatchTokenTests.swift
+++ b/Mustard/MustardTests/FuzzyMatchTokenTests.swift
@@ -17,8 +17,8 @@ func ~= (option: CharacterSet, input: UnicodeScalar) -> Bool {
 class FuzzyLiteralMatch: TokenType {
 
     let target: String
-    let exclusions: CharacterSet
-    var position: String.UnicodeScalarIndex
+    private let exclusions: CharacterSet
+    private var position: String.UnicodeScalarIndex
 
     required convenience init() {
         self.init(target: "", ignoring: CharacterSet.whitespaces)
@@ -76,40 +76,86 @@ class FuzzyLiteralMatch: TokenType {
     }
 }
 
-class DateMatch: TokenType {
+class DateToken: TokenType {
 
-    let template = "00/00/00"
-    var position: String.UnicodeScalarIndex
+    // private properties
+    private let _template = "00/00/00"
+    private var _position: String.UnicodeScalarIndex
+    private var _dateText: String
+    private var _date: Date?
 
+    // public property
+    var date: Date {
+        return _date!
+    }
+
+    // formatters are expensive, so only instantiate once for all DateTokens
+    static let dateFormatter: DateFormatter = {
+        let dateFormatter = DateFormatter()
+        dateFormatter.dateFormat = "MM/dd/yy"
+        return dateFormatter
+    }()
+
+    // called when we access `DateToken.tokenizer`
     required init() {
-        position = template.unicodeScalars.startIndex
+        _position = _template.unicodeScalars.startIndex
+        _dateText = ""
     }
 
     func canTake(_ scalar: UnicodeScalar) -> Bool {
 
-        guard position < template.unicodeScalars.endIndex else {
+        guard _position < _template.unicodeScalars.endIndex else {
            // we've matched all of the template
            return false
        }
 
-        switch (template.unicodeScalars[position], scalar) {
+        switch (_template.unicodeScalars[_position], scalar) {
         case ("\u{0030}", CharacterSet.decimalDigits), // match with a decimal digit
              ("\u{002F}", "\u{002F}"):                 // match with the '/' character
 
-            position = template.unicodeScalars.index(after: position)
+            _position = _template.unicodeScalars.index(after: _position) // increment the template position
+            _dateText.unicodeScalars.append(scalar) // add scalar to text matched so far
             return true
 
         default:
             return false
         }
     }
-    
+
     var isComplete: Bool {
-        return position == template.unicodeScalars.endIndex
+        if _position == _template.unicodeScalars.endIndex,
+            let date = DateToken.dateFormatter.date(from: _dateText) {
+            // we've reached the end of the template
+            // and the date text collected so far represents a valid
+            // date format (e.g. not 99/99/99)
+
+            _date = date
+            return true
+        }
+        else {
+            return false
+        }
     }
 
+    // reset the tokenizer for matching new date
     func prepareForReuse() {
-        position = template.unicodeScalars.startIndex
+        _dateText = ""
+        _date = nil
+        _position = _template.unicodeScalars.startIndex
+    }
+
+    // return an instance of the tokenizer to include in matching tokens
+    // we return a copy so that the instance keeps a reference to the
+    // dateText that has been matched, and the date that was parsed
+    var tokenizerForMatch: TokenType {
+        return DateToken(text: _dateText, date: _date)
+    }
+
+    // only used by `tokenizerForMatch`
+    private init(text: String, date: Date?) {
+        _dateText = text
+        _date = date
+        _position = text.unicodeScalars.startIndex
    }
 }
 
@@ -117,20 +163,40 @@ class FuzzyMatchTokenTests: XCTestCase {
 
     func testSpecialFormat() {
 
-        let messyInput = "Serial: #YF 1942-b 12/01/27 (Scanned)"
+        let messyInput = "Serial: #YF 1942-b 12/01/27 (Scanned) 12/02/27 (Arrived) ref: 99/99/99"
 
         let fuzzyTokenzier = FuzzyLiteralMatch(target: "#YF1942B",
                                                ignoring: CharacterSet.whitespaces.union(.punctuationCharacters))
 
-        let tokens = messyInput.tokens(from: fuzzyTokenzier, DateMatch.tokenizer)
+        let tokens = messyInput.tokens(from: fuzzyTokenzier, DateToken.tokenizer)
+
+        for token in tokens {
+            if let dateToken = token.tokenizer as? DateToken {
+                print(" - token.date: '\(dateToken.date)'")
+            }
+        }
 
-        XCTAssert(tokens.count == 2, "Unexpected number of tokens [\(tokens.count)]")
+        XCTAssert(tokens.count == 3, "Unexpected number of tokens [\(tokens.count)]")
 
-        XCTAssert(tokens[0].tokenType is FuzzyLiteralMatch)
+        XCTAssert(tokens[0].tokenizer is FuzzyLiteralMatch)
         XCTAssert(tokens[0].text == "#YF 1942-b")
 
-        XCTAssert(tokens[1].tokenType is DateMatch)
+        XCTAssert(tokens[1].tokenizer is DateToken)
         XCTAssert(tokens[1].text == "12/01/27")
     }
+
+    func testDateMatches() {
+
+        let messyInput = "Serial: #YF 1942-b 12/01/27 (Scanned) 12/02/27 (Arrived) ref: 99/99/99"
+
+        let tokens: [DateToken.Token] = messyInput.tokens()
+
+        for token in tokens {
+            print(" - token.date: '\(token.tokenizer.date)'")
+        }
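+
+        // a sketch of an assertion consistent with the behavior documented above:
+        // the two valid dates match, while 99/99/99 fails `isComplete`
+        XCTAssert(tokens.count == 2, "Unexpected number of tokens [\(tokens.count)]")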
+
+    }
 }
+
+
diff --git a/Mustard/MustardTests/LiteralTokenTests.swift b/Mustard/MustardTests/LiteralTokenTests.swift
index 5918627..783d6cc 100644
--- a/Mustard/MustardTests/LiteralTokenTests.swift
+++ b/Mustard/MustardTests/LiteralTokenTests.swift
@@ -87,10 +87,10 @@ class LiteralTokenTests: XCTestCase {
 
         XCTAssert(tokens.count == 2, "Unexpected number of tokens [\(tokens.count)]")
 
-        XCTAssert(tokens[0].tokenType is LiteralToken)
+        XCTAssert(tokens[0].tokenizer is LiteralToken)
         XCTAssert(tokens[0].text == "cat")
 
-        XCTAssert(tokens[1].tokenType is LiteralToken)
+        XCTAssert(tokens[1].tokenizer is LiteralToken)
         XCTAssert(tokens[1].text == "duck")
 
         print(tokens.count)
diff --git a/Mustard/MustardTests/MixedTokenTests.swift b/Mustard/MustardTests/MixedTokenTests.swift
index 43d2064..ed96549 100644
--- a/Mustard/MustardTests/MixedTokenTests.swift
+++ b/Mustard/MustardTests/MixedTokenTests.swift
@@ -9,8 +9,6 @@ import XCTest
 import Mustard
 
-typealias MixedMatch = (tokenType: MixedToken, text: String, range: Range<String.Index>)
-
 enum MixedToken: TokenType {
 
     case word
@@ -57,32 +55,32 @@ class MixedTokenTests: XCTestCase {
 
     func testMixedTokens() {
 
-        let tokens: [MixedMatch] = "123👩‍👩‍👦‍👦Hello world👶again👶🏿45.67".tokens()
+        let tokens: [MixedToken.Token] = "123👩‍👩‍👦‍👦Hello world👶again👶🏿45.67".tokens()
 
         XCTAssert(tokens.count == 8, "Unexpected number of tokens [\(tokens.count)]")
 
-        XCTAssert(tokens[0].tokenType == .number)
+        XCTAssert(tokens[0].tokenizer == .number)
         XCTAssert(tokens[0].text == "123")
 
-        XCTAssert(tokens[1].tokenType == .emoji)
+        XCTAssert(tokens[1].tokenizer == .emoji)
         XCTAssert(tokens[1].text == "👩‍👩‍👦‍👦")
 
-        XCTAssert(tokens[2].tokenType == .word)
+        XCTAssert(tokens[2].tokenizer == .word)
         XCTAssert(tokens[2].text == "Hello")
 
-        XCTAssert(tokens[3].tokenType == .word)
+        XCTAssert(tokens[3].tokenizer == .word)
         XCTAssert(tokens[3].text == "world")
 
-        XCTAssert(tokens[4].tokenType == .emoji)
+        XCTAssert(tokens[4].tokenizer == .emoji)
         XCTAssert(tokens[4].text == "👶")
 
-        XCTAssert(tokens[5].tokenType == .word)
+        XCTAssert(tokens[5].tokenizer == .word)
         XCTAssert(tokens[5].text == "again")
 
-        XCTAssert(tokens[6].tokenType == .emoji)
+        XCTAssert(tokens[6].tokenizer == .emoji)
         XCTAssert(tokens[6].text == "👶🏿")
 
-        XCTAssert(tokens[7].tokenType == .number)
+        XCTAssert(tokens[7].tokenizer == .number)
         XCTAssert(tokens[7].text == "45.67")
     }
diff --git a/README.md b/README.md
index c8de911..869b6be 100644
--- a/README.md
+++ b/README.md
@@ -16,26 +16,52 @@ let messy = "123Hello world&^45.67"
 let tokens = messy.tokens(from: .decimalDigits, .letters)
 
 // tokens.count -> 5
-// tokens: [(tokenType: TokenType, text: String, range: Range<String.Index>)]
-// tokens is an array tuples that contains the TokenType that matched the token,
-// the actual text that was matched, and the range of the token in the original input.
+// tokens: [(tokenizer: TokenType, text: String, range: Range<String.Index>)]
+// tokens is an array of tuples. Each tuple contains an instance of the TokenType that
+// matched the token, the actual text that was matched, and the range of the token
+// in the original input.
 //
 // second token..
-// tokens[1].tokenType -> CharacterSet.letters
+// tokens[1].tokenizer -> CharacterSet.letters
 // tokens[1].text -> "Hello"
 // tokens[1].range -> Range(3..<8)
 //
 // last token..
-// tokens[4].tokenType -> CharacterSet.decimalDigits
+// tokens[4].tokenizer -> CharacterSet.decimalDigits
 // tokens[4].text -> "67"
 // tokens[4].range -> Range(19..<21)
 ````
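+
+Since each token's `range` indexes into the original string, the matched text can also
+be recovered from it directly. A small sketch (using the Swift 3 `substring(with:)` API):
+
+````Swift
+let hello = messy.substring(with: tokens[1].range)
+// hello -> "Hello"
+````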
 
+By creating objects that implement the `TokenType` protocol, we can create
+more advanced tokenizers. Here's some usage of a `DateToken` type that matches tokens
+with a valid `MM/dd/yy` format, and also exposes a `date` property to access the
+corresponding `Date` object.
+
+````Swift
+import Mustard
+
+let messyInput = "Serial: #YF 1942-b 12/01/27 (Scanned) 12/03/27 (Arrived) ref: 99/99/99"
+
+let tokens: [DateToken.Token] = messyInput.tokens()
+// tokens.count -> 2
+// ('99/99/99' is *not* matched by `DateToken`)
+//
+// first date
+// tokens[0].text -> "12/01/27"
+// tokens[0].tokenizer -> DateToken()
+// tokens[0].tokenizer.date -> Date(2027-12-01 05:00:00 +0000)
+//
+// last date
+// tokens[1].text -> "12/03/27"
+// tokens[1].tokenizer -> DateToken()
+// tokens[1].tokenizer.date -> Date(2027-12-03 05:00:00 +0000)
+````
+
 ## More information
 
 - [TallyType protocol: implementing your own tokenizer](Documentation/1. TallyType protocol.md)
 - [Example: matching emoji](Documentation/2. Matching emoji.md)
-- [Example: expressive matching using enums](Documentation/3. Expressive matching using enums.md)
+- [Example: expressive matching](Documentation/3. Expressive matching.md)
 - [Example: literal and template matching using tokens with internal state](Documentation/4. Tokens with internal state.md)
 
 ## Todo (0.1)