
Commit

Update to allow tokens with internal state to be returned as token results.

mathewsanders committed Jan 1, 2017
1 parent 4ae7667 commit 8c48f8a
Showing 11 changed files with 255 additions and 83 deletions.
2 changes: 1 addition & 1 deletion Documentation/1. TallyType protocol.md
@@ -5,7 +5,7 @@ You can create your own tokenizers by implementing the `TallyType` protocol.
````Swift
/// A tuple capturing information about a token match.
///
-/// - tokenType: The instance of `TokenType` that matched the token.
+/// - tokenizer: The instance of `TokenType` that matched the token.
/// - text: The text that the token matched.
/// - range: The range of the matched text in the original input.
public typealias Token = (tokenType: TokenType, text: String, range: Range<String.Index>)
Documentation/3. Expressive matching.md
@@ -1,11 +1,10 @@
-# Example: expressive matching using enums
+# Example: expressive matching

tokens(from:) -> [Token]

-The results returned by `tokens(from:)`returns an array of `Token` where `Token` is a typealias of the tuple `(tokenType: TokenType, text: String, range: Range<String.Index>)`
+`tokens(from:)` returns an array of `Token`, where `Token` is a typealias of the tuple `(tokenizer: TokenType, text: String, range: Range<String.Index>)`.

-Which requires either type casting (using `as?`) type checking or type checking (using `is`) for the `tokenType` element to be useful:
+This requires either type casting (using `as?`) or type checking (using `is`) for the `tokenizer` element to be useful:

````Swift
import Mustard
@@ -14,31 +13,28 @@ let messy = "123Hello world&^45.67"
let tokens = messy.tokens(from: .decimalDigits, .letters)

// using type checking
-if tokens[0].tokenType is EmojiToken {
+if tokens[0].tokenizer is EmojiToken {
    print("found emoji token")
}

// using type casting
-if let _ = tokens[0].tokenType as? NumberToken {
+if let _ = tokens[0].tokenizer as? NumberToken {
    print("found number token")
}

````

-This can lead to bugs in your logic-- in the example above neither of the print statements will be executed since the TokenType used is actually the CharacterSet `extension`.
+This can lead to bugs in your logic: in the example above, neither of the print statements will be executed, since the tokenizers used were actually the character sets `.decimalDigits` and `.letters`.

-Mustard can return a strongly typed set of token matches if a single `TokenType` is used.
+Mustard can return a strongly typed set of matches if a single `TokenType` is used.

````Swift
import Mustard

let messy = "123Hello world&^45.67"

-// create typealias using the single `TokenType` to use
-typealias NumberTokenMatch = (tokenType: NumberToken, text: String, range: Range<String.Index>)
-
// call `tokens()` method on string to get matching tokens from string
-let numberTokens: [NumberTokenMatch] = messy.tokens()
+let numberTokens: [NumberToken.Token] = messy.tokens()

````
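
(Editorial aside: `NumberToken` isn't defined in this excerpt. A minimal sketch of such a tokenizer, hypothetical and not part of this commit, could accept decimal digits and a period, leaning on the default implementations that the `TokenType` extension provides.)

````Swift
import Foundation
import Mustard

// Hypothetical sketch of a NumberToken (not part of this commit).
// Accepts decimal digits and a period, and relies on the default
// implementations provided by the TokenType protocol extension.
struct NumberToken: TokenType {
    func canTake(_ scalar: UnicodeScalar) -> Bool {
        return CharacterSet.decimalDigits.contains(scalar) || scalar == "."
    }
}
````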

@@ -93,19 +89,29 @@ enum MixedToken: TokenType {
}
````

-Then use Mustard with this single `MixedType`, and the the `tokenType` element in matches can be
-used without any casting, and the complier will give you an error if you attempt to use a type
-that doesn't make sense.
+Mustard defines a default typealias for `Token` that exposes the specific type in the
+results tuple.

+````Swift
+public extension TokenType {
+    typealias Token = (tokenizer: Self, text: String, range: Range<String.Index>)
+}
+````

+Setting your results array to this type gives you the option to use the shorter `tokens()` method,
+where Mustard uses the inferred type to perform tokenization.
+
+Since the matches array is strongly typed, you can be more expressive with the results, and the
+compiler can give you more hints to prevent you from making mistakes.

````Swift
-// define your own type alias for your enum-based TokenType
-typealias MixedMatch = (tokenType: MixedToken, text: String, range: Range<String.Index>)
-
// use the `tokens()` method to grab tokens
-let matches: [MixedMatch] = "123👩‍👩‍👦‍👦Hello world👶 again👶🏿 45.67".tokens()
+let matches: [MixedToken.Token] = "123👩‍👩‍👦‍👦Hello world👶 again👶🏿 45.67".tokens()
// matches.count -> 8

matches.forEach({ match in
-    switch (match.token, match.text) {
+    switch (match.tokenizer, match.text) {
    case (.word, let word): print("word:", word)
    case (.number, let number): print("number:", number)
    case (.emoji, let emoji): print("emoji:", emoji)
84 changes: 71 additions & 13 deletions Documentation/4. Tokens with internal state.md
@@ -108,27 +108,45 @@ func ~= (option: CharacterSet, input: UnicodeScalar) -> Bool {
    return option.contains(input)
}

-class DateMatch: TokenType {
+class DateToken: TokenType {

-    let template = "00/00/00"
-    var position: String.UnicodeScalarIndex
+    // private properties
+    private let _template = "00/00/00"
+    private var _position: String.UnicodeScalarIndex
+    private var _dateText: String
+    private var _date: Date?
+
+    // public property
+    var date: Date {
+        return _date!
+    }
+
+    // formatters are expensive, so only instantiate once for all DateTokens
+    static let dateFormatter: DateFormatter = {
+        let dateFormatter = DateFormatter()
+        dateFormatter.dateFormat = "MM/dd/yy"
+        return dateFormatter
+    }()

+    // called when we access `DateToken.tokenizer`
    required init() {
-        position = template.unicodeScalars.startIndex
+        _position = _template.unicodeScalars.startIndex
+        _dateText = ""
    }

    func canTake(_ scalar: UnicodeScalar) -> Bool {

-        guard position < template.unicodeScalars.endIndex else {
+        guard _position < _template.unicodeScalars.endIndex else {
            // we've matched all of the template
            return false
        }

-        switch (template.unicodeScalars[position], scalar) {
+        switch (_template.unicodeScalars[_position], scalar) {
        case ("\u{0030}", CharacterSet.decimalDigits), // match with a decimal digit
             ("\u{002F}", "\u{002F}"): // match with the '/' character

-            position = template.unicodeScalars.index(after: position)
+            _position = _template.unicodeScalars.index(after: _position) // increment the template position
+            _dateText.unicodeScalars.append(scalar) // add scalar to text matched so far
            return true

        default:
@@ -137,25 +155,65 @@ class DateMatch: TokenType {
    }

    var isComplete: Bool {
-        return position == template.unicodeScalars.endIndex
+        if _position == _template.unicodeScalars.endIndex,
+            let date = DateToken.dateFormatter.date(from: _dateText) {
+            // we've reached the end of the template
+            // and the date text collected so far represents a valid
+            // date (e.g. not 99/99/99)
+
+            _date = date
+            return true
+        }
+        else {
+            return false
+        }
    }

    // reset the tokenizer for matching a new date
    func prepareForReuse() {
-        position = template.unicodeScalars.startIndex
+        _dateText = ""
+        _date = nil
+        _position = _template.unicodeScalars.startIndex
    }

+    // return a copy of the tokenizer to include in the matched token,
+    // so that the instance keeps a reference to the dateText that was
+    // matched, and the date that was parsed
+    var tokenizerForMatch: TokenType {
+        return DateToken(text: _dateText, date: _date)
+    }
+
+    // only used by `tokenizerForMatch`
+    private init(text: String, date: Date?) {
+        _dateText = text
+        _date = date
+        _position = text.unicodeScalars.startIndex
+    }
}
````

-This will match tokens for any text that has the format of three pairs of numbers joined with the '/' character:
+This will match tokens for any text that has the format of three pairs of numbers joined with the '/' character, but will ignore sequences that match that format yet don't form a valid date.

Combined with the technique used in the [expressive matching example](Documentation/3. Expressive matching.md) where tokenizing using a single TokenType returns results of the actual type used, we can even access the `Date` object associated with the token.

````Swift
import Mustard

-let messyInput = "Serial: #YF 1942-b 12/01/27 (Scanned)"
+let messyInput = "Serial: #YF 1942-b 12/01/27 (Scanned) 12/03/27 (Arrived) ref: 99/99/99"

-let tokens = messyInput.tokens(from: DateMatch.tokenizer)
-// tokens.count -> 1
+let tokens: [DateToken.Token] = messyInput.tokens()
+// tokens.count -> 2
+// ('99/99/99' is not matched by `DateToken`)
+//
+// first date
+// tokens[0].text -> "12/01/27"
+// tokens[0].tokenizer -> DateToken()
+// tokens[0].tokenizer.date -> Date(2027-12-01 05:00:00 +0000)
+//
+// last date
+// tokens[1].text -> "12/03/27"
+// tokens[1].tokenizer -> DateToken()
+// tokens[1].tokenizer.date -> Date(2027-12-03 05:00:00 +0000)
````

See [FuzzyMatchTokenTests.swift](/Mustard/MustardTests/FuzzyMatchTokenTests.swift) for a unit test that includes fuzzy matching of a literal String, but ignoring certain characters.
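
(A note on why `tokenizerForMatch` returns a copy: Mustard reuses a single tokenizer instance for the whole scan, calling `prepareForReuse()` between matches, so returning `self` would leave every match's `tokenizer` element pointing at one instance that only holds the state of the final match. A hypothetical illustration, assuming the `DateToken` above:)

````Swift
import Mustard

// Hypothetical illustration (not from the commit). Because
// `tokenizerForMatch` returns a copy, each element below keeps a
// snapshot of the matched text and parsed date; if it returned `self`,
// both elements would share one DateToken holding only the last date.
let dates: [DateToken.Token] = "12/01/27 12/03/27".tokens()
// dates[0].tokenizer.date -> December 1, 2027
// dates[1].tokenizer.date -> December 3, 2027
````
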
8 changes: 4 additions & 4 deletions Mustard/Mustard/Mustard.swift
@@ -11,11 +11,11 @@ import Foundation
public extension String {

    /// Returns tokens matching a single `TokenType` implied by the generic signature
-    func tokens<T: TokenType>() -> [(tokenType: T, text: String, range: Range<String.Index>)] {
+    func tokens<T: TokenType>() -> [(tokenizer: T, text: String, range: Range<String.Index>)] {

        return self.tokens(from: T()).flatMap({
-            if let tokenType = $0.tokenType as? T {
-                return (tokenType: tokenType, text: $0.text, range: $0.range)
+            if let tokenType = $0.tokenizer as? T {
+                return (tokenizer: tokenType, text: $0.text, range: $0.range)
            }
            else { return nil }
        })
@@ -64,7 +64,7 @@ public extension String {
            // the token could be completed, so will add to matches

            matches.append(
-                (tokenType: token,
+                (tokenizer: token.tokenizerForMatch,
                 text: text[start..<next],
                 range: start..<next)
            )
20 changes: 19 additions & 1 deletion Mustard/Mustard/TokenType.swift
@@ -13,7 +13,7 @@ import Foundation
/// - tokenType: The instance of `TokenType` that matched the token.
/// - text: The text that the token matched.
/// - range: The range of the matched text in the original input.
-public typealias Token = (tokenType: TokenType, text: String, range: Range<String.Index>)
+public typealias Token = (tokenizer: TokenType, text: String, range: Range<String.Index>)

public protocol TokenType {

@@ -66,10 +66,19 @@ public protocol TokenType {
    /// Initialize an empty instance.
    init()

+    /// Returns a new instance of a token that's a copy of the receiver.
+    ///
+    /// The object returned is set as the `tokenizer` element from a call to `tokens()`.
+    /// If the type implements the `NSCopying` protocol, the default implementation returns
+    /// the result of `copy(with: nil)`; otherwise, it returns `self`.
+    var tokenizerForMatch: TokenType { get }
+
}

public extension TokenType {

+    typealias Token = (tokenizer: Self, text: String, range: Range<String.Index>)
+
    static var tokenizer: TokenType { return Self() }

    func canStart(with scalar: UnicodeScalar) -> Bool {
@@ -93,4 +102,13 @@ public extension TokenType {
    }

    func prepareForReuse() {}

+    var tokenizerForMatch: TokenType {
+        if let copying = self as? NSCopying, let aCopy = copying.copy(with: nil) as? TokenType {
+            return aCopy
+        }
+        else {
+            return self
+        }
+    }
}
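
(For the `NSCopying` branch of the default implementation above: a stateful, class-based tokenizer could opt into per-match snapshots by conforming to `NSCopying` rather than implementing `tokenizerForMatch` itself. A hypothetical sketch, not part of this commit, assuming the `TokenType` protocol above:)

````Swift
import Foundation

// Hypothetical example of a tokenizer that adopts NSCopying so the
// default `tokenizerForMatch` returns a snapshot copy for each match
// instead of the shared, reused instance.
final class CapturingWordToken: TokenType, NSCopying {

    var matchedText = ""

    required init() {}

    func canTake(_ scalar: UnicodeScalar) -> Bool {
        guard CharacterSet.letters.contains(scalar) else { return false }
        matchedText.unicodeScalars.append(scalar)
        return true
    }

    func prepareForReuse() {
        matchedText = ""
    }

    func copy(with zone: NSZone?) -> Any {
        let aCopy = CapturingWordToken()
        aCopy.matchedText = matchedText
        return aCopy
    }
}
````
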
10 changes: 5 additions & 5 deletions Mustard/MustardTests/CharacterSetTokenTests.swift
@@ -26,19 +26,19 @@ class CharacterSetTokenTests: XCTestCase {

        XCTAssert(tokens.count == 5, "Unexpected number of characterset tokens [\(tokens.count)]")

-        XCTAssert(tokens[0].tokenType == CharacterSet.decimalDigits)
+        XCTAssert(tokens[0].tokenizer == CharacterSet.decimalDigits)
        XCTAssert(tokens[0].text == "123")

-        XCTAssert(tokens[1].tokenType == CharacterSet.letters)
+        XCTAssert(tokens[1].tokenizer == CharacterSet.letters)
        XCTAssert(tokens[1].text == "Hello")

-        XCTAssert(tokens[2].tokenType == CharacterSet.letters)
+        XCTAssert(tokens[2].tokenizer == CharacterSet.letters)
        XCTAssert(tokens[2].text == "world")

-        XCTAssert(tokens[3].tokenType == CharacterSet.decimalDigits)
+        XCTAssert(tokens[3].tokenizer == CharacterSet.decimalDigits)
        XCTAssert(tokens[3].text == "45")

-        XCTAssert(tokens[4].tokenType == CharacterSet.decimalDigits)
+        XCTAssert(tokens[4].tokenizer == CharacterSet.decimalDigits)
        XCTAssert(tokens[4].text == "67")

    }
8 changes: 4 additions & 4 deletions Mustard/MustardTests/CustomTokenTests.swift
@@ -40,16 +40,16 @@ class CustomTokenTests: XCTestCase {

        XCTAssert(tokens.count == 4, "Unexpected number of tokens [\(tokens.count)]")

-        XCTAssert(tokens[0].tokenType is NumberToken)
+        XCTAssert(tokens[0].tokenizer is NumberToken)
        XCTAssert(tokens[0].text == "123")

-        XCTAssert(tokens[1].tokenType is WordToken)
+        XCTAssert(tokens[1].tokenizer is WordToken)
        XCTAssert(tokens[1].text == "Hello")

-        XCTAssert(tokens[2].tokenType is WordToken)
+        XCTAssert(tokens[2].tokenizer is WordToken)
        XCTAssert(tokens[2].text == "world")

-        XCTAssert(tokens[3].tokenType is NumberToken)
+        XCTAssert(tokens[3].tokenizer is NumberToken)
        XCTAssert(tokens[3].text == "45.67")
    }
}