Merge code changes from upstream r650

sparkplug · Feb 28, 2014
commit 966a707 · 1 parent a507305

Showing 6 changed files with 147 additions and 96 deletions.
diff --git a/python/phonenumbers/asyoutypeformatter.py b/python/phonenumbers/asyoutypeformatter.py
@@ -229,7 +229,7 @@ def _clear(self):
         self._should_add_space_after_national_prefix = False
         # This contains the national prefix that has been extracted. It
         # contains only digits without formatting.
-        self._national_prefix_extracted = U_EMPTY_STRING
+        self._extracted_national_prefix = U_EMPTY_STRING
         self._national_number = U_EMPTY_STRING
         # This indicates whether AsYouTypeFormatter is currently doing the
         # formatting.
@@ -328,7 +328,7 @@ def input_digit(self, next_char, remember_position=False):
                 self._is_expecting_country_calling_code = True
             else:
                 # No IDD or plus sign is found, might be entering in national format.
-                self._national_prefix_extracted = self._remove_national_prefix_from_national_number()
+                self._extracted_national_prefix = self._remove_national_prefix_from_national_number()
                 self._current_output = self._attempt_to_choose_formatting_pattern()
                 return self._current_output
         if self._is_expecting_country_calling_code:
@@ -370,17 +370,17 @@ def _attempt_to_choose_pattern_with_prefix_extracted(self):
     # shorter NDD doesn't result in a number we can format, we try to see if
     # we can extract a longer version here.
     def _able_to_extract_longer_ndd(self):
-        if len(self._national_prefix_extracted) > 0:
+        if len(self._extracted_national_prefix) > 0:
             # Put the extracted NDD back to the national number before
             # attempting to extract a new NDD.
-            self._national_number = self._national_prefix_extracted + self._national_number
+            self._national_number = self._extracted_national_prefix + self._national_number
             # Remove the previously extracted NDD from
             # prefixBeforeNationalNumber. We cannot simply set it to empty
             # string because people sometimes incorrectly enter national
             # prefix after the country code, e.g. +44 (0)20-1234-5678.
-            index_of_previous_ndd = self._prefix_before_national_number.rfind(self._national_prefix_extracted)
+            index_of_previous_ndd = self._prefix_before_national_number.rfind(self._extracted_national_prefix)
             self._prefix_before_national_number = self._prefix_before_national_number[:index_of_previous_ndd]
-        return self._national_prefix_extracted != self._remove_national_prefix_from_national_number()
+        return self._extracted_national_prefix != self._remove_national_prefix_from_national_number()
 
     def _is_digit_or_leading_plus_sign(self, next_char):
         return (next_char.isdigit() or
@@ -548,6 +548,9 @@ def _attempt_to_extract_ccc(self):
 
         self._prefix_before_national_number += str(country_code)
         self._prefix_before_national_number += _SEPARATOR_BEFORE_NATIONAL_NUMBER
+        # When we have successfully extracted the IDD, the previously
+        # extracted NDD should be cleared because it is no longer valid.
+        self._extracted_national_prefix = U_EMPTY_STRING
         return True
 
     def _normalize_and_accrue_digits_and_plus_sign(self, next_char, remember_position):

diff --git a/python/phonenumbers/phonenumbermatcher.py b/python/phonenumbers/phonenumbermatcher.py
@@ -138,13 +138,36 @@ def _limit(lower, upper):
 _TIME_STAMPS = re.compile(u("[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d [0-2]\\d$"))
 _TIME_STAMPS_SUFFIX = re.compile(u(":[0-5]\\d"))
 
-# Matches white-space, which may indicate the end of a phone number and the
-# start of something else (such as a neighbouring zip-code). If white-space is
-# found, continues to match all characters that are not typically used to
-# start a phone number.
-_GROUP_SEPARATOR = re.compile(u("(?u)\\s") +  # Unicode Separator, \p{Z}
-                              u("[^") + _LEAD_CLASS_CHARS +
-                              u("\\d]*"))  # Unicode Decimal Digit Number, \p{Nd}
+# Patterns used to extract phone numbers from a larger phone-number-like
+# pattern. These are ordered according to specificity. For example,
+# white-space is last since that is frequently used in numbers, not just to
+# separate two numbers. We have separate patterns since we don't want to break
+# up the phone-number-like text on more than one different kind of symbol at
+# one time, although symbols of the same type (e.g. space) can be safely
+# grouped together.
+#
+# Note that if there is a match, we will always check any text found up to the
+# first match as well.
+_INNER_MATCHES = (
+     # Breaks on the slash - e.g. "651-234-2345/332-445-1234"
+     re.compile(u("/+(.*)")),
+     # Note that the bracket here is inside the capturing group, since we
+     # consider it part of the phone number. Will match a pattern like "(650)
+     # 223 3345 (754) 223 3321".
+     re.compile(u("(\\([^(]*)")),
+     # Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number."  We
+     # require white-space on at least one side of the hyphen for it to be
+     # a separator. NOTE(review): Python's re lacks \p{Z}; presumably \s - confirm.
+     re.compile(u("(?u)(?:\\p{Z}-|-\\s)\\s*(.+)")),
+     # Various types of wide hyphens. Note we have decided not to enforce a
+     # space here, since it's possible that it's supposed to be used to break
+     # two numbers without spaces, and we haven't seen many instances of it
+     # used within a number.
+     re.compile(u("(?u)[\u2012-\u2015\uFF0D]\\s*(.+)")),
+     # Breaks on a full stop - e.g. "12345. 332-445-1234 is my number."
+     re.compile(u("(?u)\\.+\\s*([^.]+)")),
+     # Breaks on space - e.g. "3324451234 8002341234"
+     re.compile(u("(?u)\\s+(\\S+)")))
 
 
 class Leniency(object):
@@ -553,8 +576,7 @@ def _extract_match(self, candidate, offset):
         """
         # Skip a match that is more likely a publication page reference or a
         # date.
-        if (_PUB_PAGES.search(candidate) or
-            _SLASH_SEPARATED_DATES.search(candidate)):
+        if (_SLASH_SEPARATED_DATES.search(candidate)):
             return None
 
         # Skip potential time-stamps.
@@ -581,50 +603,26 @@ def _extract_inner_match(self, candidate, offset):
         offset -- The current offset of candidate within text
         Returns the match found, None if none can be found
         """
-        # Try removing either the first or last "group" in the number and see
-        # if this gives a result.  We consider white space to be a possible
-        # indication of the start or end of the phone number.
-        group_match = _GROUP_SEPARATOR.search(candidate)
-        if group_match:
-            # Try the first group by itself.
-            first_group_only = candidate[:group_match.start()]
-            first_group_only = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
-                                                            first_group_only)
-            match = self._parse_and_verify(first_group_only, offset)
-            if match is not None:
-                return match
-            self._max_tries -= 1
-
-            without_first_group_start = group_match.end()
-            # Try the rest of the candidate without the first group.
-            without_first_group = candidate[without_first_group_start:]
-            without_first_group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
-                                                               without_first_group)
-            match = self._parse_and_verify(without_first_group, offset + without_first_group_start)
-            if match is not None:
-                return match
-            self._max_tries -= 1
-
-            if self._max_tries > 0:
-                last_group_start = without_first_group_start
-                group_match = _GROUP_SEPARATOR.search(candidate, last_group_start)
-                while group_match:
-                    # Find the last group.
-                    last_group_start = group_match.start()
-                    group_match = _GROUP_SEPARATOR.search(candidate, group_match.end())
-                without_last_group = candidate[:last_group_start]
-                without_last_group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
-                                                                  without_last_group)
-                if without_last_group == first_group_only:
-                    # If there are only two groups, then the group "without
-                    # the last group" is the same as the first group. In these
-                    # cases, we don't want to re-check the number group, so we
-                    # exit already.
-                    return None
-                match = self._parse_and_verify(without_last_group, offset)
+        for possible_inner_match in _INNER_MATCHES:
+            group_match = possible_inner_match.search(candidate)
+            is_first_match = True
+            while group_match and self._max_tries > 0:
+                if is_first_match:
+                    # We should handle any group before this one too.
+                    group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
+                                                         candidate[:group_match.start()])
+                    match = self._parse_and_verify(group, offset)
+                    if match is not None:
+                        return match
+                    self._max_tries -= 1
+                    is_first_match = False
+                group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
+                                                     group_match.group(1))
+                match = self._parse_and_verify(group, offset + group_match.start(1))
                 if match is not None:
                     return match
                 self._max_tries -= 1
+                group_match = possible_inner_match.search(candidate, group_match.start() + 1)
         return None
 
     def _parse_and_verify(self, candidate, offset):
@@ -640,7 +638,7 @@ def _parse_and_verify(self, candidate, offset):
         try:
             # Check the candidate doesn't contain any formatting which would
             # indicate that it really isn't a phone number.
-            if not fullmatch(_MATCHING_BRACKETS, candidate):
+            if (not fullmatch(_MATCHING_BRACKETS, candidate) or _PUB_PAGES.search(candidate)):
                 return None
 
             # If leniency is set to VALID or stricter, we also want to skip

diff --git a/python/phonenumbers/phonenumberutil.py b/python/phonenumbers/phonenumberutil.py
@@ -10,7 +10,6 @@
 http://www.iso.org/iso/country_codes/iso_3166_code_lists/country_names_and_code_elements.htm
 
 author: Shaopeng Jia (original Java version)
-author: Lara Rennie (original Java Version)
 author: David Drysdale (Python version)
 """
 # Based on original Java code:
@@ -73,7 +72,7 @@
 _MIN_LENGTH_FOR_NSN = 2
 # The ITU says the maximum length should be 15, but we have found longer
 # numbers in Germany.
-_MAX_LENGTH_FOR_NSN = 16
+_MAX_LENGTH_FOR_NSN = 17
 # The maximum length of the country calling code.
 _MAX_LENGTH_COUNTRY_CODE = 3
 # We don't allow input strings for parsing to be longer than 250 chars. This

diff --git a/python/tests/asyoutypetest.py b/python/tests/asyoutypetest.py
@@ -1104,6 +1104,40 @@ def testAYTFShortNumberFormattingFix_US(self):
         self.assertEqual("12", formatter.input_digit('2'))
         self.assertEqual("1 22", formatter.input_digit('2'))
 
+    def testAYTFClearNDDAfterIDDExtraction(self):
+        formatter = AsYouTypeFormatter("KR")
+
+        # Check that when we have successfully extracted an IDD, the
+        # previously extracted NDD is cleared since it is no longer valid.
+        self.assertEqual("0", formatter.input_digit('0'))
+        self.assertEqual("00", formatter.input_digit('0'))
+        self.assertEqual("007", formatter.input_digit('7'))
+        self.assertEqual("0070", formatter.input_digit('0'))
+        self.assertEqual("00700", formatter.input_digit('0'))
+        self.assertEqual("0", formatter._extracted_national_prefix)
+
+        # Once the IDD "00700" has been extracted, it no longer makes sense
+        # for the initial "0" to be treated as an NDD.
+        self.assertEqual("00700 1 ", formatter.input_digit('1'))
+        self.assertEqual("", formatter._extracted_national_prefix)
+
+        self.assertEqual("00700 1 2", formatter.input_digit('2'))
+        self.assertEqual("00700 1 23", formatter.input_digit('3'))
+        self.assertEqual("00700 1 234", formatter.input_digit('4'))
+        self.assertEqual("00700 1 234 5", formatter.input_digit('5'))
+        self.assertEqual("00700 1 234 56", formatter.input_digit('6'))
+        self.assertEqual("00700 1 234 567", formatter.input_digit('7'))
+        self.assertEqual("00700 1 234 567 8", formatter.input_digit('8'))
+        self.assertEqual("00700 1 234 567 89", formatter.input_digit('9'))
+        self.assertEqual("00700 1 234 567 890", formatter.input_digit('0'))
+        self.assertEqual("00700 1 234 567 8901", formatter.input_digit('1'))
+        self.assertEqual("00700123456789012", formatter.input_digit('2'))
+        self.assertEqual("007001234567890123", formatter.input_digit('3'))
+        self.assertEqual("0070012345678901234", formatter.input_digit('4'))
+        self.assertEqual("00700123456789012345", formatter.input_digit('5'))
+        self.assertEqual("007001234567890123456", formatter.input_digit('6'))
+        self.assertEqual("0070012345678901234567", formatter.input_digit('7'))
+
     def testAYTFShortNumberFormatting_AR(self):
         # Python version extra test: use real metadata
         formatter = AsYouTypeFormatter("AR")

diff --git a/python/tests/phonenumbermatchertest.py b/python/tests/phonenumbermatchertest.py
@@ -283,7 +283,8 @@ def testFindNationalNumber(self):
 
         self.doTestFindInContext("64(0)64123456", "NZ")
         # Check that using a "/" is fine in a phone number.
-        self.doTestFindInContext("123/45678", "DE")
+        # Note that real Polish numbers do *not* start with a 0.
+        self.doTestFindInContext("0123/456789", "PL")
         self.doTestFindInContext("123-456-7890", "US")
 
     # See PhoneNumberUtilTest.testParseWithInternationalPrefixes().
@@ -418,34 +419,53 @@ def testIntermediateParsePositions(self):
         for ii in range(8, 20):
             self.assertEqualRange(text, ii, 19, 28)
 
+    def testFourMatchesInARow(self):
+        number1 = "415-666-7777"
+        number2 = "800-443-1223"
+        number3 = "212-443-1223"
+        number4 = "650-443-1223"
+        text = number1 + " - " + number2 + " - " + number3 + " - " + number4
+
+        matcher = PhoneNumberMatcher(text, "US")
+        match = matcher.next() if matcher.has_next() else None
+        self.assertMatchProperties(match, text, number1, "US")
+
+        match = matcher.next() if matcher.has_next() else None
+        self.assertMatchProperties(match, text, number2, "US")
+
+        match = matcher.next() if matcher.has_next() else None
+        self.assertMatchProperties(match, text, number3, "US")
+
+        match = matcher.next() if matcher.has_next() else None
+        self.assertMatchProperties(match, text, number4, "US")
+
+    def testMatchesFoundWithMultipleSpaces(self):
+        number1 = "(415) 666-7777"
+        number2 = "(800) 443-1223"
+        text = number1 + " " + number2
+
+        matcher = PhoneNumberMatcher(text, "US")
+        match = matcher.next() if matcher.has_next() else None
+        self.assertMatchProperties(match, text, number1, "US")
+
+        match = matcher.next() if matcher.has_next() else None
+        self.assertMatchProperties(match, text, number2, "US")
+
     def testMatchWithSurroundingZipcodes(self):
         number = "415-666-7777"
         zipPreceding = "My address is CA 34215 - " + number + " is my number."
-        expectedResult = phonenumberutil.parse(number, "US")
 
         matcher = PhoneNumberMatcher(zipPreceding, "US")
-        if matcher.has_next():
-            match = matcher.next()
-        else:
-            match = None
-        self.assertTrue(match is not None,
-                        msg="Did not find a number in '" + zipPreceding + "'; expected " + number)
-        self.assertEqual(expectedResult, match.number)
-        self.assertEqual(number, match.raw_string)
+        match = matcher.next() if matcher.has_next() else None
+        self.assertMatchProperties(match, zipPreceding, number, "US")
 
         # Now repeat, but this time the phone number has spaces in it. It should still be found.
         number = "(415) 666 7777"
 
         zipFollowing = "My number is " + number + ". 34215 is my zip-code."
         matcher = PhoneNumberMatcher(zipFollowing, "US")
-        if matcher.has_next():
-            matchWithSpaces = matcher.next()
-        else:
-            matchWithSpaces = None
-        self.assertTrue(matchWithSpaces is not None,
-                        msg="Did not find a number in '" + zipFollowing + "'; expected " + number)
-        self.assertEqual(expectedResult, matchWithSpaces.number)
-        self.assertEqual(number, matchWithSpaces.raw_string)
+        match = matcher.next() if matcher.has_next() else None
+        self.assertMatchProperties(match, zipFollowing, number, "US")
 
     def testIsLatinLetter(self):
         self.assertTrue(PhoneNumberMatcher._is_latin_letter('c'))
@@ -599,10 +619,7 @@ def _doTestNumberMatchesForLeniency(self, testCases, leniency):
         wrongMatchFoundCount = 0
         for test in testCases:
             iterator = self.findNumbersForLeniency(test.rawString, test.region, leniency)
-            if iterator.has_next():
-                match = iterator.next()
-            else:
-                match = None
+            match = iterator.next() if iterator.has_next() else None
             if match is None:
                 noMatchFoundCount += 1
                 prnt("No match found in  %s for leniency: %s" % (test, leniency), file=sys.stderr)
@@ -617,10 +634,7 @@ def _doTestNumberNonMatchesForLeniency(self, testCases, leniency):
         matchFoundCount = 0
         for test in testCases:
             iterator = self.findNumbersForLeniency(test.rawString, test.region, leniency)
-            if iterator.has_next():
-                match = iterator.next()
-            else:
-                match = None
+            match = iterator.next() if iterator.has_next() else None
             if match is not None:
                 matchFoundCount += 1
                 prnt("Match found in %s for leniency: %s" % (test, leniency), file=sys.stderr)
@@ -830,6 +844,15 @@ def assertEqualRange(self, text, index, start, end):
         self.assertEqual(end - index, match.end)
         self.assertEqual(sub[match.start:match.end], match.raw_string)
 
+    def assertMatchProperties(self, match, text, number, region):
+        """Asserts that the match is not None, and that its raw string and
+        parsed number equal those expected for the given number and region."""
+        expectedResult = phonenumberutil.parse(number, region)
+        self.assertTrue(match is not None,
+                        msg="Did not find a number in '" + text + "'; expected " + number)
+        self.assertEqual(expectedResult, match.number)
+        self.assertEqual(number, match.raw_string)
+
     def doTestFindInContext(self, number, defaultCountry):
         """Tests numbers found by PhoneNumberMatcher in various textual contexts"""
         self.findPossibleInContext(number, defaultCountry)
@@ -893,10 +916,7 @@ def doTestInContext(self, number, defaultCountry, contextPairs, leniency):
             end = start + len(number)
             matcher = PhoneNumberMatcher(text, defaultCountry, leniency, 65535)
 
-            if matcher.has_next():
-                match = matcher.next()
-            else:
-                match = None
+            match = matcher.next() if matcher.has_next() else None
             self.assertTrue(match is not None,
                             msg="Did not find a number in '" + text + "'; expected '" + number + "'")
 
@@ -934,8 +954,6 @@ def testDoubleExtensionX(self):
         # can't be used in a NumberTest).
         m0 = PhoneNumberMatcher(xx_ext, "US", leniency=Leniency.POSSIBLE).next()
         self.assertEqual(xx_ext, m0.raw_string)
-        m1 = PhoneNumberMatcher(xx_ext, "US", leniency=Leniency.VALID).next()
-        self.assertEqual("800 234 1 111", m1.raw_string)
         matcher2 = PhoneNumberMatcher(xx_ext, "US", leniency=Leniency.STRICT_GROUPING)
         self.assertFalse(matcher2.has_next())