diff --git a/python/phonenumbers/asyoutypeformatter.py b/python/phonenumbers/asyoutypeformatter.py index 465c53b42..cf2c66c68 100644 --- a/python/phonenumbers/asyoutypeformatter.py +++ b/python/phonenumbers/asyoutypeformatter.py @@ -229,7 +229,7 @@ def _clear(self): self._should_add_space_after_national_prefix = False # This contains the national prefix that has been extracted. It # contains only digits without formatting. - self._national_prefix_extracted = U_EMPTY_STRING + self._extracted_national_prefix = U_EMPTY_STRING self._national_number = U_EMPTY_STRING # This indicates whether AsYouTypeFormatter is currently doing the # formatting. @@ -328,7 +328,7 @@ def input_digit(self, next_char, remember_position=False): self._is_expecting_country_calling_code = True else: # No IDD or plus sign is found, might be entering in national format. - self._national_prefix_extracted = self._remove_national_prefix_from_national_number() + self._extracted_national_prefix = self._remove_national_prefix_from_national_number() self._current_output = self._attempt_to_choose_formatting_pattern() return self._current_output if self._is_expecting_country_calling_code: @@ -370,17 +370,17 @@ def _attempt_to_choose_pattern_with_prefix_extracted(self): # shorter NDD doesn't result in a number we can format, we try to see if # we can extract a longer version here. def _able_to_extract_longer_ndd(self): - if len(self._national_prefix_extracted) > 0: + if len(self._extracted_national_prefix) > 0: # Put the extracted NDD back to the national number before # attempting to extract a new NDD. - self._national_number = self._national_prefix_extracted + self._national_number + self._national_number = self._extracted_national_prefix + self._national_number # Remove the previously extracted NDD from # prefixBeforeNationalNumber. We cannot simply set it to empty # string because people sometimes incorrectly enter national # prefix after the country code, e.g. +44 (0)20-1234-5678. 
- index_of_previous_ndd = self._prefix_before_national_number.rfind(self._national_prefix_extracted) + index_of_previous_ndd = self._prefix_before_national_number.rfind(self._extracted_national_prefix) self._prefix_before_national_number = self._prefix_before_national_number[:index_of_previous_ndd] - return self._national_prefix_extracted != self._remove_national_prefix_from_national_number() + return self._extracted_national_prefix != self._remove_national_prefix_from_national_number() def _is_digit_or_leading_plus_sign(self, next_char): return (next_char.isdigit() or @@ -548,6 +548,9 @@ def _attempt_to_extract_ccc(self): self._prefix_before_national_number += str(country_code) self._prefix_before_national_number += _SEPARATOR_BEFORE_NATIONAL_NUMBER + # When we have successfully extracted the IDD, the previously + # extracted NDD should be cleared because it is no longer valid. + self._extracted_national_prefix = U_EMPTY_STRING return True def _normalize_and_accrue_digits_and_plus_sign(self, next_char, remember_position): diff --git a/python/phonenumbers/phonenumbermatcher.py b/python/phonenumbers/phonenumbermatcher.py index 3de73ad7d..85a9d7c5a 100644 --- a/python/phonenumbers/phonenumbermatcher.py +++ b/python/phonenumbers/phonenumbermatcher.py @@ -138,13 +138,36 @@ def _limit(lower, upper): _TIME_STAMPS = re.compile(u("[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d [0-2]\\d$")) _TIME_STAMPS_SUFFIX = re.compile(u(":[0-5]\\d")) -# Matches white-space, which may indicate the end of a phone number and the -# start of something else (such as a neighbouring zip-code). If white-space is -# found, continues to match all characters that are not typically used to -# start a phone number. -_GROUP_SEPARATOR = re.compile(u("(?u)\\s") + # Unicode Separator, \p{Z} - u("[^") + _LEAD_CLASS_CHARS + - u("\\d]*")) # Unicode Decimal Digit Number, \p{Nd} +# Patterns used to extract phone numbers from a larger phone-number-like +# pattern. These are ordered according to specificity. 
For example, +# white-space is last since that is frequently used in numbers, not just to +# separate two numbers. We have separate patterns since we don't want to break +# up the phone-number-like text on more than one different kind of symbol at +# one time, although symbols of the same type (e.g. space) can be safely +# grouped together. +# +# Note that if there is a match, we will always check any text found up to the +# first match as well. +_INNER_MATCHES = ( + # Breaks on the slash - e.g. "651-234-2345/332-445-1234" + re.compile(u("/+(.*)")), + # Note that the bracket here is inside the capturing group, since we + # consider it part of the phone number. Will match a pattern like "(650) + # 223 3345 (754) 223 3321". + re.compile(u("(\\([^(]*)")), + # Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number." We + # require a space on either side of the hyphen for it to be considered a + # separator. Python's re has no \p{Z}, so (?u)\s stands in for it. + re.compile(u("(?u)(?:\\s-|-\\s)\\s*(.+)")), + # Various types of wide hyphens. Note we have decided not to enforce a + # space here, since it's possible that it's supposed to be used to break + # two numbers without spaces, and we haven't seen many instances of it + # used within a number. + re.compile(u("(?u)[\u2012-\u2015\uFF0D]\\s*(.+)")), + # Breaks on a full stop - e.g. "12345. 332-445-1234 is my number." + re.compile(u("(?u)\\.+\\s*([^.]+)")), + # Breaks on space - e.g. "3324451234 8002341234" + re.compile(u("(?u)\\s+(\\S+)"))) class Leniency(object): @@ -553,8 +576,7 @@ def _extract_match(self, candidate, offset): """ # Skip a match that is more likely a publication page reference or a # date. - if (_PUB_PAGES.search(candidate) or - _SLASH_SEPARATED_DATES.search(candidate)): + if (_SLASH_SEPARATED_DATES.search(candidate)): return None # Skip potential time-stamps. 
@@ -581,50 +603,26 @@ def _extract_inner_match(self, candidate, offset): offset -- The current offset of candidate within text Returns the match found, None if none can be found """ - # Try removing either the first or last "group" in the number and see - # if this gives a result. We consider white space to be a possible - # indication of the start or end of the phone number. - group_match = _GROUP_SEPARATOR.search(candidate) - if group_match: - # Try the first group by itself. - first_group_only = candidate[:group_match.start()] - first_group_only = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN, - first_group_only) - match = self._parse_and_verify(first_group_only, offset) - if match is not None: - return match - self._max_tries -= 1 - - without_first_group_start = group_match.end() - # Try the rest of the candidate without the first group. - without_first_group = candidate[without_first_group_start:] - without_first_group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN, - without_first_group) - match = self._parse_and_verify(without_first_group, offset + without_first_group_start) - if match is not None: - return match - self._max_tries -= 1 - - if self._max_tries > 0: - last_group_start = without_first_group_start - group_match = _GROUP_SEPARATOR.search(candidate, last_group_start) - while group_match: - # Find the last group. - last_group_start = group_match.start() - group_match = _GROUP_SEPARATOR.search(candidate, group_match.end()) - without_last_group = candidate[:last_group_start] - without_last_group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN, - without_last_group) - if without_last_group == first_group_only: - # If there are only two groups, then the group "without - # the last group" is the same as the first group. In these - # cases, we don't want to re-check the number group, so we - # exit already. 
- return None - match = self._parse_and_verify(without_last_group, offset) + for possible_inner_match in _INNER_MATCHES: + group_match = possible_inner_match.search(candidate) + is_first_match = True + while group_match and self._max_tries > 0: + if is_first_match: + # We should handle any group before this one too. + group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN, + candidate[:group_match.start()]) + match = self._parse_and_verify(group, offset) + if match is not None: + return match + self._max_tries -= 1 + is_first_match = False + group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN, + group_match.group(1)) + match = self._parse_and_verify(group, offset + group_match.start(1)) if match is not None: return match self._max_tries -= 1 + group_match = possible_inner_match.search(candidate, group_match.start() + 1) return None def _parse_and_verify(self, candidate, offset): @@ -640,7 +638,7 @@ def _parse_and_verify(self, candidate, offset): try: # Check the candidate doesn't contain any formatting which would # indicate that it really isn't a phone number. - if not fullmatch(_MATCHING_BRACKETS, candidate): + if (not fullmatch(_MATCHING_BRACKETS, candidate) or _PUB_PAGES.search(candidate)): return None # If leniency is set to VALID or stricter, we also want to skip diff --git a/python/phonenumbers/phonenumberutil.py b/python/phonenumbers/phonenumberutil.py index 6677b1faf..dc02a86e9 100644 --- a/python/phonenumbers/phonenumberutil.py +++ b/python/phonenumbers/phonenumberutil.py @@ -10,7 +10,6 @@ http://www.iso.org/iso/country_codes/iso_3166_code_lists/country_names_and_code_elements.htm author: Shaopeng Jia (original Java version) -author: Lara Rennie (original Java Version) author: David Drysdale (Python version) """ # Based on original Java code: @@ -73,7 +72,7 @@ _MIN_LENGTH_FOR_NSN = 2 # The ITU says the maximum length should be 15, but we have found longer # numbers in Germany. 
-_MAX_LENGTH_FOR_NSN = 16 +_MAX_LENGTH_FOR_NSN = 17 # The maximum length of the country calling code. _MAX_LENGTH_COUNTRY_CODE = 3 # We don't allow input strings for parsing to be longer than 250 chars. This diff --git a/python/tests/asyoutypetest.py b/python/tests/asyoutypetest.py index eabf1b355..cac66e2ee 100644 --- a/python/tests/asyoutypetest.py +++ b/python/tests/asyoutypetest.py @@ -1104,6 +1104,40 @@ def testAYTFShortNumberFormattingFix_US(self): self.assertEqual("12", formatter.input_digit('2')) self.assertEqual("1 22", formatter.input_digit('2')) + def testAYTFClearNDDAfterIDDExtraction(self): + formatter = AsYouTypeFormatter("KR") + + # Check that when we have successfully extracted an IDD, the + # previously extracted NDD is cleared since it is no longer valid. + self.assertEqual("0", formatter.input_digit('0')) + self.assertEqual("00", formatter.input_digit('0')) + self.assertEqual("007", formatter.input_digit('7')) + self.assertEqual("0070", formatter.input_digit('0')) + self.assertEqual("00700", formatter.input_digit('0')) + self.assertEqual("0", formatter._extracted_national_prefix) + + # Once the IDD "00700" has been extracted, it no longer makes sense + # for the initial "0" to be treated as an NDD. 
+ self.assertEqual("00700 1 ", formatter.input_digit('1')) + self.assertEqual("", formatter._extracted_national_prefix) + + self.assertEqual("00700 1 2", formatter.input_digit('2')) + self.assertEqual("00700 1 23", formatter.input_digit('3')) + self.assertEqual("00700 1 234", formatter.input_digit('4')) + self.assertEqual("00700 1 234 5", formatter.input_digit('5')) + self.assertEqual("00700 1 234 56", formatter.input_digit('6')) + self.assertEqual("00700 1 234 567", formatter.input_digit('7')) + self.assertEqual("00700 1 234 567 8", formatter.input_digit('8')) + self.assertEqual("00700 1 234 567 89", formatter.input_digit('9')) + self.assertEqual("00700 1 234 567 890", formatter.input_digit('0')) + self.assertEqual("00700 1 234 567 8901", formatter.input_digit('1')) + self.assertEqual("00700123456789012", formatter.input_digit('2')) + self.assertEqual("007001234567890123", formatter.input_digit('3')) + self.assertEqual("0070012345678901234", formatter.input_digit('4')) + self.assertEqual("00700123456789012345", formatter.input_digit('5')) + self.assertEqual("007001234567890123456", formatter.input_digit('6')) + self.assertEqual("0070012345678901234567", formatter.input_digit('7')) + def testAYTFShortNumberFormatting_AR(self): # Python version extra test: use real metadata formatter = AsYouTypeFormatter("AR") diff --git a/python/tests/phonenumbermatchertest.py b/python/tests/phonenumbermatchertest.py index 7d9d24abc..668eae532 100644 --- a/python/tests/phonenumbermatchertest.py +++ b/python/tests/phonenumbermatchertest.py @@ -283,7 +283,8 @@ def testFindNationalNumber(self): self.doTestFindInContext("64(0)64123456", "NZ") # Check that using a "/" is fine in a phone number. - self.doTestFindInContext("123/45678", "DE") + # Note that real Polish numbers do *not* start with a 0. + self.doTestFindInContext("0123/456789", "PL") self.doTestFindInContext("123-456-7890", "US") # See PhoneNumberUtilTest.testParseWithInternationalPrefixes(). 
@@ -418,34 +419,53 @@ def testIntermediateParsePositions(self): for ii in range(8, 20): self.assertEqualRange(text, ii, 19, 28) + def testFourMatchesInARow(self): + number1 = "415-666-7777" + number2 = "800-443-1223" + number3 = "212-443-1223" + number4 = "650-443-1223" + text = number1 + " - " + number2 + " - " + number3 + " - " + number4 + + matcher = PhoneNumberMatcher(text, "US") + match = matcher.next() if matcher.has_next() else None + self.assertMatchProperties(match, text, number1, "US") + + match = matcher.next() if matcher.has_next() else None + self.assertMatchProperties(match, text, number2, "US") + + match = matcher.next() if matcher.has_next() else None + self.assertMatchProperties(match, text, number3, "US") + + match = matcher.next() if matcher.has_next() else None + self.assertMatchProperties(match, text, number4, "US") + + def testMatchesFoundWithMultipleSpaces(self): + number1 = "(415) 666-7777" + number2 = "(800) 443-1223" + text = number1 + " " + number2 + + matcher = PhoneNumberMatcher(text, "US") + match = matcher.next() if matcher.has_next() else None + self.assertMatchProperties(match, text, number1, "US") + + match = matcher.next() if matcher.has_next() else None + self.assertMatchProperties(match, text, number2, "US") + def testMatchWithSurroundingZipcodes(self): number = "415-666-7777" zipPreceding = "My address is CA 34215 - " + number + " is my number." - expectedResult = phonenumberutil.parse(number, "US") matcher = PhoneNumberMatcher(zipPreceding, "US") - if matcher.has_next(): - match = matcher.next() - else: - match = None - self.assertTrue(match is not None, - msg="Did not find a number in '" + zipPreceding + "'; expected " + number) - self.assertEqual(expectedResult, match.number) - self.assertEqual(number, match.raw_string) + match = matcher.next() if matcher.has_next() else None + self.assertMatchProperties(match, zipPreceding, number, "US") # Now repeat, but this time the phone number has spaces in it. 
It should still be found. number = "(415) 666 7777" zipFollowing = "My number is " + number + ". 34215 is my zip-code." matcher = PhoneNumberMatcher(zipFollowing, "US") - if matcher.has_next(): - matchWithSpaces = matcher.next() - else: - matchWithSpaces = None - self.assertTrue(matchWithSpaces is not None, - msg="Did not find a number in '" + zipFollowing + "'; expected " + number) - self.assertEqual(expectedResult, matchWithSpaces.number) - self.assertEqual(number, matchWithSpaces.raw_string) + match = matcher.next() if matcher.has_next() else None + self.assertMatchProperties(match, zipFollowing, number, "US") def testIsLatinLetter(self): self.assertTrue(PhoneNumberMatcher._is_latin_letter('c')) @@ -599,10 +619,7 @@ def _doTestNumberMatchesForLeniency(self, testCases, leniency): wrongMatchFoundCount = 0 for test in testCases: iterator = self.findNumbersForLeniency(test.rawString, test.region, leniency) - if iterator.has_next(): - match = iterator.next() - else: - match = None + match = iterator.next() if iterator.has_next() else None if match is None: noMatchFoundCount += 1 prnt("No match found in %s for leniency: %s" % (test, leniency), file=sys.stderr) @@ -617,10 +634,7 @@ def _doTestNumberNonMatchesForLeniency(self, testCases, leniency): matchFoundCount = 0 for test in testCases: iterator = self.findNumbersForLeniency(test.rawString, test.region, leniency) - if iterator.has_next(): - match = iterator.next() - else: - match = None + match = iterator.next() if iterator.has_next() else None if match is not None: matchFoundCount += 1 prnt("Match found in %s for leniency: %s" % (test, leniency), file=sys.stderr) @@ -830,6 +844,15 @@ def assertEqualRange(self, text, index, start, end): self.assertEqual(end - index, match.end) self.assertEqual(sub[match.start:match.end], match.raw_string) + def assertMatchProperties(self, match, text, number, region): + """Asserts that the expected match is non-null, and that the raw string + and expected proto buffer are set 
appropriately.""" + expectedResult = phonenumberutil.parse(number, region) + self.assertTrue(match is not None, + msg="Did not find a number in '" + text + "'; expected " + number) + self.assertEqual(expectedResult, match.number) + self.assertEqual(number, match.raw_string) + def doTestFindInContext(self, number, defaultCountry): """Tests numbers found by PhoneNumberMatcher in various textual contexts""" self.findPossibleInContext(number, defaultCountry) @@ -893,10 +916,7 @@ def doTestInContext(self, number, defaultCountry, contextPairs, leniency): end = start + len(number) matcher = PhoneNumberMatcher(text, defaultCountry, leniency, 65535) - if matcher.has_next(): - match = matcher.next() - else: - match = None + match = matcher.next() if matcher.has_next() else None self.assertTrue(match is not None, msg="Did not find a number in '" + text + "'; expected '" + number + "'") @@ -934,8 +954,6 @@ def testDoubleExtensionX(self): # can't be used in a NumberTest). m0 = PhoneNumberMatcher(xx_ext, "US", leniency=Leniency.POSSIBLE).next() self.assertEqual(xx_ext, m0.raw_string) - m1 = PhoneNumberMatcher(xx_ext, "US", leniency=Leniency.VALID).next() - self.assertEqual("800 234 1 111", m1.raw_string) matcher2 = PhoneNumberMatcher(xx_ext, "US", leniency=Leniency.STRICT_GROUPING) self.assertFalse(matcher2.has_next()) diff --git a/python/tests/phonenumberutiltest.py b/python/tests/phonenumberutiltest.py index 8382d783c..1680cf32c 100755 --- a/python/tests/phonenumberutiltest.py +++ b/python/tests/phonenumberutiltest.py @@ -725,10 +725,9 @@ def testFormatNumberForMobileDialing(self): # Test the special logic for NANPA countries, for which regular length phone numbers are always # output in international format, but short numbers are in national format. 
- usRegularNumber = PhoneNumber(country_code=1, national_number=6502530000) - self.assertEqual("+16502530000", phonenumbers.format_number_for_mobile_dialing(usRegularNumber, "US", False)) - self.assertEqual("+16502530000", phonenumbers.format_number_for_mobile_dialing(usRegularNumber, "CA", False)) - self.assertEqual("+16502530000", phonenumbers.format_number_for_mobile_dialing(usRegularNumber, "BR", False)) + self.assertEqual("+16502530000", phonenumbers.format_number_for_mobile_dialing(US_NUMBER, "US", False)) + self.assertEqual("+16502530000", phonenumbers.format_number_for_mobile_dialing(US_NUMBER, "CA", False)) + self.assertEqual("+16502530000", phonenumbers.format_number_for_mobile_dialing(US_NUMBER, "BR", False)) usShortNumber = PhoneNumber(country_code=1, national_number=911) self.assertEqual("911", phonenumbers.format_number_for_mobile_dialing(usShortNumber, "US", False)) self.assertEqual("", phonenumbers.format_number_for_mobile_dialing(usShortNumber, "CA", False)) @@ -1245,7 +1244,7 @@ def testIsPossibleNumberWithReason(self): self.assertEqual(ValidationResult.TOO_SHORT, phonenumbers.is_possible_number_with_reason(adNumber)) adNumber.country_code = 376 - adNumber.national_number = to_long(12345678901234567) + adNumber.national_number = to_long(123456789012345678) self.assertEqual(ValidationResult.TOO_LONG, phonenumbers.is_possible_number_with_reason(adNumber))