Skip to content

Commit

Permalink
Merge code changes from upstream r650
Browse files Browse the repository at this point in the history
  • Loading branch information
daviddrysdale committed Feb 28, 2014
1 parent a507305 commit 966a707
Show file tree
Hide file tree
Showing 6 changed files with 147 additions and 96 deletions.
15 changes: 9 additions & 6 deletions python/phonenumbers/asyoutypeformatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ def _clear(self):
self._should_add_space_after_national_prefix = False
# This contains the national prefix that has been extracted. It
# contains only digits without formatting.
self._national_prefix_extracted = U_EMPTY_STRING
self._extracted_national_prefix = U_EMPTY_STRING
self._national_number = U_EMPTY_STRING
# This indicates whether AsYouTypeFormatter is currently doing the
# formatting.
Expand Down Expand Up @@ -328,7 +328,7 @@ def input_digit(self, next_char, remember_position=False):
self._is_expecting_country_calling_code = True
else:
# No IDD or plus sign is found, might be entering in national format.
self._national_prefix_extracted = self._remove_national_prefix_from_national_number()
self._extracted_national_prefix = self._remove_national_prefix_from_national_number()
self._current_output = self._attempt_to_choose_formatting_pattern()
return self._current_output
if self._is_expecting_country_calling_code:
Expand Down Expand Up @@ -370,17 +370,17 @@ def _attempt_to_choose_pattern_with_prefix_extracted(self):
# shorter NDD doesn't result in a number we can format, we try to see if
# we can extract a longer version here.
def _able_to_extract_longer_ndd(self):
if len(self._national_prefix_extracted) > 0:
if len(self._extracted_national_prefix) > 0:
# Put the extracted NDD back to the national number before
# attempting to extract a new NDD.
self._national_number = self._national_prefix_extracted + self._national_number
self._national_number = self._extracted_national_prefix + self._national_number
# Remove the previously extracted NDD from
# prefixBeforeNationalNumber. We cannot simply set it to empty
# string because people sometimes incorrectly enter national
# prefix after the country code, e.g. +44 (0)20-1234-5678.
index_of_previous_ndd = self._prefix_before_national_number.rfind(self._national_prefix_extracted)
index_of_previous_ndd = self._prefix_before_national_number.rfind(self._extracted_national_prefix)
self._prefix_before_national_number = self._prefix_before_national_number[:index_of_previous_ndd]
return self._national_prefix_extracted != self._remove_national_prefix_from_national_number()
return self._extracted_national_prefix != self._remove_national_prefix_from_national_number()

def _is_digit_or_leading_plus_sign(self, next_char):
return (next_char.isdigit() or
Expand Down Expand Up @@ -548,6 +548,9 @@ def _attempt_to_extract_ccc(self):

self._prefix_before_national_number += str(country_code)
self._prefix_before_national_number += _SEPARATOR_BEFORE_NATIONAL_NUMBER
# When we have successfully extracted the IDD, the previously
# extracted NDD should be cleared because it is no longer valid.
self._extracted_national_prefix = U_EMPTY_STRING
return True

def _normalize_and_accrue_digits_and_plus_sign(self, next_char, remember_position):
Expand Down
100 changes: 49 additions & 51 deletions python/phonenumbers/phonenumbermatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,13 +138,36 @@ def _limit(lower, upper):
_TIME_STAMPS = re.compile(u("[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d [0-2]\\d$"))
_TIME_STAMPS_SUFFIX = re.compile(u(":[0-5]\\d"))

# Matches white-space, which may indicate the end of a phone number and the
# start of something else (such as a neighbouring zip-code). If white-space is
# found, continues to match all characters that are not typically used to
# start a phone number.
_GROUP_SEPARATOR = re.compile(u("(?u)\\s") + # Unicode Separator, \p{Z}
u("[^") + _LEAD_CLASS_CHARS +
u("\\d]*")) # Unicode Decimal Digit Number, \p{Nd}
# Patterns used to extract phone numbers from a larger phone-number-like
# pattern. These are ordered according to specificity. For example,
# white-space is last since that is frequently used in numbers, not just to
# separate two numbers. We have separate patterns since we don't want to break
# up the phone-number-like text on more than one different kind of symbol at
# one time, although symbols of the same type (e.g. space) can be safely
# grouped together.
#
# Every pattern exposes the text following the separator as group 1, so
# callers can read both the candidate text and its start offset from the match.
#
# Note that if there is a match, we will always check any text found up to the
# first match as well.
_INNER_MATCHES = (
    # Breaks on the slash - e.g. "651-234-2345/332-445-1234"
    re.compile(u("/+(.*)")),
    # Note that the bracket here is inside the capturing group, since we
    # consider it part of the phone number. Will match a pattern like "(650)
    # 223 3345 (754) 223 3321".
    re.compile(u("(\\([^(]*)")),
    # Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number." We
    # require a space on either side of the hyphen for it to be considered a
    # separator.
    # Python's re module has no \p{...} Unicode property classes (an unknown
    # letter escape such as \p is rejected by modern Pythons), so the Unicode
    # separator class \p{Z} is expressed as \s under the (?u) flag.
    re.compile(u("(?u)(?:\\s-|-\\s)\\s*(.+)")),
    # Various types of wide hyphens. Note we have decided not to enforce a
    # space here, since it's possible that it's supposed to be used to break
    # two numbers without spaces, and we haven't seen many instances of it
    # used within a number.
    re.compile(u("(?u)[\u2012-\u2015\uFF0D]\\s*(.+)")),
    # Breaks on a full stop - e.g. "12345. 332-445-1234 is my number."
    re.compile(u("(?u)\\.+\\s*([^.]+)")),
    # Breaks on space - e.g. "3324451234 8002341234"
    re.compile(u("(?u)\\s+(\\S+)")),
)


class Leniency(object):
Expand Down Expand Up @@ -553,8 +576,7 @@ def _extract_match(self, candidate, offset):
"""
# Skip a match that is more likely a publication page reference or a
# date.
if (_PUB_PAGES.search(candidate) or
_SLASH_SEPARATED_DATES.search(candidate)):
if (_SLASH_SEPARATED_DATES.search(candidate)):
return None

# Skip potential time-stamps.
Expand All @@ -581,50 +603,26 @@ def _extract_inner_match(self, candidate, offset):
offset -- The current offset of candidate within text
Returns the match found, None if none can be found
"""
# Try removing either the first or last "group" in the number and see
# if this gives a result. We consider white space to be a possible
# indication of the start or end of the phone number.
group_match = _GROUP_SEPARATOR.search(candidate)
if group_match:
# Try the first group by itself.
first_group_only = candidate[:group_match.start()]
first_group_only = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
first_group_only)
match = self._parse_and_verify(first_group_only, offset)
if match is not None:
return match
self._max_tries -= 1

without_first_group_start = group_match.end()
# Try the rest of the candidate without the first group.
without_first_group = candidate[without_first_group_start:]
without_first_group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
without_first_group)
match = self._parse_and_verify(without_first_group, offset + without_first_group_start)
if match is not None:
return match
self._max_tries -= 1

if self._max_tries > 0:
last_group_start = without_first_group_start
group_match = _GROUP_SEPARATOR.search(candidate, last_group_start)
while group_match:
# Find the last group.
last_group_start = group_match.start()
group_match = _GROUP_SEPARATOR.search(candidate, group_match.end())
without_last_group = candidate[:last_group_start]
without_last_group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
without_last_group)
if without_last_group == first_group_only:
# If there are only two groups, then the group "without
# the last group" is the same as the first group. In these
# cases, we don't want to re-check the number group, so we
# exit already.
return None
match = self._parse_and_verify(without_last_group, offset)
for possible_inner_match in _INNER_MATCHES:
group_match = possible_inner_match.search(candidate)
is_first_match = True
while group_match and self._max_tries > 0:
if is_first_match:
# We should handle any group before this one too.
group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
candidate[:group_match.start()])
match = self._parse_and_verify(group, offset)
if match is not None:
return match
self._max_tries -= 1
is_first_match = False
group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
group_match.group(1))
match = self._parse_and_verify(group, offset + group_match.start(1))
if match is not None:
return match
self._max_tries -= 1
group_match = possible_inner_match.search(candidate, group_match.start() + 1)
return None

def _parse_and_verify(self, candidate, offset):
Expand All @@ -640,7 +638,7 @@ def _parse_and_verify(self, candidate, offset):
try:
# Check the candidate doesn't contain any formatting which would
# indicate that it really isn't a phone number.
if not fullmatch(_MATCHING_BRACKETS, candidate):
if (not fullmatch(_MATCHING_BRACKETS, candidate) or _PUB_PAGES.search(candidate)):
return None

# If leniency is set to VALID or stricter, we also want to skip
Expand Down
3 changes: 1 addition & 2 deletions python/phonenumbers/phonenumberutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
http://www.iso.org/iso/country_codes/iso_3166_code_lists/country_names_and_code_elements.htm
author: Shaopeng Jia (original Java version)
author: Lara Rennie (original Java Version)
author: David Drysdale (Python version)
"""
# Based on original Java code:
Expand Down Expand Up @@ -73,7 +72,7 @@
_MIN_LENGTH_FOR_NSN = 2
# The ITU says the maximum length should be 15, but we have found longer
# numbers in Germany.
_MAX_LENGTH_FOR_NSN = 16
_MAX_LENGTH_FOR_NSN = 17
# The maximum length of the country calling code.
_MAX_LENGTH_COUNTRY_CODE = 3
# We don't allow input strings for parsing to be longer than 250 chars. This
Expand Down
34 changes: 34 additions & 0 deletions python/tests/asyoutypetest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1104,6 +1104,40 @@ def testAYTFShortNumberFormattingFix_US(self):
self.assertEqual("12", formatter.input_digit('2'))
self.assertEqual("1 22", formatter.input_digit('2'))

def testAYTFClearNDDAfterIDDExtraction(self):
    # Verifies that a previously extracted NDD is discarded as soon as an
    # IDD has been successfully extracted, since the NDD is no longer valid.
    aytf = AsYouTypeFormatter("KR")

    def feed(steps):
        # Type each digit in order and compare the formatter's output with
        # the expected rendering for that step.
        for digit, expected_output in steps:
            self.assertEqual(expected_output, aytf.input_digit(digit))

    feed((('0', "0"),
          ('0', "00"),
          ('7', "007"),
          ('0', "0070"),
          ('0', "00700")))
    # So far the leading "0" is still held as the extracted national prefix.
    self.assertEqual("0", aytf._extracted_national_prefix)

    # Once the IDD "00700" has been extracted, it no longer makes sense
    # for the initial "0" to be treated as an NDD.
    feed((('1', "00700 1 "),))
    self.assertEqual("", aytf._extracted_national_prefix)

    # Digits that can still be formatted with grouping...
    feed((('2', "00700 1 2"),
          ('3', "00700 1 23"),
          ('4', "00700 1 234"),
          ('5', "00700 1 234 5"),
          ('6', "00700 1 234 56"),
          ('7', "00700 1 234 567"),
          ('8', "00700 1 234 567 8"),
          ('9', "00700 1 234 567 89"),
          ('0', "00700 1 234 567 890"),
          ('1', "00700 1 234 567 8901")))
    # ...after which the expected output is the raw digits with no grouping.
    feed((('2', "00700123456789012"),
          ('3', "007001234567890123"),
          ('4', "0070012345678901234"),
          ('5', "00700123456789012345"),
          ('6', "007001234567890123456"),
          ('7', "0070012345678901234567")))

def testAYTFShortNumberFormatting_AR(self):
# Python version extra test: use real metadata
formatter = AsYouTypeFormatter("AR")
Expand Down
82 changes: 50 additions & 32 deletions python/tests/phonenumbermatchertest.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,8 @@ def testFindNationalNumber(self):

self.doTestFindInContext("64(0)64123456", "NZ")
# Check that using a "/" is fine in a phone number.
self.doTestFindInContext("123/45678", "DE")
# Note that real Polish numbers do *not* start with a 0.
self.doTestFindInContext("0123/456789", "PL")
self.doTestFindInContext("123-456-7890", "US")

# See PhoneNumberUtilTest.testParseWithInternationalPrefixes().
Expand Down Expand Up @@ -418,34 +419,53 @@ def testIntermediateParsePositions(self):
for ii in range(8, 20):
self.assertEqualRange(text, ii, 19, 28)

def testFourMatchesInARow(self):
    # Four US numbers joined by " - " must each be reported, in order, by
    # successive calls on a single matcher.
    expected_numbers = ("415-666-7777",
                        "800-443-1223",
                        "212-443-1223",
                        "650-443-1223")
    text = " - ".join(expected_numbers)

    matcher = PhoneNumberMatcher(text, "US")
    for expected in expected_numbers:
        # A missing match is passed on as None so that the assertion helper
        # reports it, rather than calling next() on an exhausted matcher.
        if matcher.has_next():
            found = matcher.next()
        else:
            found = None
        self.assertMatchProperties(found, text, expected, "US")

def testMatchesFoundWithMultipleSpaces(self):
    # Two bracketed US numbers separated only by white-space must both be
    # found, in order.
    # NOTE(review): the separator is a single space as it appears here; the
    # test name suggests several consecutive spaces were intended. Confirm
    # against the upstream test before relying on this.
    separator = " "
    expected_numbers = ("(415) 666-7777", "(800) 443-1223")
    text = expected_numbers[0] + separator + expected_numbers[1]

    matcher = PhoneNumberMatcher(text, "US")
    for expected in expected_numbers:
        if matcher.has_next():
            found = matcher.next()
        else:
            found = None
        self.assertMatchProperties(found, text, expected, "US")

def testMatchWithSurroundingZipcodes(self):
number = "415-666-7777"
zipPreceding = "My address is CA 34215 - " + number + " is my number."
expectedResult = phonenumberutil.parse(number, "US")

matcher = PhoneNumberMatcher(zipPreceding, "US")
if matcher.has_next():
match = matcher.next()
else:
match = None
self.assertTrue(match is not None,
msg="Did not find a number in '" + zipPreceding + "'; expected " + number)
self.assertEqual(expectedResult, match.number)
self.assertEqual(number, match.raw_string)
match = matcher.next() if matcher.has_next() else None
self.assertMatchProperties(match, zipPreceding, number, "US")

# Now repeat, but this time the phone number has spaces in it. It should still be found.
number = "(415) 666 7777"

zipFollowing = "My number is " + number + ". 34215 is my zip-code."
matcher = PhoneNumberMatcher(zipFollowing, "US")
if matcher.has_next():
matchWithSpaces = matcher.next()
else:
matchWithSpaces = None
self.assertTrue(matchWithSpaces is not None,
msg="Did not find a number in '" + zipFollowing + "'; expected " + number)
self.assertEqual(expectedResult, matchWithSpaces.number)
self.assertEqual(number, matchWithSpaces.raw_string)
match = matcher.next() if matcher.has_next() else None
self.assertMatchProperties(match, zipFollowing, number, "US")

def testIsLatinLetter(self):
self.assertTrue(PhoneNumberMatcher._is_latin_letter('c'))
Expand Down Expand Up @@ -599,10 +619,7 @@ def _doTestNumberMatchesForLeniency(self, testCases, leniency):
wrongMatchFoundCount = 0
for test in testCases:
iterator = self.findNumbersForLeniency(test.rawString, test.region, leniency)
if iterator.has_next():
match = iterator.next()
else:
match = None
match = iterator.next() if iterator.has_next() else None
if match is None:
noMatchFoundCount += 1
prnt("No match found in %s for leniency: %s" % (test, leniency), file=sys.stderr)
Expand All @@ -617,10 +634,7 @@ def _doTestNumberNonMatchesForLeniency(self, testCases, leniency):
matchFoundCount = 0
for test in testCases:
iterator = self.findNumbersForLeniency(test.rawString, test.region, leniency)
if iterator.has_next():
match = iterator.next()
else:
match = None
match = iterator.next() if iterator.has_next() else None
if match is not None:
matchFoundCount += 1
prnt("Match found in %s for leniency: %s" % (test, leniency), file=sys.stderr)
Expand Down Expand Up @@ -830,6 +844,15 @@ def assertEqualRange(self, text, index, start, end):
self.assertEqual(end - index, match.end)
self.assertEqual(sub[match.start:match.end], match.raw_string)

def assertMatchProperties(self, match, text, number, region):
    """Assert that a match was found and that it corresponds to number.

    match  -- The match object produced by the matcher, or None
    text   -- The full text that was searched (used in the failure message)
    number -- The phone number string expected to have been found
    region -- The region used to parse number into the expected value

    The match must be non-None, its number must equal number parsed for
    region, and its raw string must equal number exactly.
    """
    # Parse first, exactly as before, so a parse failure surfaces ahead of
    # the None check.
    parsed_expectation = phonenumberutil.parse(number, region)
    not_found_msg = "Did not find a number in '" + text + "'; expected " + number
    was_found = match is not None
    self.assertTrue(was_found, msg=not_found_msg)
    self.assertEqual(parsed_expectation, match.number)
    self.assertEqual(number, match.raw_string)

def doTestFindInContext(self, number, defaultCountry):
"""Tests numbers found by PhoneNumberMatcher in various textual contexts"""
self.findPossibleInContext(number, defaultCountry)
Expand Down Expand Up @@ -893,10 +916,7 @@ def doTestInContext(self, number, defaultCountry, contextPairs, leniency):
end = start + len(number)
matcher = PhoneNumberMatcher(text, defaultCountry, leniency, 65535)

if matcher.has_next():
match = matcher.next()
else:
match = None
match = matcher.next() if matcher.has_next() else None
self.assertTrue(match is not None,
msg="Did not find a number in '" + text + "'; expected '" + number + "'")

Expand Down Expand Up @@ -934,8 +954,6 @@ def testDoubleExtensionX(self):
# can't be used in a NumberTest).
m0 = PhoneNumberMatcher(xx_ext, "US", leniency=Leniency.POSSIBLE).next()
self.assertEqual(xx_ext, m0.raw_string)
m1 = PhoneNumberMatcher(xx_ext, "US", leniency=Leniency.VALID).next()
self.assertEqual("800 234 1 111", m1.raw_string)
matcher2 = PhoneNumberMatcher(xx_ext, "US", leniency=Leniency.STRICT_GROUPING)
self.assertFalse(matcher2.has_next())

Expand Down
Loading

0 comments on commit 966a707

Please sign in to comment.