Skip to content

Commit

Permalink
German Postcodes (#23)
Browse files Browse the repository at this point in the history
  • Loading branch information
MayBaMay authored and maylis baschet committed Mar 11, 2022
1 parent 8ed3dd3 commit a68e3e0
Show file tree
Hide file tree
Showing 6 changed files with 183 additions and 16 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)


## [4.5.1] - 2022-03-11

### Added
- Detection of German Länder (states) from postcodes


## [4.5.0] - 2021-10-11

### Changed
Expand Down
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ pip install https://github.com/jurismarches/geoconvert/archive/master.zip
('CA', 'QC')
>>> address_to_country_and_subdivision_codes("1800 W Erie Ave, Lorain, OH 44052") # US
('US', 'OH')
>>> address_to_country_and_subdivision_codes("96524 Föritztal OT Neuhaus-Schierschnitz") # DE not found
(None, None)
>>> address_to_country_and_subdivision_codes("96524 Föritztal OT Neuhaus-Schierschnitz Germany") # DE found
('DE', 'TH')
>>> address_to_country_and_subdivision_codes("Kairo", lang="de")
('EG', None)
>>> address_to_country_and_subdivision_codes("Kairo", lang="de", country="EG")
Expand Down Expand Up @@ -158,6 +162,7 @@ There should be no confusion between French and US postcodes:
```python
>>> address_to_subdivision_code("2 pl. Saint-Pierre, 44000 Nantes", country="US")
>>> address_to_subdivision_code("1800 W Erie Ave, Lorain, OH 44052", country="FR")
>>> address_to_subdivision_code("96524 Föritztal OT Neuhaus-Schierschnitz", country="US")

```

Expand Down Expand Up @@ -331,19 +336,21 @@ Result format can be choosen between a tuple and an iso code
('DE', 'BB')
>>> address_to_country_and_subdivision_codes("14467 Potsdam", iso_format=True)
'DE-BB'
>>> address_to_country_and_subdivision_codes("14467 Germany")
>>> address_to_country_and_subdivision_codes("Eschborn Germany")
('DE', None)
>>> address_to_country_and_subdivision_codes("14467 Germany", iso_format=True)
>>> address_to_country_and_subdivision_codes("Eschborn Germany", iso_format=True)
'DE'

```

There should be no confusion between French and US postcodes:
There should be no confusion between French, German and US postcodes:
```python
>>> address_to_country_and_subdivision_codes("2 pl. Saint-Pierre, 44000 Nantes", country="US")
(None, None)
>>> address_to_country_and_subdivision_codes("6931 Rings Rd, Amlin, OH 43002", country="FR")
(None, None)
>>> address_to_country_and_subdivision_codes("Straße 3 53119 Bonn", country="US")
(None, None)

```

Expand Down
41 changes: 36 additions & 5 deletions geoconvert/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
BR_POSTCODES_RANGE,
CA_POSTCODE_FIRST_LETTER_TO_PROVINCE_CODE,
DE_HAUPTSTADT,
DE_POSTCODE_RANGE,
br_postcode_regex,
br_state_code_regex,
br_state_name_regex,
Expand All @@ -17,6 +18,7 @@
de_land_hauptstadt_regex,
de_land_name_regex,
de_landers,
de_postcode_regex,
fr_department_name_regex,
fr_departments,
fr_postcode_regex,
Expand Down Expand Up @@ -141,12 +143,29 @@ def de_address_to_land_code(text):
code = de_hauptstadt_to_land_code(text)
if code:
return code
code = de_postcode_to_land_code(text)
if code is not None:
return code
# Look for the land code in the plain text
code_match = re.search(de_land_code_regex, text)
if code_match:
return code_match.group("code").upper()


def de_postcode_to_land_code(text):
    """Return the German Land code for the first postcode found in *text*.

    The text is searched with ``de_postcode_regex`` (a German postcode is
    made of 5 digits). The captured number is then compared with the keys of
    ``DE_POSTCODE_RANGE``, each key being the inclusive upper bound of a
    range, in the mapping's own iteration order. The code attached to the
    first bound greater than or equal to the postcode is returned.

    :param text: free-form address text.
    :return: the two-letter Land code, or ``None`` when *text* holds no
        postcode or when no upper bound covers it.
    """
    de_postcode_match = de_postcode_regex.search(text)
    if not de_postcode_match:
        return None

    postcode = int(de_postcode_match.group("postcode"))
    # The lookup relies on the insertion order of the mapping: the first
    # upper bound that is not below the postcode wins.
    for max_range, state_code in DE_POSTCODE_RANGE.items():
        if max_range >= postcode:
            return state_code

    # No range covers this postcode: report "not found" rather than leaking
    # the code of the last range that was inspected.
    return None


def de_hauptstadt_to_land_code(text):
text = safe_string(text)

Expand Down Expand Up @@ -470,14 +489,15 @@ def address_to_country_code(text, lang=None):
# - French postcode matching, because 5-digit postcodes are used in
# many countries.
# - German postcode matching, because 5-digit postcodes are used in
# many countries and the postcodes do not follow landers boudaries
# many countries
country_to_safe_subdivision_lookup_function = (
("BR", br_state_name_to_state_code),
("BR", br_postcode_to_state_code),
("CA", ca_postcode_to_province_code),
("CA", ca_province_name_to_province_code),
("FR", fr_dept_name_to_dept_code),
("DE", de_address_to_land_code),
("DE", de_hauptstadt_to_land_code),
("DE", de_land_name_to_land_code),
("US", us_postcode_to_state_code),
("US", us_state_name_to_state_code),
)
Expand Down Expand Up @@ -507,9 +527,11 @@ def address_to_subdivision_code(text, country=None):
>>> address_to_subdivision_code("")
>>> address_to_subdivision_code("2 pl. Saint-Pierre, 44000 Nantes", country="ca")
There should not be false positives between French and American postcodes:
There should not be false positives between French, deutsch and American postcodes:
>>> address_to_subdivision_code("2 pl. Saint-Pierre, 44000 Nantes", country="us")
>>> address_to_subdivision_code("Los Angeles, CA 90068, États-Unis", country="fr")
>>> address_to_subdivision_code("29633 Munster")
>>> address_to_subdivision_code("29633 Munster", country="US")
"""
# Find the subdivision code according to the country.
if country:
Expand Down Expand Up @@ -598,21 +620,30 @@ def address_to_country_and_subdivision_codes(
>>> address_to_country_and_subdivision_codes("2 pl. Saint-Pierre, Nantes")
(None, None)
There should be no confusion between French and US postcodes:
There should be no confusion between French, deutsch and US postcodes:
>>> address_to_country_and_subdivision_codes("2 pl. Saint-Pierre, 44000 Nantes", country="US")
(None, None)
>>> address_to_country_and_subdivision_codes("6931 Rings Rd, Amlin, OH 43002", country="FR")
(None, None)
>>> address_to_country_and_subdivision_codes("29633 Munster")
(None, None)
>>> address_to_country_and_subdivision_codes("29633 Munster", country="US")
(None, None)
You can choose to get results in a tuple or in iso format
>>> address_to_country_and_subdivision_codes("14467 Potsdam")
('DE', 'BB')
>>> address_to_country_and_subdivision_codes("14467 Potsdam", iso_format=True)
'DE-BB'
>>> address_to_country_and_subdivision_codes("14467 Germany")
('DE', None)
('DE', 'BE')
>>> address_to_country_and_subdivision_codes("14467 Germany", iso_format=True)
'DE-BE'
>>> address_to_country_and_subdivision_codes("Munster Germany")
('DE', None)
>>> address_to_country_and_subdivision_codes("Munster Germany", iso_format=True)
'DE'
"""
result = _address_to_country_and_subdivision_codes(text, lang, country)
if iso_format:
Expand Down
2 changes: 2 additions & 0 deletions geoconvert/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,12 @@
)
from .subdivisions.germany import (
DE_HAUPTSTADT,
DE_POSTCODE_RANGE,
de_land_code_regex,
de_land_hauptstadt_regex,
de_land_name_regex,
de_landers,
de_postcode_regex,
)
from .subdivisions.united_states import (
us_postcode_regex,
Expand Down
76 changes: 76 additions & 0 deletions geoconvert/data/subdivisions/germany.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,80 @@
"erfurt": "TH",
}

# source https://simple.wikipedia.org/wiki/Postal_codes_in_Germany
#
# Maps the inclusive upper bound of a postcode range to the code of the Land
# that range belongs to. The lookup walks the entries in insertion order and
# keeps the first bound that is greater than or equal to the postcode, so:
#   - keys must be unique (a repeated key in a dict literal collapses into a
#     single entry kept at its first position);
#   - keys must be listed in strictly ascending order.
# A single postcode that belongs to another Land than its neighbours is
# written as a pair of consecutive bounds (e.g. 4616 / 4617).
DE_POSTCODE_RANGE = {
    1944: "SN",
    1990: "BB",
    2999: "SN",
    3999: "BB",
    4616: "SN",
    4617: "TH",
    4625: "SN",
    4626: "TH",
    4930: "SN",
    4931: "BB",
    4999: "SN",
    5999: "NI",
    6576: "ST",
    6577: "TH",
    6999: "ST",
    7999: "TH",
    9999: "SN",
    14999: "BE",
    16999: "BB",
    19347: "MV",
    19348: "BB",
    19999: "MV",
    21243: "HH",
    21244: "NI",
    21679: "SH",
    21730: "NI",
    21999: "SH",
    22999: "HH",
    25999: "SH",
    26999: "NI",
    28999: "HB",
    31999: "NI",
    33999: "NW",
    36999: "HE",
    37300: "NI",
    37359: "TH",
    39999: "NI",
    48999: "NW",
    49999: "NI",
    52999: "NW",
    53110: "RP",
    53229: "NW",
    53720: "RP",
    53721: "NW",
    53808: "RP",
    53809: "NW",
    54999: "RP",
    55547: "RP",
    55558: "NW",
    56999: "RP",
    57611: "NW",
    57612: "RP",
    59999: "NW",
    63915: "HE",
    63916: "BY",
    65623: "HE",
    65624: "RP",
    65999: "HE",
    # 66xxx is Saarland, with 66849, 66879 and 66894 treated as single
    # Rheinland-Pfalz postcodes. Each Saarland sub-range needs its own upper
    # bound: repeating 66999 would leave the RP entries unreachable.
    # NOTE(review): the three RP exceptions are taken from the entries this
    # table already listed - confirm against the postcode source.
    66848: "SL",
    66849: "RP",
    66878: "SL",
    66879: "RP",
    66893: "SL",
    66894: "RP",
    66999: "SL",
    79999: "BW",
    87999: "BY",
    88999: "BW",
    95999: "BY",
    99999: "TH",
}


# Regexes
names = r"\b|\b".join(code for code in de_landers.keys())
Expand All @@ -55,3 +129,5 @@

# Alternation of every capital-city key of DE_HAUPTSTADT; the separator puts a
# word boundary on each side of the "|" so that only whole words match.
hauptstadt = r"\b|\b".join(DE_HAUPTSTADT)
# The outer \b...\b closes the boundaries of the first and last alternatives.
de_land_hauptstadt_regex = re.compile(rf"(?P<hauptstadt>\b{hauptstadt}\b)")

# A German postcode is exactly five digits. Word boundaries on both sides keep
# the pattern from capturing the first five digits of a longer number (phone
# or fax numbers frequently appear in address blocks).
de_postcode_regex = re.compile(r"\b(?P<postcode>\d{5})\b")
61 changes: 53 additions & 8 deletions tests/test_subdivisions/test_germany.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,36 @@
de_address_to_land_code,
de_hauptstadt_to_land_code,
de_land_name_to_land_code,
de_postcode_to_land_code,
)


class TestGermany:
@pytest.mark.parametrize(
    "input_data, expected",
    [
        # The postcode alone is enough to find the Land. The same input must
        # not also be listed with a ``None`` expectation.
        ("Katharinenberg 14 – 20 18439 Stralsund ", "MV"),
        # Detection via province code should be case-sensitive
        ("Be yourself!", None),
        ("This is BE", "BE"),
        ("31-37 53179 Bonn Telefon: +49 228 4010 Fax: +49 228 4011223", "NW"),
    ],
)
def test_de_address_to_land_code(self, input_data, expected):
    """Check the Land code that de_address_to_land_code returns for each input."""
    assert de_address_to_land_code(input_data) == expected

@pytest.mark.parametrize(
    "input_data, expected",
    [
        ("65760 Eschborn", "HE"),
        ("Straße 3 53119 Bonn Telefon: +49 22899610-2928", "NW"),
        ("80639 München", "BY"),
    ],
)
def test_de_postcode_to_land_code(self, input_data, expected):
    """Check postcode detection directly and through de_address_to_land_code."""
    # Compare with the expected code: a bare truthiness check would accept
    # any Land code, including a wrong one.
    assert de_postcode_to_land_code(input_data) == expected
    assert de_address_to_land_code(input_data) == expected

@pytest.mark.parametrize(
"input_data, expected",
[
Expand All @@ -37,11 +51,6 @@ def test_de_address_to_land_code(self, input_data, expected):
def test_de_land_name_to_land_code(self, input_data, expected):
assert de_land_name_to_land_code(input_data) == expected
assert de_address_to_land_code(input_data) == expected
assert address_to_country_and_subdivision_codes(input_data) == ("DE", expected)
assert (
address_to_country_and_subdivision_codes(input_data, iso_format=True)
== "DE-" + expected
)

@pytest.mark.parametrize(
"input_data, expected",
Expand All @@ -55,8 +64,44 @@ def test_de_land_name_to_land_code(self, input_data, expected):
def test_de_hauptstadt_to_land_code(self, input_data, expected):
assert de_hauptstadt_to_land_code(input_data) == expected
assert de_address_to_land_code(input_data) == expected
assert address_to_country_and_subdivision_codes(input_data) == ("DE", expected)

@pytest.mark.parametrize(
    "input_data, expected",
    [
        # land name
        ("Hamburg liegt in Norddeutschland", ("DE", "HH")),
        ("Metropolregion Hamburg", ("DE", "HH")),
        ("Glockengießerwall 5 20095 Hamburg", ("DE", "HH")),
        ("Nordrhein Westfalen", ("DE", "NW")),
        ("nordrhein-westfalen", ("DE", "NW")),
        ("Thüringen", ("DE", "TH")),
        ("thuringen", ("DE", "TH")),
        ("The capital of Germany is Berlin", ("DE", "BE")),
        # hauptstadt
        ("Humboldtstraße 5–6 14467 Potsdam", ("DE", "BB")),
        ("is in Frankfurt-Am-Main", ("DE", "HE")),
        ("GUTEN TAG KÖLN", ("DE", "NW")),
        ("leipzig", ("DE", "SN")),
        ("80639 München", ("DE", "BY")),
        # land code
        ("Be yourself!", (None, None)),
        ("This is BE", (None, None)),
        ("This is BE in Germany", ("DE", "BE")),
        # Postcodes
        ("Katharinenberg 14 – 20 18439 Stralsund", (None, None)),
        ("Katharinenberg 14 – 20 18439 Stralsund GERMANY", ("DE", "MV")),
        ("31-37 53179 Bonn Telefon: +49 228 4010", (None, None)),
        ("31-37 53179 Bonn DEUTSCHLAND Telefon: +49 228 4010", ("DE", "NW")),
        ("65760 Eschborn", (None, None)),
        ("65760 Eschborn germany", ("DE", "HE")),
    ],
)
def test_de_country_and_subdivision_codes(self, input_data, expected):
    """Check the tuple result and the iso-formatted result for each address."""
    assert address_to_country_and_subdivision_codes(input_data) == expected

    # With iso_format=True, this test expects "<country>-<subdivision>" when
    # something was found and None when nothing was.
    if expected == (None, None):
        expected_iso = None
    else:
        expected_iso = f"{expected[0]}-{expected[1]}"
    assert (
        address_to_country_and_subdivision_codes(input_data, iso_format=True)
        == expected_iso
    )

0 comments on commit a68e3e0

Please sign in to comment.