-
Notifications
You must be signed in to change notification settings - Fork 586
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Addition of three new predefined recognizers, improved regex for IN_PAN #1323
base: main
Are you sure you want to change the base?
Changes from 46 commits
818fe90
87a1aae
8756c93
2f85d5d
1b47061
b0d1ce8
b3e94ed
838402f
d4ae26d
1e81cfb
88c6c1f
b4edab4
2d01bd0
b7c6e65
2434bb5
b6db593
2dd5cec
dfb2d26
fd28708
f0c9737
a67f19f
8383e08
37b2f97
57b2294
365be21
3cdec15
28f8bec
bc059ce
1ffbb8b
b05399f
2a4708b
22003a4
3f00fdc
424174d
b0767aa
4133632
d1f2fc6
93a79cf
3053088
65a2e70
f4a1541
4391cd4
acf7331
e26da0f
e040ecc
7d6ee38
64407fb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,4 +1,6 @@ | ||||||
import csv
import logging
import os
from typing import List, Optional, Tuple
|
||||||
|
||||||
class PresidioAnalyzerUtils: | ||||||
|
@@ -9,6 +11,20 @@ class PresidioAnalyzerUtils: | |||||
logic for re-usability and maintainability | ||||||
""" | ||||||
|
||||||
__country_master_file_path__ = "presidio_analyzer/data/country_master.csv" | ||||||
__country_master__ = [] | ||||||
|
||||||
def __init__(self, country_master_file_path: Optional[str] = None):
    """
    Initialise the utility object and load the country master data.

    :param country_master_file_path: optional path to a country master CSV
        file. When omitted (or empty) the class-level default stored in
        ``__country_master_file_path__`` is used.
    """
    # Previously the selected path was assigned to a throw-away local, so the
    # "provision to override" never took effect. Store it on the instance,
    # which is where __load_country_master__ reads the path from.
    if country_master_file_path:
        self.__country_master_file_path__ = country_master_file_path

    self.__load_country_master__()
|
||||||
@staticmethod | ||||||
def is_palindrome(text: str, case_insensitive: bool = False): | ||||||
""" | ||||||
|
@@ -36,13 +52,33 @@ def sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str: | |||||
text = text.replace(search_string, replacement_string) | ||||||
return text | ||||||
|
||||||
@staticmethod | ||||||
def get_luhn_mod_n(
    input_str: str, alphabet: str = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
) -> bool:
    """
    Check if the last character of the input is a valid Luhn mod N checksum.

    https://en.wikipedia.org/wiki/Luhn_mod_N_algorithm
    :param input_str: the alpha-numeric string (check character included)
        to be verified; it is converted with ``str()`` before processing
    :param alphabet: ordered characters of the code space; its length is
        the 'N' of the algorithm and a character's position is its value
    :return: True when the checksum is valid. False when it is not, when the
        alphabet is empty, or when the input contains a character that is not
        part of the alphabet. An empty input sums to 0 and yields True.
    """
    if not alphabet:
        return False

    # N must come from the alphabet actually in use; a fixed 36-character
    # charset gives wrong results for any other alphabet.
    n = len(alphabet)

    try:
        # Code points are taken right-to-left, starting at the check character.
        code_points = [alphabet.index(char) for char in reversed(str(input_str))]
    except ValueError:
        # A character outside the alphabet can never form a valid code.
        return False

    # Check character and every second one after it count as-is ...
    plain_total = sum(code_points[::2])
    # ... the others are doubled and their base-N digits are added together.
    doubled_total = sum(sum(divmod(point * 2, n)) for point in code_points[1::2])
    return (plain_total + doubled_total) % n == 0
|
||||||
@staticmethod | ||||||
def is_verhoeff_number(input_number: int): | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
""" | ||||||
Check if the input number is a true verhoeff number. | ||||||
|
||||||
:param input_number: | ||||||
:return: | ||||||
:return: Bool | ||||||
""" | ||||||
__d__ = [ | ||||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], | ||||||
|
@@ -73,3 +109,140 @@ def is_verhoeff_number(input_number: int): | |||||
for i in range(len(inverted_number)): | ||||||
c = __d__[c][__p__[i % 8][inverted_number[i]]] | ||||||
return __inv__[c] == 0 | ||||||
|
||||||
def __load_country_master__(self):
    """
    Load the country master CSV into ``self.__country_master__``.

    The file at ``self.__country_master_file_path__`` is read as UTF-8. Its
    first row supplies the column names and every following row is stored as
    a dict keyed by those names.

    :return: None
    :raises FileNotFoundError: when the configured path is not an existing file
    :raises ValueError: when no header is detected or the file has no data rows
    """
    path = self.__country_master_file_path__
    if not os.path.isfile(path):
        raise FileNotFoundError("Country master file not found: {}".format(path))

    with open(file=path, mode="r", newline="", encoding="utf-8") as csvfile:
        # Only the first line is handed to the sniffer for header detection.
        if not csv.Sniffer().has_header(csvfile.readline()):
            raise ValueError("Header missing in file: {}".format(path))
        # Rewind so DictReader consumes the header row as field names.
        csvfile.seek(0)
        self.__country_master__ = list(csv.DictReader(csvfile, fieldnames=None))

    # DictReader has already consumed the header, so the list holds data rows
    # only; a file with a single data row is valid, an empty list is not.
    if not self.__country_master__:
        raise ValueError("Blank file: {} detected.".format(path))
|
||||||
def __get_country_master_full_data__(
    self, iso_code: str = ""
) -> Optional[List[str]]:
    """
    Collect the non-empty values of one ISO column from the country master.

    :param iso_code: name of the column to read; one of ISO3166-1-Alpha-2,
        ISO3166-1-Alpha-3, ISO3166-1-Numeric, ISO4217-Alpha-3 or
        ISO4217-Numeric. Surrounding whitespace is ignored.
    :return: List of the non-empty values found in that column, in the order
        the rows are stored in ``self.__country_master__``, or None when
        iso_code is not one of the supported column names.
    """
    supported_codes = (
        "ISO3166-1-Alpha-2",
        "ISO3166-1-Alpha-3",
        "ISO3166-1-Numeric",
        "ISO4217-Alpha-3",
        "ISO4217-Numeric",
    )
    # Normalise once so validation and row lookup use the same key. Validating
    # the stripped value but indexing with the raw one raised KeyError for
    # input padded with whitespace.
    column = iso_code.strip() if isinstance(iso_code, str) else ""
    if column not in supported_codes:
        return None

    # Rows with an empty (or missing) value in the column are skipped.
    return [
        country[column] for country in self.__country_master__ if country[column]
    ]
|
||||||
def get_country_codes(self, iso_code: str) -> Optional[List[str]]:
    """
    Fetch all defined country codes per required ISO format.

    :param iso_code: currently supporting : ISO3166-1-Alpha-2,
        ISO3166-1-Alpha-3, ISO3166-1-Numeric. Surrounding whitespace is
        ignored.
    :return: List of country codes in provided ISO format, or None when the
        requested format is not supported.
    """
    supported_codes = (
        "ISO3166-1-Alpha-2",
        "ISO3166-1-Alpha-3",
        "ISO3166-1-Numeric",
    )
    # Strip once and pass the cleaned value on, so a padded but otherwise
    # valid code is looked up under the real column name.
    code = iso_code.strip() if isinstance(iso_code, str) else ""
    if code not in supported_codes:
        logging.getLogger(__name__).warning(
            "Unsupported country code format: %r", iso_code
        )
        return None

    return self.__get_country_master_full_data__(iso_code=code)
|
||||||
def get_currency_codes(self, iso_code: str = "") -> Optional[List[str]]:
    """
    Retrieve all defined currency codes across countries.

    :param iso_code: currently supporting : ISO4217-Alpha-3, ISO4217-Numeric.
        Surrounding whitespace is ignored.
    :return: List of currency codes in provided ISO format, or None when the
        requested format is not supported.
    """
    supported_codes = ("ISO4217-Alpha-3", "ISO4217-Numeric")
    # Strip once and pass the cleaned value on, so a padded but otherwise
    # valid code is looked up under the real column name.
    code = iso_code.strip() if isinstance(iso_code, str) else ""
    if code not in supported_codes:
        return None

    return self.__get_country_master_full_data__(iso_code=code)
|
||||||
def get_full_country_information(
    self, lookup_key: str, lookup_index: str
) -> Optional[List[dict]]:
    """
    Fetch the country master rows whose lookup_index column equals lookup_key.

    :param lookup_key: Item to be searched; compared for exact equality with
        the stored value
    :param lookup_index: A valid index_name out of available values
        English_short_name_using_title_case, English_full_name,
        FIFA_country_code, International_olympic_committee_country_code,
        ISO3166-1-Alpha-2, ISO3166-1-Alpha-3, ISO3166-1-Numeric,
        International_licence_plate_country_code, Country_code_top_level_domain,
        Currency_Name, ISO4217-Alpha-3, ISO4217-Numeric, Capital_City, Dialing_Code
    :return: List with one dictionary per matching row of the master lookup
        (empty when nothing matches), or None when lookup_index is not an
        allowed value or lookup_key is missing/blank.
    """
    allowed_indices = (
        "English_short_name_using_title_case",
        "English_full_name",
        "FIFA_country_code",
        "International_olympic_committee_country_code",
        "ISO3166-1-Alpha-2",
        "ISO3166-1-Alpha-3",
        "ISO3166-1-Numeric",
        "International_licence_plate_country_code",
        "Country_code_top_level_domain",
        "Currency_Name",
        "ISO4217-Alpha-3",
        "ISO4217-Numeric",
        "Capital_City",
        "Dialing_Code",
    )
    logger = logging.getLogger(__name__)

    # None and blank strings are not members either, so this single test
    # covers every invalid lookup_index.
    if lookup_index not in allowed_indices:
        logger.warning("Unsupported lookup index: %r", lookup_index)
        return None

    if not isinstance(lookup_key, str) or not lookup_key.strip():
        logger.warning("Missing or blank lookup key: %r", lookup_key)
        return None

    return [
        country
        for country in self.__country_master__
        if country[lookup_index] == lookup_key
    ]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.